def __init__(self, hparams: Hparams, **kwargs):
    super(ErnieEmbedding, self).__init__(**kwargs)
    self.vocab_size = hparams.vocab_size
    self.hidden_size = hparams.hidden_size
    self.initializer_range = hparams.initializer_range
    self.use_task_id = hparams.use_task_id
    self.position_embeddings = tf.keras.layers.Embedding(
        hparams.max_position_embeddings,
        hparams.hidden_size,
        embeddings_initializer=get_initializer(self.initializer_range),
        name="position_embeddings")
    self.token_type_embeddings = tf.keras.layers.Embedding(
        hparams.get("type_vocab_size", hparams.get("sent_type_vocab_size")),
        hparams.hidden_size,
        embeddings_initializer=get_initializer(self.initializer_range),
        name="token_type_embeddings")
    if self.use_task_id:
        self.task_embeddings = tf.keras.layers.Embedding(
            hparams.task_type_vocab_size,
            hparams.hidden_size,
            embeddings_initializer=get_initializer(self.initializer_range),
            name="task_type_embeddings")
    self.layer_norm = tf.keras.layers.LayerNormalization(
        epsilon=hparams.layer_norm_eps, name="LayerNorm")
    self.dropout = tf.keras.layers.Dropout(hparams.hidden_dropout_prob)
def deploy(hparams: Hparams):
    logger = logging.getLogger(__name__)
    assert hparams.model_resume_path is not None, \
        ValueError("Model resume path is None, must be specified.")
    # reuse hparams
    model_resume_path = hparams.model_resume_path
    logger.info(f"Reuse saved json config from {os.path.join(hparams.get_workspace_dir(), 'hparams.json')}")
    hparams.reuse_saved_json_hparam()
    hparams.cascade_set("model_resume_path", model_resume_path)
    # build model
    (model,) = build_model(hparams,
                           return_losses=False,
                           return_metrics=False,
                           return_optimizer=False)
    logger.info("Export model to deployment.")
    saved_path = model.deploy()
    logger.info(f"Save bento Service in {saved_path}")
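# Usage sketch for `deploy` (illustrative only, not part of the module). It assumes the
# module-level imports used here (Hparams, build_model, etc.) and a finished experiment
# directory; the config and save paths below are hypothetical.
def _deploy_usage_example():
    hparams = Hparams()
    hparams.load_from_config_file("configs/glue_zh/tnews.yml")  # hypothetical config path
    hparams.stand_by()
    hparams.cascade_set("model_resume_path", "save/some_finished_experiment")  # hypothetical path
    deploy(hparams)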
def test_process(self):
    hparam = Hparams()
    hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/idiom_generator.yml')
    hparam.stand_by()
    hparam.cascade_set("model_load_path",
                       "/search/odin/yyk/workspace/AiSpace/save/test_bert_for_text_generation_idiom__idiom_generator_119_23")
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
    tokenizer = CPMTokenizer(hparam.dataset.tokenizer)
    input = "春眠不觉晓"
    input_tokens = tokenizer.tokenize(input) + [tokenizer.vocab.sep_token]
    input_encoded = tokenizer.encode(input_tokens)
    input_ids = tf.constant([input_encoded['input_ids']], dtype=tf.int32)
    attention_mask = tf.constant([[1] * len(input_encoded['input_ids'])], dtype=tf.int32)
    input_dict = {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }
    # output = model(input_dict)
    output = model.generate(input_ids, **hparam.generation_attributes)
    print(input_encoded)
    output = tokenizer.decode(output.numpy().reshape([-1]).tolist())
    print(output)
def build_callbacks(hparams: Hparams):
    from aispace.layers.callbacks import CALLBACKS
    callback_hparam = hparams.training.callbacks
    callbacks = []
    for name, config in sorted(callback_hparam.items(),
                               key=lambda s: s[1]['priority'],
                               reverse=True):
        if not config.switch:
            continue
        fn = CALLBACKS.get(name)
        logger.info(f"Using callback [{name}].")
        if fn is None:
            logger.warning(f"Callback name {name} may be wrong.")
            continue
        if name.startswith('evaluator'):
            logger.info("Build validation and test dataset for evaluator callback.")
            dev_dataset, test_dataset = next(
                load_dataset(hparams, ret_train=False, ret_info=False))[:2]
            config.config['validation_dataset'] = dev_dataset
            config.config['validation_steps'] = hparams.training.validation_steps
            config.config['test_dataset'] = test_dataset
            config.config['test_steps'] = hparams.training.test_steps
            config.config['report_dir'] = hparams.get_report_dir()
        callbacks.append(fn(config.config))
    return callbacks
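# A minimal sketch of how `build_callbacks` is consumed (mirrors `experiment` below,
# illustrative only): each entry under `hparams.training.callbacks` provides `switch`,
# `priority` and a nested `config`, as read by the loop above; the remaining fit
# arguments are elided here.
def _callbacks_usage_example(hparams, model, train_dataset):
    callbacks = build_callbacks(hparams)
    model.fit(train_dataset,
              callbacks=callbacks,
              epochs=hparams.training.max_epochs,
              steps_per_epoch=hparams.training.steps_per_epoch)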
def __init__(self, config: Hparams, **kwargs):
    super().__init__(config, **kwargs)
    config = config.config
    self.config = config
    self.output_attentions = config.output_attentions
    self.output_hidden_states = config.output_hidden_states
    self.use_cache = config.use_cache
    self.return_dict = config.use_return_dict
    self.num_hidden_layers = config.n_layer
    self.vocab_size = config.vocab_size
    self.n_embd = config.n_embd
    self.wte = SharedEmbeddings(
        config.vocab_size,
        config.hidden_size,
        initializer_range=config.initializer_range,
        name="wte"
    )
    self.wpe = tf.keras.layers.Embedding(
        config.n_positions,
        config.n_embd,
        embeddings_initializer=get_initializer(config.initializer_range),
        name="wpe",
    )
    self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
    if config.has_key("layers"):
        # build only the configured slice of transformer blocks when `layers` is specified
        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i))
                  for i in range(config.layers.start, config.layers.end, config.layers.step)]
        self.num_hidden_layers = len(self.h)
    else:
        self.h = [TFBlock(config.n_ctx, config, scale=True, name="h_._{}".format(i))
                  for i in range(config.n_layer)]
    self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
    self.pooler = SequenceSummary(config, name="pooler")
def avg_checkpints(hparams: Hparams):
    logger = logging.getLogger(__name__)
    (model,) = build_model(hparams,
                           return_losses=False,
                           return_metrics=False,
                           return_optimizer=False)
    logger.info(f"Average checkpoints from {hparams.prefix_or_checkpints}")
    average_checkpoints(model,
                        hparams.prefix_or_checkpints,
                        hparams.num_last_checkpoints,
                        hparams.ckpt_weights)
    evaluation(hparams, model=model)
    logger.info(f"Save model in {hparams.get_model_filename()}")
    model.save_weights(hparams.get_model_filename(), save_format="tf")
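# Usage sketch for `avg_checkpints` (illustrative only). It assumes
# `prefix_or_checkpints`, `num_last_checkpoints` and `ckpt_weights` are present in the
# hparams (normally supplied via the config/CLI); the values set via `cascade_set`
# below are hypothetical.
def _avg_checkpoints_usage_example(hparams):
    hparams.cascade_set("prefix_or_checkpints", "save/some_experiment/checkpoint")  # hypothetical
    hparams.cascade_set("num_last_checkpoints", 3)  # hypothetical
    hparams.cascade_set("ckpt_weights", None)  # hypothetical
    avg_checkpints(hparams)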
def test_gpt2_checkpoint(self):
    hparam = Hparams()
    hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/test_gpt2.yml')
    # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml')
    # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/cmrc2018.yml')
    hparam.stand_by()
    model_path = "/search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2"
    # model1 = tf.keras.models.load_model(model_path)
    # model_gold = model1.trainable_variables
    # ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1]
    # ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) if itm[0].find('adam') == -1]
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
    model_vars = model.trainable_variables
    for itm in model_vars:
        print(f"{itm.name}, {itm.shape}")
        # print(itm.numpy())
        # print(type(itm.numpy()))
        # break
    print()
def test_eval(self):
    hparams = Hparams()
    hparams.load_from_config_file("../../configs/glue_zh/tnews_k_fold.yml")
    hparams.stand_by()
    ckpts = [
        "../../save/test_textcnn_for_classification_119_14/k_fold/1/model_saved/model",
        "../../save/test_textcnn_for_classification_119_14/k_fold/2/model_saved/model",
    ]
    evaluation(hparams, checkpoints=ckpts)
def test_process(self):
    hparam = Hparams()
    hparam.load_from_config_file('/search/odin/yyk/workspace/AiSpace/configs/custom/test_gpt2.yml')
    hparam.stand_by()
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
    model_vars = model.trainable_variables
    model_path = "/search/odin/yyk/data/pretrained/gpt/cpm-lm-tf2_v2"
    tf_huggingface_gpt2_adapter(model_vars, model_path)
def test_init(self):
    hparams = Hparams()
    hparams.load_from_config_file("../../../configs/custom/test_gpt2.yml")
    hparams.stand_by()
    tokenizer = CPMTokenizer(hparams.dataset.tokenizer)
    a = "这两天,XLNet貌似也引起了NLP圈的极大关注,从实验数据看,在某些场景下,确实XLNet相对Bert有很大幅度的提升。"
    b = "就像我们之前说的,感觉Bert打开两阶段模式的魔法盒开关后,在这条路上,会有越来越多的同行者,而XLNet就是其中比较引人注目的一位"
    res = tokenizer.encode(a, b)
    print(res)
def test_electra_checkpoint(self):
    hparam = Hparams()
    hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml')
    hparam.stand_by()
    # ckpt = "/search/data1/yyk/workspace/projects/ERNIE/ernie/checkpoints"
    # ckpt_vars = [itm for itm in tf.train.list_variables(ckpt) if itm[0].find('adam') == -1]
    ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path)
                 if itm[0].find('adam') == -1
                 and not itm[0].endswith("lamb_m")
                 and not itm[0].endswith("lamb_v")]
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
    model_vars = model.trainable_variables
    print()
def test_lstc_load(self):
    hparams = Hparams()
    hparams.load_from_config_file("../configs/custom/test_gpt2.yml")
    hparams.stand_by()
    checksum_dir = "../aispace/datasets/url_checksums"
    tfds.download.add_checksums_dir(checksum_dir)
    # download_config = DownloadConfig(register_checksums=True)
    tnews = tfds.load(
        "idiom/idiom_generator",
        # data_dir="/search/data1/yyk/data/datasets/glue_zh",
        split="train[90%:]",
        data_dir="../data",
        builder_kwargs={'hparams': hparams},
        # download_and_prepare_kwargs={'download_config': download_config}
    )
    tokenizer = BertTokenizer(hparams.dataset.tokenizer)
    # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()}
    label_counter = {}
    i = 0
    for itm in tnews:
        # for k, v in itm.items():
        #     if v.shape[0] == 151:
        #         print(itm)
        #         break
        # print(itm)
        # print()
        # print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()]))
        # break
        i += 1
        # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()]
        # print(id_to_label[l])
        # if id_to_label[l] not in label_counter:
        #     label_counter[id_to_label[l]] = 0
        # label_counter[id_to_label[l]] += 1
    # print(label_counter)
    # print(len(label_counter))
    print(i)

# python -u aispace/trainer.py \
#     --experiment_name test \
#     --model_name bert_for_classification \
#     --schedule train_and_eval \
#     --config_name tnews \
#     --config_dir ./configs/glue_zh \
#     --gpus 0 1 2 3
def test_lstc_load(self):
    hparams = Hparams()
    hparams.load_from_config_file("../configs/2020_LSTC/DuEE_keyphrase.yml")
    hparams.stand_by()
    checksum_dir = "../aispace/datasets/url_checksums"
    tfds.download.add_checksums_dir(checksum_dir)
    # download_config = DownloadConfig(register_checksums=True)
    tnews = tfds.load(
        "lstc_2020/DuEE_role",
        # data_dir="/search/data1/yyk/data/datasets/glue_zh",
        data_dir="../data",
        builder_kwargs={'hparams': hparams},
        # download_and_prepare_kwargs={'download_config': download_config}
    )
    # tokenizer = BertTokenizer(hparams.dataset.tokenizer)
    # s = "BCI下架新疆棉花产品"
    # res = tokenizer.tokenize(s, True)
    # print(res)
    # id_to_label = {v: k for k, v in hparams.duee_event_type_labels.items()}
    label_counter = {}
    for itm in tnews["train"]:
        # for k, v in itm.items():
        #     if v.shape[0] == 151:
        #         print(itm)
        #         break
        print(itm)
        print()
        # print(tokenizer.decode([int(t) for t in itm["input_ids"].numpy().tolist()]))
        break
        # l = hparams.dataset.outputs[0].labels[tf.argmax(itm["output_1"], -1).numpy().tolist()]
        # print(id_to_label[l])
        # if id_to_label[l] not in label_counter:
        #     label_counter[id_to_label[l]] = 0
        # label_counter[id_to_label[l]] += 1
    print(label_counter)
    print(len(label_counter))

# python -u aispace/trainer.py \
#     --experiment_name test \
#     --model_name bert_for_classification \
#     --schedule train_and_eval \
#     --config_name tnews \
#     --config_dir ./configs/glue_zh \
#     --gpus 0 1 2 3
def test_glue_load(self):
    hparams = Hparams()
    hparams.load_from_config_file("../configs/qa/dureader_yesno.yml")
    hparams.stand_by()
    checksum_dir = "../aispace/datasets/url_checksums"
    tfds.download.add_checksums_dir(checksum_dir)
    download_config = DownloadConfig(register_checksums=True)
    print(tfds.list_builders())
    dureader = tfds.load(
        "dureader/yesno",
        # data_dir="/search/data1/yyk/data/datasets/glue_zh",
        data_dir="../data/dureader",
        builder_kwargs={'hparams': hparams},
        download_and_prepare_kwargs={'download_config': download_config})
    for itm in dureader['train']:
        print(itm)
        break
    print()
    # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
    # test_dataset = next(load_dataset(hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True))[0]
    # total, zero = 0, 0
    # for itm in tqdm(test_dataset):
    #     tt = itm[0]['input_ids'].numpy().tolist()
    #     print(itm[0]['p_mask'].numpy().tolist())
    #     print(itm[0]['start_position'].numpy().tolist())
    #     print(itm[0]['end_position'].numpy().tolist())
    #     print(tt)
    #     break
    #     total += 1
    #     zero += len([t for t in tt if t == 0])
    # print()
    # print(f"{zero}, {total}, {zero / float(total)}")
    # print(total)

# python -u aispace/trainer.py \
#     --experiment_name test \
#     --model_name bert_for_classification \
#     --schedule train_and_eval \
#     --config_name tnews \
#     --config_dir ./configs/glue_zh \
#     --gpus 0 1 2 3
def test_glue_load(self):
    hparams = Hparams()
    hparams.load_from_config_file("../configs/glue_zh/cmrc2018.yml")
    hparams.stand_by()
    # checksum_dir = "../aispace/datasets/url_checksums"
    # tfds.download.add_checksums_dir(checksum_dir)
    # download_config = DownloadConfig(register_checksums=True)
    # cmrc2018 = tfds.load("glue_zh/cmrc2018",
    #                      # data_dir="/search/data1/yyk/data/datasets/glue_zh",
    #                      data_dir="../data/glue_zh",
    #                      builder_kwargs={'hparams': hparams},
    #                      download_and_prepare_kwargs={'download_config': download_config}
    #                      )
    # train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
    test_dataset = next(
        load_dataset(hparams, ret_train=False, ret_dev=True, ret_test=False, ret_info=False))[0]
    total, zero = 0, 0
    for itm in test_dataset:
        tt = itm[0]['start_position'].numpy().tolist()
        # print(itm[0]['p_mask'].numpy().tolist())
        # print(itm[0]['start_position'].numpy().tolist())
        # print(itm[0]['end_position'].numpy().tolist())
        # break
        total += len(tt)
        zero += len([t for t in tt if t == 0])
    print()
    print(f"{zero}, {total}, {zero / float(total)}")

# python -u aispace/trainer.py \
#     --experiment_name test \
#     --model_name bert_for_classification \
#     --schedule train_and_eval \
#     --config_name tnews \
#     --config_dir ./configs/glue_zh \
#     --gpus 0 1 2 3
def test_electra_checkpoint(self):
    hparam = Hparams()
    hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/tnews.yml')
    # hparam.load_from_config_file('/search/data1/yyk/workspace/AiSpace/configs/glue_zh/cmrc2018.yml')
    hparam.stand_by()
    # ckpt = "/search/data1/yyk/workspace/projects/ERNIE/ernie/checkpoints"
    ckpt = "/search/data1/yyk/data/pretrained/albert/albert_large_zh_google/model.ckpt-best"
    ckpt_vars = [
        itm for itm in tf.train.list_variables(ckpt)
        if itm[0].find('adam') == -1
    ]
    # ckpt_vars = [itm for itm in tf.train.list_variables(hparam.pretrained.model_path) if itm[0].find('adam') == -1]
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics)
    model_vars = model.trainable_variables
    print()
def evaluation(hparams: Hparams, checkpoints=None, model=None, test_dataset=None):
    """Evaluate the model and build reports according to the task type.

    :param hparams: experiment hyper parameters
    :param checkpoints: optional list of checkpoint paths (e.g. from k-fold training) whose predictions are ensembled
    :param model: optional pre-built model; built from hparams if None
    :param test_dataset: optional test dataset; loaded from hparams if None
    :return:
    """
    logger.info("Start evaluation.")
    output_hparams = deepcopy(hparams.dataset.outputs)
    if test_dataset is None:
        test_dataset = next(
            load_dataset(hparams, ret_train=False, ret_dev=False, ret_info=False))[0]
    if model is None:
        # build model
        (model, ) = build_model(hparams,
                                return_losses=False,
                                return_metrics=False,
                                return_optimizer=False)
    # predict using the default saved model
    if checkpoints is None:
        # load weights
        if not os.path.exists(hparams.get_model_filename() + ".index"):
            logger.warning(f"Model from {hparams.get_model_filename()} does not exist, nothing loaded!")
        else:
            logger.info(f"Load model weights from {hparams.get_model_filename()}")
            model.load_weights(hparams.get_model_filename())
        # prediction
        # print(model.evaluate(test_dataset))
        for inputs, outputs in tqdm(test_dataset):
            model_outputs = model.predict(inputs)
            if not isinstance(model_outputs, (tuple, list)):
                model_outputs = (model_outputs, )
            for idx, one_output_hparam in enumerate(output_hparams):
                if "ground_truth" not in one_output_hparam:
                    one_output_hparam["ground_truth"] = []
                if "predictions" not in one_output_hparam:
                    one_output_hparam['predictions'] = []
                prediction_output = tf.nn.softmax(model_outputs[idx], -1)
                tmp_name = one_output_hparam.name
                tmp_type = one_output_hparam.type
                tmp_ground_truth = outputs[tmp_name]
                if tmp_type in [CLASSLABEL, LIST_OF_CLASSLABEL, LIST_OF_INT]:
                    if tmp_type in [LIST_OF_INT]:
                        tmp_tg = tf.argmax(tmp_ground_truth, -1)
                    else:
                        tmp_tg = tmp_ground_truth
                    if one_output_hparam.task == NER:
                        # [[sent1], [sent2]]
                        one_output_hparam.ground_truth.extend(tmp_tg.numpy().tolist())
                        tmp_predictions = tf.argmax(prediction_output, -1).numpy().tolist()
                        one_output_hparam.predictions.extend(tmp_predictions)
                    else:
                        # [1, 0, 1, ...]
                        one_output_hparam.ground_truth.extend(tmp_tg.numpy().reshape(-1).tolist())
                        tmp_predictions = tf.argmax(prediction_output, -1).numpy().reshape(-1).tolist()
                        one_output_hparam.predictions.extend(tmp_predictions)
    elif isinstance(checkpoints, (tuple, list)):
        # predict using multiple checkpoints from k-fold cross validation.
        for i, ckpt in enumerate(checkpoints):
            if not os.path.exists(ckpt + ".index"):
                logger.warning(f"Model from {ckpt} does not exist, nothing loaded!")
                continue
            else:
                logger.info(f"Load model weights from {ckpt}")
                model.load_weights(ckpt)
            for j, (inputs, outputs) in tqdm(enumerate(test_dataset)):
                model_outputs = model.predict(inputs)
                if not isinstance(model_outputs, (tuple, list)):
                    model_outputs = (model_outputs, )
                for idx, one_output_hparam in enumerate(output_hparams):
                    prediction_output = tf.nn.softmax(model_outputs[idx], -1)
                    if i == 0:
                        if "ground_truth" not in one_output_hparam:
                            one_output_hparam["ground_truth"] = []
                        if "predictions" not in one_output_hparam:
                            one_output_hparam['predictions'] = []
                            one_output_hparam['tmp_preds'] = []
                        one_output_hparam['tmp_preds'].append(prediction_output)
                        tmp_name = one_output_hparam.name
                        tmp_type = one_output_hparam.type
                        tmp_ground_truth = outputs[tmp_name]
                        if tmp_type in [CLASSLABEL, LIST_OF_CLASSLABEL, LIST_OF_INT]:
                            if tmp_type in [LIST_OF_INT]:
                                tmp_tg = tf.argmax(tmp_ground_truth, -1)
                            else:
                                tmp_tg = tmp_ground_truth
                            if one_output_hparam.task == NER:
                                # [[sent1], [sent2]]
                                one_output_hparam.ground_truth.extend(tmp_tg.numpy().tolist())
                            else:
                                # [1, 0, 1, ...]
                                one_output_hparam.ground_truth.extend(tmp_tg.numpy().reshape(-1).tolist())
                    else:
                        # accumulate the softmax outputs of later checkpoints batch by batch
                        one_output_hparam['tmp_preds'][j] += prediction_output
        # turn the accumulated per-batch softmax outputs into predictions
        for idx, one_output_hparam in enumerate(output_hparams):
            tmp_type = one_output_hparam.type
            if tmp_type in [CLASSLABEL, LIST_OF_CLASSLABEL, LIST_OF_INT]:
                for prediction_output in one_output_hparam['tmp_preds']:
                    if one_output_hparam.task == NER:
                        # [[sent1], [sent2]]
                        tmp_predictions = tf.argmax(prediction_output, -1).numpy().tolist()
                    else:
                        # [1, 0, 1, ...]
                        tmp_predictions = tf.argmax(prediction_output, -1).numpy().reshape(-1).tolist()
                    one_output_hparam.predictions.extend(tmp_predictions)

    # save reports
    report_folder = hparams.get_report_dir()
    # evaluation, TODO more reports
    for one_output_hparam in output_hparams:
        ground_truth = one_output_hparam.ground_truth
        predictions = one_output_hparam.predictions
        if one_output_hparam.type in [CLASSLABEL, LIST_OF_CLASSLABEL, LIST_OF_INT]:
            # some filename
            cur_report_folder = os.path.join(
                report_folder, f'{one_output_hparam.name}_{one_output_hparam.type.lower()}')
            if not os.path.exists(cur_report_folder):
                os.makedirs(cur_report_folder)
            if one_output_hparam.task == NER:
                labels = one_output_hparam.labels
                # confusion matrix
                cm = ConfusionMatrix(_2d_to_1d_list(ground_truth),
                                     _2d_to_1d_list(predictions),
                                     labels)
                # ner evaluation
                labels = list(set([itm[2:] for itm in labels
                                   if itm.startswith("B-") or itm.startswith("I-")]))
                ner_eval = NEREvaluator(
                    _id_to_label(ground_truth, one_output_hparam.labels),
                    _id_to_label(predictions, one_output_hparam.labels),
                    labels)
                ner_results, ner_results_agg = ner_eval.evaluate()
                save_json(os.path.join(cur_report_folder, "ner_results.json"), ner_results)
                save_json(os.path.join(cur_report_folder, "ner_results_agg.json"), ner_results_agg)
            else:
                cm = ConfusionMatrix(ground_truth, predictions, one_output_hparam.labels)
            # print some reports
            print_boxed(f"{one_output_hparam.name} Evaluation")
            cms = cm.confusion_matrix_visual()
            if len(cm.label2idx) < 10:
                print(cms)
            # save reports to files
            with open(os.path.join(cur_report_folder, "confusion_matrix.txt"), 'w') as f:
                f.write(cms)
            print()
            print(json.dumps(cm.stats(), indent=4))
            save_json(os.path.join(cur_report_folder, "stats.json"), cm.stats())
            save_json(os.path.join(cur_report_folder, 'per_class_stats.json'), cm.per_class_stats())
            # save reports to hparams
            hparams['performance'] = Hparams()
            hparams.performance["stats"] = cm.stats()
            hparams.performance["per_class_stats"] = cm.per_class_stats()
            logger.info(f"Save {one_output_hparam.name} reports in {cur_report_folder}")
        else:
            logger.warning(f"{one_output_hparam.name}'s evaluation has not been implemented.")
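# Usage sketch for `evaluation` (illustrative only). The single-model path loads the
# weights saved under `hparams.get_model_filename()`; the `checkpoints` path ensembles
# the softmax outputs of several k-fold checkpoints, as in `test_eval` above.
# The checkpoint paths below are hypothetical.
def _evaluation_usage_example(hparams):
    # evaluate the default saved model
    evaluation(hparams)
    # or ensemble several k-fold checkpoints
    evaluation(hparams, checkpoints=[
        "save/some_experiment/k_fold/1/model_saved/model",  # hypothetical path
        "save/some_experiment/k_fold/2/model_saved/model",  # hypothetical path
    ])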
def load_dataset(hparams: Hparams, ret_train=True, ret_dev=True, ret_test=True, ret_info=True):
    from aispace import datasets
    train_split, validation_split, test_split = get_dataset_split(hparams)
    if ret_train:
        train_datasets, dataset_info = build_dataset(hparams, train_split, with_info=True)
    if ret_dev:
        dev_datasets, dev_dataset_info = build_dataset(hparams, validation_split, with_info=True)
        if dev_dataset_info is not None:
            dataset_info = dev_dataset_info
    if ret_test:
        test_datasets, test_dataset_info = build_dataset(hparams, test_split, with_info=True)
        if test_dataset_info is not None:
            dataset_info = test_dataset_info

    # check that the tokenizer used to build the dataset is consistent with the one configured now.
    if hparams.get("dataset", {}).get("tokenizer", {}).get("name", "") != "":
        if dataset_info.metadata is None:
            logger.warning("dataset_info has no metadata attribute.")
        elif hparams.get("dataset", {}).get("tokenizer", {}).get("name", "") \
                != dataset_info.metadata.get("tokenizer", ""):
            raise ValueError(
                f'The dataset is built using tokenizer {dataset_info.metadata.get("tokenizer", "")}, '
                f'however, now is using {hparams.get("dataset", {}).get("tokenizer", {}).get("name", "")}, '
                f'please remove/rebuild the data and restart!')
        elif hparams.get("pretrained", {}).get("config", {}).get("vocab_size", 0) \
                != dataset_info.metadata.get("vocab_size", 0):
            raise ValueError(
                f'The dataset is built using tokenizer {dataset_info.metadata.get("tokenizer", "")}, '
                f'whose vocab size is {dataset_info.metadata.get("vocab_size", "xx")}, '
                f'however, now is {hparams.get("pretrained", {}).get("config", {}).get("vocab_size", 0)}, '
                f'please remove/rebuild the data and restart!')

    # data mapping
    def build_generator(fields):
        input_names = [itm.get('name') for itm in hparams.dataset.inputs]
        output_names = [itm.get('name') for itm in hparams.dataset.outputs]
        output_name2column = {
            itm.get('name'): itm.get('column')
            for itm in hparams.dataset.outputs
        }
        inputs, outputs = {}, {}
        for k, v in fields.items():
            if k in input_names:
                inputs[k] = v
            elif k in output_names:
                inputs[output_name2column.get(k, k)] = v
                outputs[k] = v
            else:
                raise ValueError(f"{k} not in inputs or outputs.")
        return inputs, outputs

    training_hparams = hparams.training
    # reset some hparams
    if ret_info:
        print(dataset_info)
        # train_data_size = dataset_info.splits.get("train").num_examples
        # validation_data_size = dataset_info.splits.get("validation").num_examples
        # test_data_size = dataset_info.splits.get("test").num_examples
        # steps_per_epoch = int(train_data_size / training_hparams.batch_size)
        # num_warmup_steps = \
        #     int(training_hparams.max_epochs * train_data_size * training_hparams.warmup_factor / training_hparams.batch_size)
        # num_warmup_steps = min(steps_per_epoch, num_warmup_steps)
        # if validation_data_size is not None:
        #     validation_steps = validation_data_size // training_hparams.batch_size
        # else:
        #     validation_steps = None
        #
        # if test_data_size is not None:
        #     test_steps = test_data_size // training_hparams.batch_size
        # else:
        #     test_steps = None

    for i in range(len(train_split)):
        # build batch
        if ret_train:
            if train_datasets is not None and train_datasets[i] is not None:
                # get train_steps and reset training hparams
                logger.info("Reset training hparams according to real training data info.")
                steps_per_epoch = 0
                for _ in train_datasets[i]:
                    steps_per_epoch += 1
                steps_per_epoch //= training_hparams.batch_size
                num_warmup_steps = \
                    int(training_hparams.max_epochs * steps_per_epoch * training_hparams.warmup_factor)
                if "num_warmup_steps" not in training_hparams or training_hparams.num_warmup_steps <= 0:
                    hparams.cascade_set('training.num_warmup_steps', num_warmup_steps)
                    logger.info(f"Set training.num_warmup_steps to {num_warmup_steps}")
                else:
                    logger.info(f"training.num_warmup_steps already set to {hparams.training.num_warmup_steps}")
                if "steps_per_epoch" not in training_hparams or training_hparams.steps_per_epoch <= 0:
                    hparams.cascade_set('training.steps_per_epoch', steps_per_epoch)
                    logger.info(f"Set training.steps_per_epoch to {steps_per_epoch}")
                else:
                    logger.info(f"training.steps_per_epoch already set to {hparams.training.steps_per_epoch}")
                # prepare train dataset
                train_dataset = train_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    shuffle(hparams.training.shuffle_size). \
                    repeat(). \
                    batch(hparams.training.batch_size)
                logger.info("Train dataset has been loaded.")
            else:
                train_dataset = None
                logger.info("Train dataset is None.")
        if ret_dev:
            if dev_datasets is not None and dev_datasets[i] is not None:
                logger.info("Reset validation hparams according to real validation data info.")
                validation_steps = 0
                for _ in dev_datasets[i]:
                    validation_steps += 1
                validation_steps //= training_hparams.batch_size
                if "validation_steps" not in training_hparams or training_hparams.validation_steps <= 0:
                    hparams.cascade_set('training.validation_steps', validation_steps)
                    logger.info(f"Set training.validation_steps to {validation_steps}")
                else:
                    logger.info(f"training.validation_steps already set to {hparams.training.validation_steps}")
                dev_dataset = dev_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    repeat(). \
                    batch(hparams.training.batch_size)
                logger.info("Validation dataset has been loaded.")
            else:
                dev_dataset = None
                logger.info("Validation dataset is None.")
        if ret_test:
            if test_datasets is not None and test_datasets[i] is not None:
                logger.info("Reset test hparams according to real test data info.")
                test_steps = 0
                for _ in test_datasets[i]:
                    test_steps += 1
                test_steps //= training_hparams.batch_size
                if "test_steps" not in training_hparams or training_hparams.test_steps <= 0:
                    hparams.cascade_set('training.test_steps', test_steps)
                    logger.info(f"Set training.test_steps to {test_steps}")
                else:
                    logger.info(f"training.test_steps already set to {hparams.training.test_steps}")
                test_dataset = test_datasets[i]. \
                    map(build_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE). \
                    prefetch(buffer_size=tf.data.experimental.AUTOTUNE). \
                    batch(hparams.training.batch_size)
                logger.info("Test dataset has been loaded.")
            else:
                test_dataset = None
                logger.info("Test dataset is None.")

        result = ()
        if ret_train:
            result += (train_dataset, )
        if ret_dev:
            result += (dev_dataset, )
        if ret_test:
            result += (test_dataset, )
        if ret_info:
            result += (dataset_info, )
        yield result
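# Usage sketch for `load_dataset` (illustrative only). It is a generator: each yield
# contains, in order, the datasets selected by the `ret_*` flags plus `dataset_info`
# when `ret_info=True`, which is why callers unpack it with `next(...)` as in
# `experiment` and the tests above.
def _load_dataset_usage_example(hparams):
    # train/dev datasets plus dataset_info (no test split)
    train_dataset, dev_dataset, dataset_info = next(load_dataset(hparams, ret_test=False))
    # only the dev dataset
    dev_only = next(load_dataset(hparams, ret_train=False, ret_test=False, ret_info=False))[0]
    return train_dataset, dev_dataset, dev_only, dataset_info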
def build_model(hparam: Hparams,
                return_losses=True,
                return_metrics=True,
                return_optimizer=True,
                stage=TRAIN_STAGE):
    """Build the custom keras model, losses, metrics, and optimizer.

    :param hparam:
    :param return_losses:
    :param return_metrics:
    :param return_optimizer:
    :param stage: current stage, e.g. TRAIN_STAGE
    :return: (model,) followed, in order, by (losses, loss_weights), metrics and optimizer for every return_* flag set
    """
    logger.info(f"Try to build model {hparam.model_name}")
    from aispace import models
    from aispace.models.base_model import BaseModel
    model = BaseModel.by_name(hparam.model_name)(hparam)

    # build inputs and model
    inputs = build_tf_model_inputs(hparam.dataset)
    model(inputs, training=True)

    rets = ()
    # build losses
    if return_losses:
        losses, loss_weights = build_tf_model_losses(model, hparam.dataset)
        rets += ((losses, loss_weights), )
    # build metrics
    if return_metrics:
        metrics = build_tf_model_metrics(hparam.dataset)
        rets += (metrics, )
    # build optimizer
    if return_optimizer:
        optimizer = build_tf_model_optimizer(hparam.training)
        rets += (optimizer, )

    # if stage == TRAIN_STAGE:
    model.summary()

    # init from pretrained model (language model, etc.)
    if stage == TRAIN_STAGE and not hparam.model_resume_path and not hparam.model_load_path \
            and "pretrained" in hparam and hparam.pretrained.init_from_pretrained:
        try:
            logger.info(f"Load weights from {hparam.pretrained.model_path}")
            if hparam.pretrained.model_path.endswith(".h5"):
                model.load_weights(hparam.pretrained.model_path, by_name=True)
            else:
                logger.info(f"Load weights using model adapter {hparam.pretrained.adapter}")
                adapter = build_model_adapter(hparam.pretrained)
                if adapter is not None:
                    adapter(model.trainable_variables, hparam.pretrained.model_path)
        except Exception as e:
            logging.error("Load weights failure!", exc_info=True)
            raise e

    # initialize model
    if stage == TRAIN_STAGE and not hparam.model_resume_path and hparam.model_load_path is not None:
        model_saved = os.path.join(hparam.model_load_path, "model_saved", "model")
        logger.info(f"Initialize model from {model_saved}")
        model.load_weights(model_saved)

    # resume model
    if stage == TRAIN_STAGE and hparam.model_resume_path is not None:
        model_saved = os.path.join(hparam.get_workspace_dir(), "model_saved", "model")
        logger.info(f"Resume model from {model_saved}")
        model.load_weights(model_saved)

    return (model, ) + rets
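# Usage sketch for `build_model` (illustrative only). The returned tuple always starts
# with the model; losses, metrics and optimizer follow when the corresponding
# `return_*` flags keep their defaults, matching the compile calls in the tests above.
def _build_model_usage_example(hparam):
    model, (losses, loss_weights), metrics, optimizer = build_model(hparam)
    model.compile(optimizer=optimizer, loss=losses, metrics=metrics, loss_weights=loss_weights)
    # model only, e.g. for evaluation or deployment
    (model_only, ) = build_model(hparam, return_losses=False, return_metrics=False, return_optimizer=False)
    return model, model_only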
def experiment(hparams: Hparams):
    logger = logging.getLogger(__name__)
    if hparams.use_mixed_float16:
        logger.info("Use auto mixed precision policy")
        # tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
    strategy = tf.distribute.MirroredStrategy(
        devices=[f"/gpu:{id}" for id in hparams.gpus])

    # build dataset
    train_dataset, dev_dataset, dataset_info = next(
        load_dataset(hparams, ret_test=False))

    with strategy.scope():
        # build model
        model, (losses, loss_weights), metrics, optimizer = build_model(hparams)

        # build callbacks
        callbacks = build_callbacks(hparams)

        # compile
        model.compile(optimizer=optimizer, loss=losses, metrics=metrics, loss_weights=loss_weights)

        # fit
        if hparams.training.do_eval:
            validation_data = dev_dataset
            validation_steps = hparams.training.validation_steps
        else:
            logger.info("Do not evaluate.")
            validation_data = None
            validation_steps = None
        model.fit(
            train_dataset,
            validation_data=validation_data,
            epochs=hparams.training.max_epochs,
            callbacks=callbacks,
            steps_per_epoch=hparams.training.steps_per_epoch,
            validation_steps=validation_steps,
        )

    # run lr finder if the corresponding callback is enabled
    lr_finder_call_back = [cb for cb in callbacks if hasattr(cb, "lr_finder_plot")]
    if len(lr_finder_call_back) != 0:
        logger.info(f"Do lr finder, and save result in {hparams.get_lr_finder_jpg_file()}")
        lr_finder_call_back[0].lr_finder_plot(hparams.get_lr_finder_jpg_file())
    else:
        # load best model
        checkpoint_dir = os.path.join(hparams.get_workspace_dir(), "checkpoint")
        if hparams.eval_use_best and os.path.exists(checkpoint_dir):
            logger.info(f"Load best model from {checkpoint_dir}")
            average_checkpoints(model, checkpoint_dir)
        # save best model
        logger.info(f'Save model in {hparams.get_model_filename()}')
        model.save_weights(hparams.get_model_filename(), save_format="tf")
        # eval on test dataset and make reports
        if hparams.training.do_eval:
            evaluation(hparams)

    logger.info('Experiment finished!')
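# Usage sketch for `experiment` (illustrative only). The schedule is normally driven
# through `aispace/trainer.py` (see the commented command in the dataset tests above);
# calling it directly only needs a fully prepared Hparams object. The config path
# below is hypothetical.
def _experiment_usage_example():
    hparams = Hparams()
    hparams.load_from_config_file("configs/glue_zh/tnews.yml")  # hypothetical config path
    hparams.stand_by()
    experiment(hparams)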
def k_fold_experiment(hparams: Hparams):
    """k-fold training.

    :param hparams:
    :return:
    """
    logger = logging.getLogger(__name__)
    if hparams.use_mixed_float16:
        logger.info("Use auto mixed precision policy")
        # tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
        os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
    strategy = tf.distribute.MirroredStrategy(
        devices=[f"/gpu:{id}" for id in hparams.gpus])

    # build dataset, one fold per yield
    model_saved_dirs = []
    for idx, (train_dataset, dev_dataset, dataset_info) in enumerate(load_dataset(hparams, ret_test=False)):
        logger.info(f"Start {idx}th-fold training")
        with strategy.scope():
            # build model
            model, (losses, loss_weights), metrics, optimizer = build_model(hparams)

            # build callbacks
            callbacks = build_callbacks(hparams)

            # compile
            model.compile(optimizer=optimizer, loss=losses, metrics=metrics, loss_weights=loss_weights)

            # fit
            if hparams.training.do_eval:
                validation_data = dev_dataset
                validation_steps = hparams.training.validation_steps
            else:
                logger.info("Do not evaluate.")
                validation_data = None
                validation_steps = None
            model.fit(
                train_dataset,
                validation_data=validation_data,
                epochs=hparams.training.max_epochs,
                callbacks=callbacks,
                steps_per_epoch=hparams.training.steps_per_epoch,
                validation_steps=validation_steps,
            )

        # build archive dir
        k_fold_dir = os.path.join(hparams.get_workspace_dir(), "k_fold", str(idx))
        if not os.path.exists(k_fold_dir):
            os.makedirs(k_fold_dir)

        # load best model
        checkpoint_dir = os.path.join(hparams.get_workspace_dir(), "checkpoint")
        if hparams.eval_use_best and os.path.exists(checkpoint_dir):
            logger.info(f"Load best model from {checkpoint_dir}")
            average_checkpoints(model, checkpoint_dir)
            logger.info(f"Move {checkpoint_dir} to {k_fold_dir}")
            shutil.move(checkpoint_dir, k_fold_dir)

        # save best model
        logger.info(f'Save {idx}th model in {hparams.get_model_filename()}')
        model.save_weights(hparams.get_model_filename(), save_format="tf")

        # eval on test dataset and make reports
        evaluation(hparams)
        logger.info(f"Move {hparams.get_report_dir()} to {k_fold_dir}")
        shutil.move(hparams.get_report_dir(), k_fold_dir)
        logger.info(f"Move {hparams.get_saved_model_dir()} to {k_fold_dir}")
        cur_model_saved_dir = shutil.move(hparams.get_saved_model_dir(), k_fold_dir)
        logger.info(f"New model saved path for {idx}th fold: {cur_model_saved_dir}")
        model_saved_dirs.append(cur_model_saved_dir)

        logger.info(f'{idx}th-fold experiment finished!')

    # eval on test dataset after average_checkpoints
    # logger.info("Average models of all fold models.")
    checkpoints = [f'{itm}/model' for itm in model_saved_dirs]
    # average_checkpoints(model, checkpoints)
    # logger.info(f"Save averaged model in {hparams.get_model_filename()}")
    # model.save_weights(hparams.get_model_filename(), save_format="tf")
    if hparams.training.do_eval:
        evaluation(hparams, checkpoints=checkpoints)

    logger.info('Experiment finished!')
def test_dataset_split(self):
    hparams = Hparams()
    hparams.load_from_config_file("../configs/glue_zh/tnews.yml")
    hparams.stand_by()
    k_fold_experiment(hparams)