def train():
    """Train ETSNet from a pretrained checkpoint, optionally data-parallel.

    All settings come from the module-level ``args`` and ``config`` objects.
    Checkpoints are written under ``./ckpt_<rank_id>``.
    """
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(device_num=args.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        rank_id = get_rank()

    # dataset/network/criterion/optim
    ds = train_dataset_creator(rank_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load Pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)
    lrs = dynamic_lr(config.BASE_LR, config.TRAIN_TOTAL_ITER,
                     config.WARMUP_STEP, config.WARMUP_RATIO)
    opt = nn.SGD(params=net.trainable_params(), learning_rate=lrs,
                 momentum=0.99, weight_decay=5e-4)

    # wrap model
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net, opt, reduce_flag=True, mean=True,
                               degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # Checkpoint setup. BUG FIX: the original evaluated config.TRAIN_MODEL_SAVE_PATH
    # as a bare expression statement with no effect — removed. The directory
    # actually used is ./ckpt_<rank_id>, hardcoded below.
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet", config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))
    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM, ds, dataset_sink_mode=True,
                callbacks=[time_cb, loss_cb, ckpoint_cb])
def test_pynative_resnet50():
    """Run one epoch of ResNet-50 in PyNative mode and verify step timing."""
    context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")

    batch_size = 32
    num_classes = 10
    total_step = 50

    net = resnet50(batch_size, num_classes)
    trainable = filter(lambda x: x.requires_grad, net.get_parameters())
    optimizer = Momentum(learning_rate=0.01, momentum=0.9, params=trainable)
    data_set = create_dataset(repeat_num=1, training=True, batch_size=batch_size,
                              num_samples=total_step * batch_size)

    # define callbacks
    time_cb = MyTimeMonitor(data_size=data_set.get_dataset_size())
    cb = [time_cb, LossMonitor()]

    loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # Fixed loss scaling (value 128) with overflow updates disabled.
    scale_manager = FixedLossScaleManager(loss_scale=128, drop_overflow_update=False)
    model = Model(net, loss_fn=loss_fn, optimizer=optimizer,
                  loss_scale_manager=scale_manager, metrics={'acc'},
                  amp_level="O2", keep_batchnorm_fp32=False)

    # train model
    model.train(1, data_set, callbacks=cb,
                sink_size=data_set.get_dataset_size(), dataset_sink_mode=True)
    assert time_cb.good_step() > 10
def do_eval(dataset=None, network=None, use_crf="", num_class=41, assessment_method="accuracy",
            data_file="", load_checkpoint_path="", vocab_file="", label_file="",
            tag_to_index=None, batch_size=1):
    """Evaluate a finetuned BERT model with the requested assessment method.

    ``assessment_method`` selects the metric callback (accuracy / f1 / spanf1 /
    mcc / spearman_correlation) or, for "clue_benchmark", delegates to the
    CLUENER submit routine. Raises ValueError when no finetune checkpoint is
    given or the method is unknown.
    """
    if load_checkpoint_path == "":
        raise ValueError("Finetune model missed, evaluation task must load finetune model!")
    # Build the eval network (train=False); CRF mode is driven by the string flag.
    net_for_pretraining = network(bert_net_cfg, batch_size, False, num_class,
                                  use_crf=(use_crf.lower() == "true"), tag_to_index=tag_to_index)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)

    if assessment_method == "clue_benchmark":
        # Deferred import: only needed for the CLUE benchmark path.
        from src.cluener_evaluation import submit
        submit(model=model, path=data_file, vocab_file=vocab_file, use_crf=use_crf,
               label_file=label_file, tag_to_index=tag_to_index)
    else:
        if assessment_method == "accuracy":
            callback = Accuracy()
        elif assessment_method == "f1":
            callback = F1((use_crf.lower() == "true"), num_class)
        elif assessment_method == "spanf1":
            callback = SpanF1((use_crf.lower() == "true"), tag_to_index)
        elif assessment_method == "mcc":
            callback = MCC()
        elif assessment_method == "spearman_correlation":
            callback = Spearman_Correlation()
        else:
            raise ValueError("Assessment method not supported, support: [accuracy, f1, mcc, spearman_correlation]")

        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
        for data in dataset.create_dict_iterator(num_epochs=1):
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, token_type_id, label_ids = input_data
            # NOTE(review): label_ids is passed to predict as well — the eval
            # network apparently takes labels as a forward input; confirm.
            logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
            callback.update(logits, label_ids)
        print("==============================================================")
        eval_result_print(assessment_method, callback)
        print("==============================================================")
def test_optimizer_cpu():
    """Build a differential-privacy SGD optimizer on CPU and wrap it in a Model."""
    context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
    network = Net()
    loss = nn.SoftmaxCrossEntropyWithLogits()

    # DP optimizer factory with 2 micro-batches and a Gaussian noise mechanism.
    factory = DPOptimizerClassFactory(2)
    factory.set_mechanisms('Gaussian', norm_bound=1.5, initial_noise_multiplier=5.0)
    dp_sgd_cls = factory.create('SGD')
    net_opt = dp_sgd_cls(params=network.trainable_params(),
                         learning_rate=0.01, momentum=0.9)

    # Only constructing the Model is under test; no training is run.
    _ = Model(network, loss_fn=loss, optimizer=net_opt, metrics=None)
def get_tensor_from_training(
        indices,
        ckpt_file="/tmp/pycharm_project_589/summary_dir-202010191622/weights/-1_350.ckpt",
        node_name="conv1.weight",
        data_type="gradient"):
    """Train ResNet-50 for one epoch and capture the tensor named ``node_name``
    (e.g. its gradient) via a DataInterceptionCallback; return the captured result.
    """
    context.set_context(reserve_class_name_in_scope=False)
    # NOTE(review): batch_size and num_classes are not parameters of this
    # function — presumably module-level globals; verify before reuse.
    net = resnet50(batch_size, num_classes)
    load_checkpoint(ckpt_file, net=net)
    ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})
    dataset = create_dataset(indices)
    data_inception_callback = DataInterceptionCallback(node_name=node_name,
                                                       data_type=data_type)
    # Sink mode off so the callback can observe per-step data.
    model.train(1, dataset,
                callbacks=[LossMonitor(), data_inception_callback],
                dataset_sink_mode=False)
    return data_inception_callback.result
def bn_common(parallel_mode, train_flag, strategy_loss=None):
    """Build a batch-norm test network under ``parallel_mode`` and either train
    it for two epochs or run one predict pass, depending on ``train_flag``.
    """
    context.set_context(mode=context.GRAPH_MODE)
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=8)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    rank_size = 8

    predict = Tensor(np.ones([32, 512]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = bn_net()

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # Apply the user-provided sharding strategy to the loss op (None = default).
    loss.softmax_cross_entropy.shard(strategy_loss)
    opt = Momentum(net.trainable_params(), learning_rate, momentum, 0.0001, 1024 * rank_size)

    # NOTE(review): WithLossCell wrapping happens only on the predict path,
    # while Model(net, loss, opt) also attaches the loss — confirm intended.
    if not train_flag:
        net = WithLossCell(net, loss)
    net.set_train()

    if parallel_mode == ParallelMode.DATA_PARALLEL:
        context.set_auto_parallel_context(parameter_broadcast=True)
    model = Model(net, loss, opt)
    if train_flag:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    else:
        # _predict is a private Model API; used here only for this test's
        # forward pass.
        model._predict(predict, label)
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path=""):
    """Finetune ``network`` on ``dataset`` starting from a pretrain checkpoint.

    The optimizer is chosen by ``optimizer_cfg.optimizer`` (AdamWeightDecayDynamicLR,
    Lamb, or Momentum). Checkpoints are saved once per epoch under
    ``save_checkpoint_path``. Raises ValueError when no pretrain checkpoint is given.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()
    epoch_num = dataset.get_repeat_count()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(network.trainable_params(),
                                             decay_steps=steps_per_epoch * epoch_num,
                                             learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.learning_rate,
                                             end_learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.end_learning_rate,
                                             power=optimizer_cfg.AdamWeightDecayDynamicLR.power,
                                             warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                             weight_decay=optimizer_cfg.AdamWeightDecayDynamicLR.weight_decay,
                                             eps=optimizer_cfg.AdamWeightDecayDynamicLR.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        optimizer = Lamb(network.trainable_params(),
                         decay_steps=steps_per_epoch * epoch_num,
                         start_learning_rate=optimizer_cfg.Lamb.start_learning_rate,
                         end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
                         power=optimizer_cfg.Lamb.power,
                         weight_decay=optimizer_cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                         decay_filter=optimizer_cfg.Lamb.decay_filter)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             learning_rate=optimizer_cfg.Momentum.learning_rate,
                             momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecayDynamicLR, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="classifier",
                                 directory=save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    # Dynamic loss scaling starting at 2**32, halving on overflow.
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
    model.train(epoch_num, dataset, callbacks=callbacks)
def test_sit_auto_mix_precision_model_o0():
    """Verify that amp_level="O0" with a manually fp16-cast net inserts the
    expected number of Cast ops in the saved training and predict graphs.
    """
    input_data = np.random.randn(32, 3, 224, 224).astype(np.float32)
    dataset1 = FakeData(size=32,
                        batch_size=32,
                        image_size=(3, 224, 224),
                        num_classes=10,
                        fakedata_mode=FakeDataInitMode.OnesInit)
    dataset1.set_label_data_type(np.float16)
    # graph mode; dump IR so Cast ops can be counted from the saved graphs
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(save_graphs=True, save_graphs_path='./test_amp_o0')
    net = Net(3, 10)
    net.to_float(dtype.float16)
    opt = nn.Momentum(params=net.trainable_params(), learning_rate=0.001, momentum=0.0009)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=False)
    model = Model(net, loss, opt, amp_level="O0")
    model.train(1, dataset1, dataset_sink_mode=False)
    # NOTE(review): the expected Cast counts (17 / 11) are tied to this exact
    # network and MindSpore version; update together with the framework.
    contend = read_validateir_file('./test_amp_o0')
    castnum = re.findall("Cast", contend)
    assert len(castnum) == 17
    model.predict(Tensor(input_data))
    contend = read_validateir_file('./test_amp_o0')
    castnum = re.findall("Cast", contend)
    assert len(castnum) == 11
def main():
    """Main entrance for evaluating a hournas model on CIFAR-10.

    Parses CLI args, loads the checkpoint given by ``--ckpt`` and prints the
    eval metrics (loss, top-1, top-5).
    """
    args = parser.parse_args()
    print(sys.argv)
    #context.set_context(mode=context.GRAPH_MODE)
    context.set_context(mode=context.PYNATIVE_MODE)

    if args.GPU:
        context.set_context(device_target='GPU', device_id=args.device_id)

    # parse model argument
    # BUG FIX: the message previously said "Only Tinynet models are supported."
    # although the check accepts only names starting with "hournas".
    assert args.model.startswith(
        "hournas"), "Only hournas models are supported."
    net = nasbenchnet()
    cfg = edict({
        'image_height': args.image_size,
        'image_width': args.image_size,
    })
    cfg.batch_size = args.batch_size
    val_data_url = args.data_path
    val_dataset = create_dataset_cifar10(val_data_url, repeat_num=1,
                                         training=False, cifar_cfg=cfg)
    loss = LabelSmoothingCrossEntropy(smooth_factor=args.smoothing,
                                      num_classes=args.num_classes)
    # Keep the loss in fp32 even if the net runs in reduced precision.
    loss.add_flags_recursive(fp32=True, fp16=False)
    eval_metrics = {
        'Validation-Loss': Loss(),
        'Top1-Acc': Top1CategoricalAccuracy(),
        'Top5-Acc': Top5CategoricalAccuracy()
    }
    ckpt = load_checkpoint(args.ckpt)
    load_param_into_net(net, ckpt)
    net.set_train(False)
    model = Model(net, loss, metrics=eval_metrics)
    metrics = model.eval(val_dataset, dataset_sink_mode=False)
    print(metrics)
def do_eval(dataset=None, network=None, use_crf="", num_class=41, assessment_method="accuracy",
            data_file="", load_checkpoint_path="", vocab_file="", label_file="",
            tag_to_index=None, batch_size=1):
    """Evaluate a finetuned ERNIE sequence-labeling model with the SpanF1 metric.

    Raises ValueError when no finetune checkpoint path is provided.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    # Build the eval network (train=False); CRF mode driven by the string flag.
    net_for_pretraining = network(ernie_net_cfg, batch_size, False, num_class,
                                  use_crf=(use_crf.lower() == "true"),
                                  tag_to_index=tag_to_index)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)

    # Span-level F1 regardless of assessment_method; the method string is only
    # used for result printing below.
    callback = SpanF1((use_crf.lower() == "true"), tag_to_index)
    columns_list = ["input_ids", "input_mask", "token_type_id", "label_ids"]
    for data in dataset.create_dict_iterator(num_epochs=1):
        input_data = []
        for i in columns_list:
            input_data.append(data[i])
        input_ids, input_mask, token_type_id, label_ids = input_data
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        callback.update(logits, label_ids)
    print("==============================================================")
    eval_result_print(assessment_method, callback)
    print("==============================================================")
def do_eval(dataset=None, network=None, num_class=2, assessment_method="accuracy", load_checkpoint_path=""):
    """Evaluate a finetuned BERT classifier with the selected metric callback.

    Raises ValueError when no finetune checkpoint is given or the assessment
    method is unknown.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    net_for_pretraining = network(bert_net_cfg, False, num_class)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)

    if assessment_method == "accuracy":
        callback = Accuracy()
    elif assessment_method == "f1":
        callback = F1(False, num_class)
    elif assessment_method == "mcc":
        callback = MCC()
    elif assessment_method == "spearman_correlation":
        callback = Spearman_Correlation()
    else:
        raise ValueError(
            "Assessment method not supported, support: [accuracy, f1, mcc, spearman_correlation]"
        )

    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    # NOTE(review): unlike the sibling do_eval functions this iterates without
    # num_epochs=1 and wraps each column in Tensor() — confirm both are needed.
    for data in dataset.create_dict_iterator():
        input_data = []
        for i in columns_list:
            input_data.append(Tensor(data[i]))
        input_ids, input_mask, token_type_id, label_ids = input_data
        logits = model.predict(input_ids, input_mask, token_type_id, label_ids)
        callback.update(logits, label_ids)
    print("==============================================================")
    eval_result_print(assessment_method, callback)
    print("==============================================================")
def bert_predict():
    '''Build the BERT pretraining-eval model and its enwiki-512 dataset.

    Returns (model, dataset, eval_network).
    '''
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=device_id)

    dataset = get_enwiki_512_dataset(bert_net_cfg.batch_size, 1)

    # Eval network: training disabled, weights restored from the finetune ckpt.
    net_for_pretraining = BertPretrainEva(bert_net_cfg)
    net_for_pretraining.set_train(False)
    load_param_into_net(net_for_pretraining, load_checkpoint(cfg.finetune_ckpt))

    return Model(net_for_pretraining), dataset, net_for_pretraining
def do_eval(dataset=None, load_checkpoint_path="", eval_batch_size=1):
    """Run SQuAD inference with a finetuned BertSquad model.

    Returns a list of RawResult(unique_id, start_logits, end_logits), one per
    example. Raises ValueError when no finetune checkpoint is given.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    net = BertSquad(bert_net_cfg, False, 2)
    net.set_train(False)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(net, param_dict)
    model = Model(net)
    output = []
    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])
    columns_list = ["input_ids", "input_mask", "segment_ids", "unique_ids"]
    for data in dataset.create_dict_iterator(num_epochs=1):
        input_data = []
        for i in columns_list:
            input_data.append(data[i])
        input_ids, input_mask, segment_ids, unique_ids = input_data
        # Dummy position/impossibility tensors: the network signature requires
        # them, but they are unused at inference time.
        start_positions = Tensor([1], mstype.float32)
        end_positions = Tensor([1], mstype.float32)
        is_impossible = Tensor([1], mstype.float32)
        logits = model.predict(input_ids, input_mask, segment_ids,
                               start_positions, end_positions, unique_ids,
                               is_impossible)
        # logits: (unique_ids, start_logits, end_logits), each batched.
        ids = logits[0].asnumpy()
        start = logits[1].asnumpy()
        end = logits[2].asnumpy()
        for i in range(eval_batch_size):
            unique_id = int(ids[i])
            start_logits = [float(x) for x in start[i].flat]
            end_logits = [float(x) for x in end[i].flat]
            output.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))
    return output
def run_fasttext_infer():
    """Run FastText inference over the test buckets and print accuracy plus a
    per-class classification report.
    """
    dataset = load_infer_dataset(batch_size=config.batch_size,
                                 datafile=args.data_path,
                                 bucket=config.test_buckets)
    fasttext_model = FastText(config.vocab_size, config.embedding_dims, config.num_class)

    parameter_dict = load_checkpoint(args.model_ckpt)
    load_param_into_net(fasttext_model, parameter_dict=parameter_dict)

    ft_infer = FastTextInferCell(fasttext_model)
    model = Model(ft_infer)

    predictions = []
    target_sens = []
    for batch in dataset.create_dict_iterator(output_numpy=True, num_epochs=1):
        target_sens.append(batch['label_idx'])
        src_tokens = Tensor(batch['src_tokens'], mstype.int32)
        src_tokens_length = Tensor(batch['src_tokens_length'], mstype.int32)
        predicted_idx = model.predict(src_tokens, src_tokens_length)
        predictions.append(predicted_idx.asnumpy())

    # Deferred import so training-only runs do not require sklearn.
    from sklearn.metrics import accuracy_score, classification_report
    # Batches may be ragged (bucketed lengths), so flatten + manual merge
    # rather than a single np.concatenate.
    target_sens = np.array(target_sens).flatten()
    merge_target_sens = []
    for target_sen in target_sens:
        merge_target_sens.extend(target_sen)
    target_sens = merge_target_sens
    predictions = np.array(predictions).flatten()
    merge_predictions = []
    for prediction in predictions:
        merge_predictions.extend(prediction)
    predictions = merge_predictions
    acc = accuracy_score(target_sens, predictions)

    result_report = classification_report(target_sens, predictions,
                                          target_names=target_label1)
    print("********Accuracy: ", acc)
    print(result_report)
def eval_():
    """Evaluate ResNet-50 (1001 classes) on GPU and print top-1/top-5 accuracy.

    All settings (dtype, checkpoint, batch size, graph/pynative mode, dataset
    path) come from the module-level ``args_opt``.
    """
    # set args
    dev = "GPU"
    compute_type = str(args_opt.dtype).lower()
    ckpt_dir = str(args_opt.ckpt_path)
    total_batch = int(args_opt.batch_size)
    # init context
    if args_opt.mode == "GRAPH":
        mode = context.GRAPH_MODE
    else:
        mode = context.PYNATIVE_MODE
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False,
                             repeat_num=1, batch_size=total_batch, target=dev,
                             dtype=compute_type)
    # define net
    net = resnet(class_num=1001, dtype=compute_type)
    # load checkpoint
    param_dict = load_checkpoint(ckpt_dir)
    load_param_into_net(net, param_dict)
    net.set_train(False)
    # define loss with label smoothing (factor 0.1)
    loss = CrossEntropySmooth(sparse=True, reduction='mean',
                              smooth_factor=0.1, num_classes=1001)
    # define model
    model = Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    # eval model
    print("========START EVAL RESNET50 ON GPU ========")
    res = model.eval(dataset)
    print("result:", res, "ckpt=", ckpt_dir)
def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size, enable_hccl):
    """Per-device training worker: trains ResNet-50 and puts the recorded loss
    onto multiprocessing queue ``q``.

    Each worker chdirs into a directory named after its device id so graph and
    checkpoint artifacts do not collide.
    """
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    context.set_context(enable_hccl=enable_hccl)
    # Distributed env vars consumed by HCCL init below.
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
        # Fuse all-reduce ops split at index 140.
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    context.set_context(mode=context.GRAPH_MODE)
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    dataset = create_dataset(epoch_size, training=True, batch_size=batch_size,
                             rank_id=device_id, rank_size=device_num,
                             enable_hccl=enable_hccl)
    batch_num = dataset.get_dataset_size()
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[loss_cb])
    q.put(loss_cb.get_loss())
def epoch_end(self, run_context):
    """Evaluate the model and the EMA model at the end of each epoch.

    Tracks best accuracies, saves EMA checkpoints ("ema_best.ckpt",
    periodic "ema-<epoch>.ckpt", "ema_last.ckpt") plus the live model
    ("last.ckpt"), and prints the top-10 EMA epochs by accuracy.
    """
    cb_params = run_context.original_args()
    cur_epoch = cb_params.cur_epoch_num + self._start_epoch - 1
    save_ckpt = (cur_epoch % self.save_epoch == 0)

    # Refresh the EMA network from the shadow weights before evaluating it.
    load_nparray_into_net(self.ema_network, self.shadow)
    model = Model(self.network, loss_fn=self.loss_fn, metrics=self.eval_metrics)
    model_ema = Model(self.ema_network, loss_fn=self.loss_fn,
                      metrics=self.eval_metrics)
    acc = model.eval(self.eval_dataset, dataset_sink_mode=self.dataset_sink_mode)
    ema_acc = model_ema.eval(self.eval_dataset,
                             dataset_sink_mode=self.dataset_sink_mode)
    print("Model Accuracy:", acc)
    print("EMA-Model Accuracy:", ema_acc)

    # Shadow weights serialized in save_checkpoint's expected list-of-dicts form.
    output = [{"name": k, "data": Tensor(v)} for k, v in self.shadow.items()]

    self.ema_accuracy[cur_epoch] = ema_acc["Top1-Acc"]
    if self.best_ema_accuracy < ema_acc["Top1-Acc"]:
        self.best_ema_accuracy = ema_acc["Top1-Acc"]
        self.best_ema_epoch = cur_epoch
        save_checkpoint(output, "ema_best.ckpt")
    if self.best_accuracy < acc["Top1-Acc"]:
        self.best_accuracy = acc["Top1-Acc"]
        self.best_epoch = cur_epoch

    print("Best Model Accuracy: %s, at epoch %s" %
          (self.best_accuracy, self.best_epoch))
    print("Best EMA-Model Accuracy: %s, at epoch %s" %
          (self.best_ema_accuracy, self.best_ema_epoch))
    if save_ckpt:
        # Save the ema_model checkpoints
        ckpt = "{}-{}.ckpt".format("ema", cur_epoch)
        save_checkpoint(output, ckpt)
        save_checkpoint(output, "ema_last.ckpt")
        # Save the model checkpoints
        save_checkpoint(cb_params.train_network, "last.ckpt")

    print("Top 10 EMA-Model Accuracies: ")
    count = 0
    for epoch in sorted(self.ema_accuracy,
                        key=self.ema_accuracy.get, reverse=True):
        if count == 10:
            break
        # BUG FIX: the format string had a stray trailing ')'.
        print("epoch: %s, Top-1: %s" % (epoch, self.ema_accuracy[epoch]))
        count += 1
def bert_predict(Evaluation):
    '''Build a BERT evaluation model (optionally CRF-based) and its dataset.

    ``Evaluation`` is the eval-network class to instantiate. Returns
    (model, dataset).
    '''
    devid = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend", device_id=devid)
    dataset = get_dataset(bert_net_cfg.batch_size, 1)
    if cfg.use_crf:
        net_for_pretraining = Evaluation(bert_net_cfg, False,
                                         num_labels=len(tag_to_index),
                                         use_crf=True,
                                         tag_to_index=tag_to_index,
                                         dropout_prob=0.0)
    else:
        # NOTE(review): num_labels here is a module-level global, not derived
        # from tag_to_index — verify it matches the checkpoint.
        net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)
    return model, dataset
def train(args_opt, config):
    """Train the CNN-CTC model, optionally data-parallel and optionally
    resuming from ``config.CKPT_PATH``.

    Only device 0 attaches the checkpoint callback so a single rank writes
    checkpoints.
    """
    if args_opt.run_distribute:
        init()
        context.set_auto_parallel_context(parallel_mode="data_parallel")

    ds = dataset_creator(args_opt.run_distribute)

    net = CNNCTC_Model(config.NUM_CLASS, config.HIDDEN_SIZE, config.FINAL_FEATURE_WIDTH)
    net.set_train(True)

    if config.CKPT_PATH != '':
        param_dict = load_checkpoint(config.CKPT_PATH)
        load_param_into_net(net, param_dict)
        print('parameters loaded!')
    else:
        print('train from scratch...')

    criterion = ctc_loss()
    opt = mindspore.nn.RMSProp(params=net.trainable_params(),
                               centered=True,
                               learning_rate=config.LR_PARA,
                               momentum=config.MOMENTUM,
                               loss_scale=config.LOSS_SCALE)

    net = WithLossCell(net, criterion)
    # Fixed loss scale matching the optimizer's loss_scale; no overflow updates.
    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        config.LOSS_SCALE, False)
    model = Model(net, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2")

    callback = LossCallBack()
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.SAVE_CKPT_PER_N_STEP,
        keep_checkpoint_max=config.KEEP_CKPT_MAX_NUM)
    ckpoint_cb = ModelCheckpoint(prefix="CNNCTC", config=config_ck,
                                 directory=config.SAVE_PATH)

    if args_opt.device_id == 0:
        model.train(config.TRAIN_EPOCHS, ds,
                    callbacks=[callback, ckpoint_cb],
                    dataset_sink_mode=False)
    else:
        model.train(config.TRAIN_EPOCHS, ds,
                    callbacks=[callback],
                    dataset_sink_mode=False)
def bert_predict(Evaluation):
    '''Build a BERT evaluation model (optionally CRF-based) and its dataset.

    Same as the device-aware variant elsewhere in the project but without any
    context setup; assumes the caller configured the device. Returns
    (model, dataset).
    '''
    dataset = get_dataset(bert_net_cfg.batch_size, 1)
    if cfg.use_crf:
        net_for_pretraining = Evaluation(bert_net_cfg, False,
                                         num_labels=len(tag_to_index),
                                         use_crf=True,
                                         tag_to_index=tag_to_index,
                                         dropout_prob=0.0)
    else:
        # NOTE(review): num_labels is a module-level global here — verify it
        # matches the checkpoint being loaded.
        net_for_pretraining = Evaluation(bert_net_cfg, False, num_labels)
    net_for_pretraining.set_train(False)
    param_dict = load_checkpoint(cfg.finetune_ckpt)
    load_param_into_net(net_for_pretraining, param_dict)
    model = Model(net_for_pretraining)
    return model, dataset
epoch_size = config.epoch_size # get learning rate lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size)) if args_opt.pretrain_ckpt == "" or args_opt.freeze_layer != "backbone": loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, \ config.weight_decay, config.loss_scale) model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale) cb = config_ckpoint(config, lr, step_size) print("============== Starting Training ==============") model.train(epoch_size, dataset, callbacks=cb) print("============== End Training ==============") else: opt = Momentum(filter(lambda x: x.requires_grad, head_net.get_parameters()), lr, config.momentum, config.weight_decay) network = WithLossCell(head_net, loss) network = TrainOneStepCell(network, opt) network.set_train() features_path = args_opt.dataset_path + '_features' idx_list = list(range(step_size))
def train(cloud_args=None):
    """training process

    Full training driver: sets up the device context, optional data-parallel
    distribution, dataset, backbone network with pretrained weights, Momentum
    optimizer with scheduled LR, loss scaling, and runs ``model.train`` with
    progress and (rank-gated) checkpoint callbacks.
    """
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          gradients_mean=True)
    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size,
                                        num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, num_classes=args.num_classes,
                          platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss; label smoothing disabled unless requested
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                        num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale_manager,
                  metrics={'acc'}, amp_level="O3")

    # checkpoint save — only ranks flagged for saving attach the callback
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks,
                dataset_sink_mode=True)
elif args.lr_scheduler == 'step': lr = lr_steps(0, lr_init=args.lr_init, lr_max=args.lr_max, warmup_epochs=args.warmup_epochs, total_epochs=args.max_epoch, steps_per_epoch=batch_num) else: raise NotImplementedError(args.lr_scheduler) # optimizer opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr), momentum=args.momentum, weight_decay=args.weight_decay, loss_scale=args.loss_scale) if args.dataset == "cifar10": loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False) model = Model(network, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None) else: if not args.label_smooth: args.label_smooth_factor = 0.0 loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes) loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False) model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager, amp_level="O2") # checkpoint save progress_cb = ProgressMonitor(args) callbacks = [progress_cb,] if args.rank_save_ckpt_flag: ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch, keep_checkpoint_max=args.ckpt_save_max) ckpt_cb = ModelCheckpoint(config=ckpt_config,
lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num) opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay) loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False) if device_target == "Ascend": model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None) ckpt_save_dir = "./" else: # GPU model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O2", keep_batchnorm_fp32=True, loss_scale_manager=None) ckpt_save_dir = "./ckpt_" + str(get_rank()) + "/" config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5,
def test_train():
    '''
    finetune function

    Dispatches on ``cfg.task`` (NER / SQUAD / classification) to build the
    loss-bearing network, chooses the optimizer from ``cfg.optimizer``
    (AdamWeightDecayDynamicLR / Lamb / Momentum), loads the pretraining
    checkpoint and finetunes with dynamic loss scaling.
    '''
    target = args_opt.device_target
    if target == "Ascend":
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend", device_id=devid)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    else:
        raise Exception("Target error, GPU or Ascend is supported.")
    #BertCLSTrain for classification
    #BertNERTrain for sequence labeling
    if cfg.task == 'NER':
        if cfg.use_crf:
            netwithloss = BertNER(bert_net_cfg, True,
                                  num_labels=len(tag_to_index),
                                  use_crf=True,
                                  tag_to_index=tag_to_index,
                                  dropout_prob=0.1)
        else:
            netwithloss = BertNER(bert_net_cfg, True,
                                  num_labels=cfg.num_labels,
                                  dropout_prob=0.1)
    elif cfg.task == 'SQUAD':
        netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)
    else:
        netwithloss = BertCLS(bert_net_cfg, True,
                              num_labels=cfg.num_labels, dropout_prob=0.1)

    if cfg.task == 'SQUAD':
        dataset = get_squad_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    else:
        dataset = get_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    if cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=steps_per_epoch * cfg.epoch_num,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps)
    elif cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=steps_per_epoch * cfg.epoch_num,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         weight_decay=cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
                         decay_filter=cfg.Lamb.decay_filter)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported.")
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix,
                                 directory=cfg.ckpt_dir, config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    load_param_into_net(netwithloss, param_dict)

    # Dynamic loss scaling starting at 2**32.
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2, scale_window=1000)
    if cfg.task == 'SQUAD':
        netwithgrads = BertSquadCell(netwithloss, optimizer=optimizer,
                                     scale_update_cell=update_cell)
    else:
        netwithgrads = BertFinetuneCell(netwithloss, optimizer=optimizer,
                                        scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset, callbacks=[LossCallBack(), ckpoint_cb])
loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) if cfg.is_dynamic_loss_scale: loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000) else: loss_scale_manager = FixedLossScaleManager( cfg.loss_scale, drop_overflow_update=False) model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, amp_level="O3", loss_scale_manager=loss_scale_manager) config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 50, keep_checkpoint_max=cfg.keep_checkpoint_max) time_cb = TimeMonitor(data_size=batch_num) ckpt_save_dir = "./ckpt_" + str(rank) + "/" ckpoint_cb = ModelCheckpoint(prefix="train_tinydarknet_" + args_opt.dataset_name, directory=ckpt_save_dir, config=config_ck) loss_cb = LossMonitor() model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
def test_mobilenetv2_quant():
    """Quantization-aware-training smoke test for MobileNetV2 on Ascend.

    Builds the fused network, converts it to a QAT graph, trains for the
    configured number of epochs, and asserts that the mean per-step loss
    stays below the expected threshold.
    """
    set_seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    run_cfg = config_ascend_quant
    print("training configure: {}".format(run_cfg))
    num_epochs = run_cfg.epoch_size

    # Network under test.
    network = mobilenetV2(num_classes=run_cfg.num_classes)

    # Label-smoothed loss when configured, plain softmax cross-entropy otherwise.
    if run_cfg.label_smooth > 0:
        criterion = CrossEntropyWithLabelSmooth(smooth_factor=run_cfg.label_smooth,
                                                num_classes=run_cfg.num_classes)
    else:
        criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    # Training dataset and its per-epoch step count.
    train_ds = create_dataset(dataset_path=dataset_path, config=run_cfg,
                              repeat_num=1, batch_size=run_cfg.batch_size)
    steps_per_epoch = train_ds.get_dataset_size()

    # Fold batch norms and insert fake-quant nodes into the fused graph.
    qat = QuantizationAwareTraining(bn_fold=True,
                                    per_channel=[True, False],
                                    symmetric=[True, False])
    network = qat.quantize(network)

    # Warmup-then-decay learning-rate schedule, offset by start_epoch.
    lr_schedule = Tensor(get_lr(global_step=run_cfg.start_epoch * steps_per_epoch,
                                lr_init=0,
                                lr_end=0,
                                lr_max=run_cfg.lr,
                                warmup_epochs=run_cfg.warmup_epochs,
                                total_epochs=num_epochs + run_cfg.start_epoch,
                                steps_per_epoch=steps_per_epoch))

    # Momentum over trainable parameters only.
    optimizer = nn.Momentum(filter(lambda p: p.requires_grad, network.get_parameters()),
                            lr_schedule, run_cfg.momentum, run_cfg.weight_decay)

    trainer = Model(network, loss_fn=criterion, optimizer=optimizer)

    print("============== Starting Training ==============")
    monitor = Monitor(lr_init=lr_schedule.asnumpy(),
                      step_threshold=run_cfg.step_threshold)
    trainer.train(num_epochs, train_ds, callbacks=[monitor], dataset_sink_mode=False)
    print("============== End Training ==============")

    # The averaged step loss must end up below the expected bound.
    expect_avg_step_loss = 2.32
    avg_step_loss = np.mean(np.array(monitor.losses))
    print("average step loss:{}".format(avg_step_loss))
    assert avg_step_loss < expect_avg_step_loss
def test_resnet50_quant():
    """QAT smoke test for ResNet-50 on Ascend.

    Converts the fused network into a quantization-aware one, trains with a
    cosine learning-rate schedule, and checks the average per-step loss
    against the expected upper bound.
    """
    set_seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    qcfg = config_quant
    print("training configure: {}".format(qcfg))
    total_epochs = qcfg.epoch_size

    # Network under test, switched into training mode.
    backbone = resnet50_quant(class_num=qcfg.class_num)
    backbone.set_train(True)

    # Zero the smoothing factor when label smoothing is disabled.
    if not qcfg.use_label_smooth:
        qcfg.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=qcfg.label_smooth_factor,
                             num_classes=qcfg.class_num)

    # Training dataset and its per-epoch step count.
    train_ds = create_dataset(dataset_path=dataset_path, config=qcfg,
                              repeat_num=1, batch_size=qcfg.batch_size)
    steps_per_epoch = train_ds.get_dataset_size()

    # Convert the fused graph to a quantization-aware one.
    backbone = quant.convert_quant_network(backbone,
                                           bn_fold=True,
                                           per_channel=[True, False],
                                           symmetric=[True, False])

    # Cosine-decay learning-rate schedule with warmup.
    lr_tensor = Tensor(get_lr(lr_init=qcfg.lr_init,
                              lr_end=0.0,
                              lr_max=qcfg.lr_max,
                              warmup_epochs=qcfg.warmup_epochs,
                              total_epochs=qcfg.epoch_size,
                              steps_per_epoch=steps_per_epoch,
                              lr_decay_mode='cosine'))

    # Momentum with the configured loss scale applied at the optimizer.
    optimizer = Momentum(filter(lambda p: p.requires_grad, backbone.get_parameters()),
                         lr_tensor, qcfg.momentum, qcfg.weight_decay, qcfg.loss_scale)

    trainer = Model(backbone, loss_fn=criterion, optimizer=optimizer)

    print("============== Starting Training ==============")
    monitor = Monitor(lr_init=lr_tensor.asnumpy(), step_threshold=qcfg.step_threshold)
    trainer.train(total_epochs, train_ds, callbacks=[monitor], dataset_sink_mode=False)
    print("============== End Training ==============")

    # Averaged step loss must be below the expected bound.
    expect_avg_step_loss = 2.40
    avg_step_loss = np.mean(np.array(monitor.losses))
    print("average step loss:{}".format(avg_step_loss))
    assert avg_step_loss < expect_avg_step_loss
def run_general_distill():
    """Run the TinyBERT general-distillation phase.

    Parses command-line arguments, configures the (optionally distributed)
    Ascend/GPU context, builds the teacher/student network-with-loss, and
    trains it with AdamWeightDecay under dynamic loss scaling, saving
    student checkpoints along the way.

    Fixes in this revision:
    - removed a duplicated `context.set_context(...)` call (the same call
      appeared twice back-to-back);
    - `other_params` filtered with `cfg.AdamWeightDecay.decay_filter` while
      every other reference in this function uses `common_cfg` — unified on
      `common_cfg` so both parameter groups use the same filter;
    - `--epoch_size` default was the string "3" with help text claiming
      "default is 1"; now an int with accurate help. `--save_ckpt_step` and
      `--max_ckpt_num` carried copy-pasted "Enable data sink" help text.
    """
    parser = argparse.ArgumentParser(description='tinybert general distill')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=3,
                        help="Epoch size, default is 3.")
    parser.add_argument("--device_id", type=int, default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--save_ckpt_step", type=int, default=100,
                        help="Steps between checkpoint saves, default is 100.")
    parser.add_argument("--max_ckpt_num", type=int, default=1,
                        help="Maximum number of checkpoints to keep, default is 1.")
    parser.add_argument("--do_shuffle", type=str, default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_ckpt_path", type=str, default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_teacher_ckpt_path", type=str, default="",
                        help="Load checkpoint file path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    # Single context setup (this call was previously duplicated verbatim).
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")

    # Timestamped checkpoint directory under the requested root.
    save_ckpt_dir = os.path.join(
        args_opt.save_ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    if not os.path.exists(save_ckpt_dir):
        os.makedirs(save_ckpt_dir)

    if args_opt.distribute == "true":
        D.init('hccl')
        device_num = args_opt.device_num
        rank = args_opt.device_id % device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
    else:
        rank = 0
        device_num = 1

    # Teacher/student pair wrapped with the distillation loss.
    netwithloss = BertNetworkWithLoss_gd(
        teacher_config=bert_teacher_net_cfg,
        teacher_ckpt=args_opt.load_teacher_ckpt_path,
        student_config=bert_student_net_cfg,
        is_training=True,
        use_one_hot_embeddings=False)

    dataset = create_tinybert_dataset('gd', bert_teacher_net_cfg.batch_size,
                                      device_num, rank, args_opt.do_shuffle,
                                      args_opt.data_dir, args_opt.schema_dir)
    dataset_size = dataset.get_dataset_size()
    print('dataset size: ', dataset_size)

    # With data sink, Model.train counts sink iterations instead of epochs.
    if args_opt.enable_data_sink == "true":
        repeat_count = args_opt.epoch_size * dataset_size // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.epoch_size
        time_monitor_steps = dataset_size

    # Polynomial-decay LR with 10% warmup over the whole run.
    lr_schedule = BertLearningRate(
        learning_rate=common_cfg.AdamWeightDecay.learning_rate,
        end_learning_rate=common_cfg.AdamWeightDecay.end_learning_rate,
        warmup_steps=int(dataset_size * args_opt.epoch_size / 10),
        decay_steps=int(dataset_size * args_opt.epoch_size),
        power=common_cfg.AdamWeightDecay.power)

    # Split parameters into decayed and non-decayed groups using the SAME
    # filter (previously the second group mistakenly read it from `cfg`).
    params = netwithloss.trainable_params()
    decay_params = list(filter(common_cfg.AdamWeightDecay.decay_filter, params))
    other_params = list(
        filter(lambda x: not common_cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{
        'params': decay_params,
        'weight_decay': common_cfg.AdamWeightDecay.weight_decay
    }, {
        'params': other_params,
        'weight_decay': 0.0
    }, {
        'order_params': params
    }]
    optimizer = AdamWeightDecay(group_params,
                                learning_rate=lr_schedule,
                                eps=common_cfg.AdamWeightDecay.eps)

    callback = [
        TimeMonitor(time_monitor_steps),
        LossCallBack(),
        ModelSaveCkpt(netwithloss.bert, args_opt.save_ckpt_step,
                      args_opt.max_ckpt_num, save_ckpt_dir)
    ]
    update_cell = DynamicLossScaleUpdateCell(
        loss_scale_value=common_cfg.loss_scale_value,
        scale_factor=common_cfg.scale_factor,
        scale_window=common_cfg.scale_window)
    netwithgrads = BertTrainWithLossScaleCell(netwithloss,
                                              optimizer=optimizer,
                                              scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(repeat_count, dataset, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
def train_on_ascend():
    """Quantization-aware training entry for MobileNetV2 on Ascend.

    Relies on module-level state set up outside this function: `args_opt`,
    `rank_id`, `device_id`, `rank_size` and `run_distribute`.
    """
    config = config_ascend_quant
    print("training args: {}".format(args_opt))
    print("training configure: {}".format(config))
    print("parallel args: rank_id {}, device_id {}, rank_size {}".format(
        rank_id, device_id, rank_size))
    epoch_size = config.epoch_size

    # distribute init
    if run_distribute:
        # NOTE(review): init() is called after set_auto_parallel_context here;
        # confirm this ordering is intended for this MindSpore version.
        context.set_auto_parallel_context(
            device_num=rank_size,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()

    # define network
    network = mobilenetV2(num_classes=config.num_classes)

    # define loss: label-smoothed variant when a smoothing factor is set.
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
                                           num_classes=config.num_classes)
    else:
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             config=config,
                             device_target=args_opt.device_target,
                             repeat_num=1,
                             batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    # load pre trained ckpt — float weights are mapped into the fused
    # (not-yet-quantized) network.
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])

    # get learning rate: warmup-then-decay schedule offset by start_epoch.
    lr = Tensor(
        get_lr(global_step=config.start_epoch * step_size,
               lr_init=0,
               lr_end=0,
               lr_max=config.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=epoch_size + config.start_epoch,
               steps_per_epoch=step_size))

    # define optimization over trainable parameters only.
    opt = nn.Momentum(
        filter(lambda x: x.requires_grad, network.get_parameters()),
        lr,
        config.momentum,
        config.weight_decay)

    # define model
    model = Model(network, loss_fn=loss, optimizer=opt)

    print("============== Starting Training ==============")
    # Only rank 0 monitors progress and (optionally) writes checkpoints;
    # other ranks train with callbacks=None.
    callback = None
    if rank_id == 0:
        callback = [Monitor(lr_init=lr.asnumpy())]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV2",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            callback += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=callback)
    print("============== End Training ==============")