Example 1
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          parameter_broadcast=True, mirror_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, do not set a fixed loss scale in the Momentum optimizer

    # choose whether only the master rank or every rank saves checkpoints (compatible with model parallel)
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1,
                                        args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)


    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O3")
    else:
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O2")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
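
The warmup_step_lr / warmup_cosine_annealing_lr helpers called above are defined elsewhere in the repository and are not shown here. Purely as an illustration, a per-step schedule with linear warmup followed by cosine annealing could be sketched as below; the function name and the exact decay policy are assumptions, not the repository's implementation.

import numpy as np

def warmup_cosine_annealing_lr_sketch(base_lr, steps_per_epoch, warmup_epochs,
                                      max_epoch, t_max, eta_min=0.0):
    """Hypothetical sketch: one LR value per step, linear warmup then cosine annealing."""
    total_steps = steps_per_epoch * max_epoch
    warmup_steps = steps_per_epoch * warmup_epochs
    lr_each_step = []
    for step in range(total_steps):
        if warmup_steps > 0 and step < warmup_steps:
            cur_lr = base_lr * (step + 1) / warmup_steps  # linear warmup
        else:
            cur_epoch = step // steps_per_epoch
            cur_lr = eta_min + (base_lr - eta_min) * \
                (1.0 + np.cos(np.pi * cur_epoch / t_max)) / 2.0  # cosine decay per epoch
        lr_each_step.append(cur_lr)
    return np.array(lr_each_step, dtype=np.float32)
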
Example 2
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=["true", "false"],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true", choices=["true", "false"],
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="1", help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--accumulation_steps", type=int, default="1",
                        help="Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, "
                                                                                "default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1, help="Training Steps, default is -1, "
                                                                    "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                          device_num=device_num)
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir, config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)

        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                               scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                                       scale_update_cell=update_cell,
                                                                       accumulation_steps=accumulation_steps,
                                                                       enable_global_norm=cfg.enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
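
The boolean-like options in this script are plain strings compared against "true" (for example dataset_sink_mode=(args_opt.enable_data_sink == "true")). When adapting it, a small hypothetical argparse type such as str2bool below converts these flags to real booleans up front; this helper is a sketch, not part of the original script.

import argparse

def str2bool(value):
    """Hypothetical helper: map common true/false spellings to a real bool."""
    if isinstance(value, bool):
        return value
    if value.lower() in ("true", "t", "yes", "1"):
        return True
    if value.lower() in ("false", "f", "no", "0"):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % value)

# usage sketch:
# parser.add_argument("--do_shuffle", type=str2bool, default=True,
#                     help="Enable shuffle for dataset, default is true.")
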
Example 3
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            tokenizer_file="",
            top_k=None,
            top_p=None,
            temperature=None,
            generate_length=None):
    """
    Do evaluation on summarization
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")
    if metric.lower() == "rouge":
        print("Prepare to calculate the Rouge score ...")
        callback = Rouge()

        gpt2_loss = network(config=gpt2_net_cfg,
                            is_training=False,
                            use_one_hot_embeddings=False)
        gpt2_loss.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)

        reorganized_param_dict = modify_paramdict(param_dict,
                                                  mode=eval_type,
                                                  model_prefix="gpt2.")
        load_param_into_net(gpt2_loss, reorganized_param_dict)

        # load nn.Cell into Model and initiate tokenizer and Sample
        model = Model(gpt2_loss)
        tokenizer = Tokenizer(vocab_file=tokenizer_file + 'gpt2-vocab.json',
                              merge_file=tokenizer_file + 'gpt2-merges.txt')

        # load data and process text generation
        columns_list = ["input_ids", "input_mask", "label_ids"]

        summarization_generator = GenerateForSummarization(
            model,
            config=gpt2_net_cfg,
            tokenizer=tokenizer,
            select_sentence=3,
            eval_type=eval_type,
            topk=top_k,
            topp=float(top_p),
            temperature=float(temperature),
            generate_length=generate_length)
        num_data = 1
        print(
            "==================== [Summrization] Testing ====================")
        for data in dataset.create_dict_iterator():
            input_data = []
            for value in columns_list:
                input_data.append(data[value])
            input_ids, _, label_ids = input_data
            print(" | [ROUGE] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))
            print("input_ids shape: {}".format(input_ids.shape))
            print("label_ids shape: {}".format(label_ids.shape))

            hypothesis, ref = summarization_generator.generate_for_summarization(
                input_ids)
            if ref[0] == '' or ref[0] is None:
                print("Sorry ref_list is None, skip it!")
                continue

            print("REF str:\n ", ref, "\nHYPO str:\n", hypothesis, "\n")
            for batch_idx in range(gpt2_net_cfg.batch_size):
                hypothesis[batch_idx] = clean_hypo(hypothesis[batch_idx])
            for batch_idx in range(gpt2_net_cfg.batch_size):
                hypothesis[batch_idx] = hypothesis[batch_idx].lower()
                ref[batch_idx] = ref[batch_idx].lower()

            callback.update(hypothesis, ref)
            num_data += 1

        print("\n\n")
        print("**********************************************************")
        eval_result_print(metric, callback)
        print("******************** Testing Finished ********************")

    else:
        raise ValueError(
            "metric method not supported in summarization, support: [Rouge]")
Example 4
def maskrcnn_eval(dataset_path, ckpt_path, ann_file):
    """MaskRcnn evaluation."""
    ds = create_maskrcnn_dataset(dataset_path,
                                 batch_size=config.test_batch_size,
                                 is_training=False)

    net = Mask_Rcnn_Resnet50(config)
    param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(net, param_dict)
    net.set_train(False)

    eval_iter = 0
    total = ds.get_dataset_size()
    outputs = []
    dataset_coco = COCO(ann_file)

    print("\n========================================\n")
    print("total images num: ", total)
    print("Processing, please wait a moment.")
    max_num = 128
    for data in ds.create_dict_iterator(output_numpy=True, num_epochs=1):
        eval_iter = eval_iter + 1

        img_data = data['image']
        img_metas = data['image_shape']
        gt_bboxes = data['box']
        gt_labels = data['label']
        gt_num = data['valid_num']
        gt_mask = data["mask"]

        start = time.time()
        # run net
        output = net(Tensor(img_data), Tensor(img_metas), Tensor(gt_bboxes),
                     Tensor(gt_labels), Tensor(gt_num), Tensor(gt_mask))
        end = time.time()
        print("Iter {} cost time {}".format(eval_iter, end - start))

        # output
        all_bbox = output[0]
        all_label = output[1]
        all_mask = output[2]
        all_mask_fb = output[3]

        for j in range(config.test_batch_size):
            all_bbox_squee = np.squeeze(all_bbox.asnumpy()[j, :, :])
            all_label_squee = np.squeeze(all_label.asnumpy()[j, :, :])
            all_mask_squee = np.squeeze(all_mask.asnumpy()[j, :, :])
            all_mask_fb_squee = np.squeeze(all_mask_fb.asnumpy()[j, :, :, :])

            all_bboxes_tmp_mask = all_bbox_squee[all_mask_squee, :]
            all_labels_tmp_mask = all_label_squee[all_mask_squee]
            all_mask_fb_tmp_mask = all_mask_fb_squee[all_mask_squee, :, :]

            if all_bboxes_tmp_mask.shape[0] > max_num:
                inds = np.argsort(-all_bboxes_tmp_mask[:, -1])
                inds = inds[:max_num]
                all_bboxes_tmp_mask = all_bboxes_tmp_mask[inds]
                all_labels_tmp_mask = all_labels_tmp_mask[inds]
                all_mask_fb_tmp_mask = all_mask_fb_tmp_mask[inds]

            bbox_results = bbox2result_1image(all_bboxes_tmp_mask,
                                              all_labels_tmp_mask,
                                              config.num_classes)
            segm_results = get_seg_masks(all_mask_fb_tmp_mask,
                                         all_bboxes_tmp_mask,
                                         all_labels_tmp_mask, img_metas[j],
                                         True, config.num_classes)
            outputs.append((bbox_results, segm_results))

    eval_types = ["bbox", "segm"]
    result_files = results2json(dataset_coco, outputs, "./results.pkl")
    coco_eval(result_files, eval_types, dataset_coco, single_result=False)
Example 5
                                        device_num=device_num,
                                        rank_id=rank)

    dataset_size = dataset.get_dataset_size()
    print("Create dataset done!")

    net = Faster_Rcnn_Resnet50(config=config)
    net = net.set_train()

    load_path = args_opt.pre_trained
    if load_path != "":
        param_dict = load_checkpoint(load_path)
        for item in list(param_dict.keys()):
            if not item.startswith('backbone'):
                param_dict.pop(item)
        load_param_into_net(net, param_dict)

    loss = LossNet()
    lr = Tensor(dynamic_lr(config, rank_size=device_num), mstype.float32)

    opt = SGD(params=net.trainable_params(),
              learning_rate=lr,
              momentum=config.momentum,
              weight_decay=config.weight_decay,
              loss_scale=config.loss_scale)
    net_with_loss = WithLossCell(net, loss)
    if args_opt.run_distribute:
        net = TrainOneStepCell(net_with_loss,
                               net,
                               opt,
                               sens=config.loss_scale,
Example 6
def main():
    args_opt = get_args()
    rank = 0
    device_num = 1
    if args_opt.run_platform == "CPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.run_platform, device_id=args_opt.device_id)
        if args_opt.distribute:
            device_num = args_opt.device_num
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                              device_num=device_num)
            init()
            if config.model == "ssd_resnet50_fpn":
                context.set_auto_parallel_context(all_reduce_fusion_config=[90, 183, 279])
            else:
                context.set_auto_parallel_context(all_reduce_fusion_config=[29, 58, 89])
            rank = get_rank()

    mindrecord_file = create_mindrecord(args_opt.dataset, "ssd.mindrecord", True)

    if args_opt.only_create_dataset:
        return

    loss_scale = float(args_opt.loss_scale)
    if args_opt.run_platform == "CPU":
        loss_scale = 1.0

    # When creating the MindDataset, use the first mindrecord file, such as ssd.mindrecord0.
    use_multiprocessing = (args_opt.run_platform != "CPU")
    dataset = create_ssd_dataset(mindrecord_file, repeat_num=1, batch_size=args_opt.batch_size,
                                 device_num=device_num, rank=rank, use_multiprocessing=use_multiprocessing)

    dataset_size = dataset.get_dataset_size()
    print(f"Create dataset done! dataset size is {dataset_size}")
    ssd = ssd_model_build(args_opt)
    if ("use_float16" in config and config.use_float16) or args_opt.run_platform == "GPU":
        ssd.to_float(dtype.float16)
    net = SSDWithLossCell(ssd, config)

    # checkpoint
    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
    save_ckpt_path = './ckpt_' + str(rank) + '/'
    ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=save_ckpt_path, config=ckpt_config)

    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        if args_opt.filter_weight:
            filter_checkpoint_parameter_by_list(param_dict, config.checkpoint_filter_list)
        load_param_into_net(net, param_dict, True)

    lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
                       lr_init=config.lr_init, lr_end=config.lr_end_rate * args_opt.lr, lr_max=args_opt.lr,
                       warmup_epochs=config.warmup_epochs,
                       total_epochs=args_opt.epoch_size,
                       steps_per_epoch=dataset_size))

    if "use_global_norm" in config and config.use_global_norm:
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                          config.momentum, config.weight_decay, 1.0)
        net = TrainingWrapper(net, opt, loss_scale, True)
    else:
        opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                          config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)


    callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
    model = Model(net)
    dataset_sink_mode = False
    if args_opt.mode == "sink" and args_opt.run_platform != "CPU":
        print("In sink mode, one epoch return a loss.")
        dataset_sink_mode = True
    print("Start train SSD, the first epoch will be slower because of the graph compilation.")
    model.train(args_opt.epoch_size, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
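
filter_checkpoint_parameter_by_list is defined elsewhere in the SSD code. Assuming it drops checkpoint entries whose names contain any key in config.checkpoint_filter_list (in-place, as its call site suggests), a minimal sketch could look like the following; treat the name and behavior as assumptions.

def filter_checkpoint_parameter_by_list_sketch(param_dict, filter_list):
    """Hypothetical sketch: remove checkpoint entries whose names match a filtered key."""
    for key in list(param_dict.keys()):
        for name in filter_list:
            if name in key:
                print("Delete parameter from checkpoint:", key)
                del param_dict[key]
                break
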
Example 7
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(save_graphs=True, reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset, repeat_count = create_transformer_dataset(epoch_count=args.epoch_size, rank_size=device_num,
                                                       rank_id=rank_id, do_shuffle=args.do_shuffle,
                                                       enable_data_sink=args.enable_data_sink,
                                                       dataset_path=args.data_path)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
    else:
        parameter_dict = {}
        params = netwithloss.trainable_params()
        for param in params:
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    parameter_dict[name] = Parameter(one_weight(value.asnumpy().shape), name=name)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    parameter_dict[name] = Parameter(zero_weight(value.asnumpy().shape), name=name)
                elif "embedding" in name:
                    parameter_dict[name] = Parameter(normal_weight(value.asnumpy().shape,
                                                                   transformer_net_cfg.hidden_size), name=name)
                else:
                    parameter_dict[name] = Parameter(weight_variable(value.asnumpy().shape), name=name)
    load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                       keep_checkpoint_max=args.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path, config=ckpt_config)
        callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(repeat_count, dataset, callbacks=callbacks, dataset_sink_mode=(args.enable_data_sink == "true"))
Example 8
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt",
                        type=str,
                        default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale",
                        type=str,
                        default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--checkpoint_path",
                        type=str,
                        default="",
                        help="Checkpoint file path")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--save_checkpoint_num",
                        type=int,
                        default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)

    if args_opt.distribute == "true":
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        D.init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)

    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() *
                         ds.get_repeat_count(),
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]"
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
Example 9
        pad_num = int(np.ceil(cfg.embed_size / 16) * 16 - cfg.embed_size)
        if pad_num > 0:
            embedding_table = np.pad(embedding_table, [(0, 0), (0, pad_num)],
                                     'constant')
        cfg.embed_size = int(np.ceil(cfg.embed_size / 16) * 16)
    network = SentimentNet(vocab_size=embedding_table.shape[0],
                           embed_size=cfg.embed_size,
                           num_hiddens=cfg.num_hiddens,
                           num_layers=cfg.num_layers,
                           bidirectional=cfg.bidirectional,
                           num_classes=cfg.num_classes,
                           weight=Tensor(embedding_table),
                           batch_size=cfg.batch_size)
    # pre_trained
    if args.pre_trained:
        load_param_into_net(network, load_checkpoint(args.pre_trained))

    ds_train = lstm_create_dataset(args.preprocess_path,
                                   cfg.batch_size,
                                   1,
                                   device_num=device_num,
                                   rank=rank)

    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    if cfg.dynamic_lr:
        lr = Tensor(
            get_lr(global_step=cfg.global_step,
                   lr_init=cfg.lr_init,
                   lr_end=cfg.lr_end,
                   lr_max=cfg.lr_max,
                   warmup_epochs=cfg.warmup_epochs,
Example 10
def train():
    """training CenterNet"""
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(save_graphs=False)

    ckpt_save_dir = args_opt.save_checkpoint_path
    rank = 0
    device_num = 1
    num_workers = 8
    if args_opt.device_target == "Ascend":
        context.set_context(enable_auto_mixed_precision=False)
        context.set_context(device_id=args_opt.device_id)
        if args_opt.distribute == "true":
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'

            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                              device_num=device_num)
            _set_parallel_all_reduce_split()
    else:
        args_opt.distribute = "false"
        args_opt.need_profiler = "false"
        args_opt.enable_data_sink = "false"

    # Start creating the dataset.
    # mindrecord files will be generated at args_opt.mindrecord_dir such as centernet.mindrecord0, 1, ... file_num.
    logger.info("Begin creating dataset for CenterNet")
    coco = COCOHP(dataset_config, run_mode="train", net_opt=net_config, save_path=args_opt.save_result_dir)
    dataset = coco.create_train_dataset(args_opt.mindrecord_dir, args_opt.mindrecord_prefix,
                                        batch_size=train_config.batch_size, device_num=device_num, rank=rank,
                                        num_parallel_workers=num_workers, do_shuffle=args_opt.do_shuffle == 'true')
    dataset_size = dataset.get_dataset_size()
    logger.info("Create dataset done!")

    net_with_loss = CenterNetLossCell(net_config)

    args_opt.train_steps = args_opt.epoch_size * dataset_size
    logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(net_with_loss, dataset_size)

    enable_static_time = args_opt.device_target == "CPU"
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(dataset_size, enable_static_time)]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_centernet',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir, config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)
    if args_opt.device_target == "Ascend":
        net_with_grads = CenterNetWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                    sens=train_config.loss_scale_value)
    else:
        net_with_grads = CenterNetWithoutLossScaleCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(args_opt.epoch_size, dataset, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"), sink_size=args_opt.data_sink_steps)
Example 11
def test(cloud_args=None):
    """
    network eval function. Get top1 and top5 ACC from classification.
    The result will be saved at [./outputs] by default.
    """
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    if os.path.isdir(args.pretrained):
        models = list(glob.glob(os.path.join(args.pretrained, '*.ckpt')))

        f = lambda x: -1 * int(
            os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')
            [0])

        args.models = sorted(models, key=f)
    else:
        args.models = [
            args.pretrained,
        ]

    for model in args.models:
        de_dataset = classification_dataset(args.data_dir,
                                            image_size=args.image_size,
                                            per_batch_size=args.per_batch_size,
                                            max_epoch=1,
                                            rank=args.rank,
                                            group_size=args.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator()
        network = DenseNet121(args.num_classes)

        param_dict = load_checkpoint(model)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(model))

        network.add_flags_recursive(fp16=True)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        network.set_train(False)
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()
            gt_classes = gt_classes.asnumpy()

            top1_output = np.argmax(output, (-1))
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            img_tot += args.per_batch_size

        results = [[top1_correct], [top5_correct], [img_tot]]
        args.logger.info('before results={}'.format(results))
        if args.is_distributed:
            model_md5 = model.replace('/', '')
            tmp_dir = '../cache'
            if not os.path.exists(tmp_dir):
                os.mkdir(tmp_dir)
            top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                tmp_dir, args.rank, model_md5)
            np.save(top1_correct_npy, top1_correct)
            np.save(top5_correct_npy, top5_correct)
            np.save(img_tot_npy, img_tot)
            while True:
                rank_ok = True
                for other_rank in range(args.group_size):
                    top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                        tmp_dir, other_rank, model_md5)
                    if not os.path.exists(top1_correct_npy) or not os.path.exists(top5_correct_npy) \
                       or not os.path.exists(img_tot_npy):
                        rank_ok = False
                if rank_ok:
                    break

            top1_correct_all = 0
            top5_correct_all = 0
            img_tot_all = 0
            for other_rank in range(args.group_size):
                top1_correct_npy = '{}/top1_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                top5_correct_npy = '{}/top5_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                img_tot_npy = '{}/img_tot_rank_{}_{}.npy'.format(
                    tmp_dir, other_rank, model_md5)
                top1_correct_all += np.load(top1_correct_npy)
                top5_correct_all += np.load(top5_correct_npy)
                img_tot_all += np.load(img_tot_npy)
            results = [[top1_correct_all], [top5_correct_all], [img_tot_all]]
            results = np.array(results)

        else:
            results = np.array(results)

        args.logger.info('after results={}'.format(results))
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        args.logger.info(
            'after allreduce eval: top1_correct={}, tot={}, acc={:.2f}%'.
            format(top1_correct, img_tot, acc1))
        args.logger.info(
            'after allreduce eval: top5_correct={}, tot={}, acc={:.2f}%'.
            format(top5_correct, img_tot, acc5))
    if args.is_distributed:
        release()
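
get_top5_acc is not shown in this excerpt. Given that it receives the indices of the five largest logits per sample (top5_output) together with the ground-truth labels, and that its result is accumulated into top5_correct, a plausible sketch returning the number of top-5 hits in the batch is shown below; the implementation is assumed, not taken from the repository.

def get_top5_acc_sketch(top5_indices, gt_classes):
    """Hypothetical sketch: count samples whose label appears among the top-5 predictions."""
    hits = 0
    for indices, label in zip(top5_indices, gt_classes):
        if label in indices:
            hits += 1
    return hits
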
Example 12
def test():
    """The function of eval."""
    start_time = time.time()
    args = parse_args()

    # logger
    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    rank_id = int(os.environ.get('RANK_ID'))
    args.logger = get_logger(args.outputs_dir, rank_id)

    context.reset_auto_parallel_context()
    parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      gradients_mean=True,
                                      device_num=1)

    args.logger.info('Creating Network....')
    network = YOLOV3DarkNet53(is_training=False)

    config = ConfigYOLOV3DarkNet53()
    if args.testing_shape:
        config.test_img_shape = conver_testing_shape(args)

    # convert fusion network to quantization aware network
    if config.quantization_aware:
        quantizer = QuantizationAwareTraining(bn_fold=True,
                                              per_channel=[True, False],
                                              symmetric=[True, False])
        network = quantizer.quantize(network)

    args.logger.info(args.pretrained)
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.pretrained))
    else:
        args.logger.info('{} does not exist or is not a pre-trained file'.format(
            args.pretrained))
        raise FileNotFoundError(
            '{} does not exist or is not a pre-trained file'.format(args.pretrained))

    data_root = args.data_root
    ann_file = args.annFile

    ds, data_size = create_yolo_dataset(data_root,
                                        ann_file,
                                        is_training=False,
                                        batch_size=args.per_batch_size,
                                        max_epoch=1,
                                        device_num=1,
                                        rank=rank_id,
                                        shuffle=False,
                                        config=config)

    args.logger.info('testing shape : {}'.format(config.test_img_shape))
    args.logger.info('total {} images to eval'.format(data_size))

    network.set_train(False)

    # init detection engine
    detection = DetectionEngine(args)

    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
    args.logger.info('Start inference....')
    for i, data in enumerate(ds.create_dict_iterator(num_epochs=1)):
        image = data["image"]

        image_shape = data["image_shape"]
        image_id = data["img_id"]

        prediction = network(image, input_shape)
        output_big, output_me, output_small = prediction
        output_big = output_big.asnumpy()
        output_me = output_me.asnumpy()
        output_small = output_small.asnumpy()
        image_id = image_id.asnumpy()
        image_shape = image_shape.asnumpy()

        detection.detect([output_small, output_me, output_big],
                         args.per_batch_size, image_shape, image_id)
        if i % 1000 == 0:
            args.logger.info('Processing... {:.2f}% '.format(
                i * args.per_batch_size / data_size * 100))

    args.logger.info('Calculating mAP...')
    detection.do_nms_for_results()
    result_file_path = detection.write_result()
    args.logger.info('result file path: {}'.format(result_file_path))
    eval_result = detection.get_eval_result()

    cost_time = time.time() - start_time
    args.logger.info('\n=============coco eval result=========\n' +
                     eval_result)
    args.logger.info('testing cost time {:.2f}h'.format(cost_time / 3600.))
Example 13
def do_eval(dataset=None,
            network=None,
            metric=None,
            load_checkpoint_path="",
            eval_type=None,
            num_choice=None):
    """
    Do evaluation for CBT task.
    Args:
        dataset: the eval dataset.
        network:  the network with loss.
        metric: the evaluation method.
        load_checkpoint_path: the file path which saved finetuned model checkpoint.
        eval_type: evaluation type, either "zero-shot" or "finetuned".
        num_choice: the number of candidate choices per CBT question.
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Finetune model missed, evaluation task must load finetune model!")

    if metric.lower() == "accuracy":
        print("Prepare to calculate the accuracy score ...")
        gpt2_cbt = network(config=gpt2_net_cfg,
                           is_training=False,
                           use_one_hot_embeddings=False)
        gpt2_cbt.set_train(False)
        param_dict = load_checkpoint(load_checkpoint_path)

        if eval_type == "zero-shot":
            final_param_dict = {}
            for name, _ in param_dict.items():
                final_param_dict['gpt2.gpt2.' + name] = param_dict[name]
            final_param_dict['gpt2.lm_head.weight'] = param_dict[
                'gpt2_embedding_lookup.embedding_table']
            load_param_into_net(gpt2_cbt, final_param_dict)
            print("load pretrained parameter successfully!\n")
        elif eval_type == "finetuned":
            load_param_into_net(gpt2_cbt, param_dict)
            print("load finetuned parameter successfully!\n")
        else:
            raise ValueError(
                "Evaluation type missed, eval_type should be [zero-shot, finetuned]"
            )

        model = Model(gpt2_cbt)
        callback = Accuracy()
        columns_list = ["input_ids", "input_mask", "input_length", "mc_labels"]
        print("==================== [ACC] Testing ====================")
        num_data = 1
        all_choice_prob = []

        for data in dataset.create_dict_iterator():
            input_data = []
            for i in columns_list:
                input_data.append(data[i])
            input_ids, input_mask, input_length, mc_labels = input_data
            print("| [ACC] number : {} / {} ".format(
                num_data, dataset.get_dataset_size()))
            # print("mc_labels: {}".format(mc_labels))  # [batch_size]

            logits = model.predict(input_ids, input_mask)
            # choice_prob_list [batch_size]
            choice_prob_list = calculate_choice_prob_for_cbt(
                logits=logits,
                batch_size=gpt2_net_cfg.batch_size,
                input_length=input_length,
                input_ids=input_ids)
            all_choice_prob.append(choice_prob_list)
            if (num_data * gpt2_net_cfg.batch_size) % num_choice == 0:
                all_choice_prob_np = np.array(all_choice_prob)
                all_choice_prob_np = all_choice_prob_np.reshape(
                    (-1, num_choice))
                print("| all_choice_prob_np: ", all_choice_prob_np)
                print("| all_choice_prob_np shape: ", all_choice_prob_np.shape)
                mc_labels = np.array([mc_labels.asnumpy()[0]])
                callback.update(all_choice_prob_np, mc_labels)
                all_choice_prob = []
            num_data += 1

        print("\n\n")
        print("**************************************************************")
        print("acc_num {} , total_num {}, accuracy {:.6f}".format(
            callback.acc_num, callback.total_num,
            callback.acc_num / callback.total_num))
        print("********************** Testing Finished **********************")
    else:
        raise ValueError("metric method not supported, support: [Accuracy]")
Example 14
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)

    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, do not set a fixed loss scale in the Momentum optimizer

    # choose whether only the master rank or every rank saves checkpoints (compatible with model parallel)
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet121(args.num_classes)
    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr_scheduler = MultiStepLR(args.lr,
                                   args.lr_epochs,
                                   args.lr_gamma,
                                   args.steps_per_epoch,
                                   args.max_epoch,
                                   warmup_epochs=args.warmup_epochs)
    elif args.lr_scheduler == 'cosine_annealing':
        lr_scheduler = CosineAnnealingLR(args.lr,
                                         args.T_max,
                                         args.steps_per_epoch,
                                         args.max_epoch,
                                         warmup_epochs=args.warmup_epochs,
                                         eta_min=args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=args.group_size,
                                      gradients_mean=True)

    if args.device_target == 'Ascend':
        model = Model(train_net,
                      optimizer=opt,
                      metrics=None,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net,
                      optimizer=opt,
                      metrics=None,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [
        progress_cb,
    ]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)
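
For reference, the MultiStepLR helper used above is expected to return one learning-rate value per training step (linear warmup followed by step decay). A minimal numpy sketch under that assumption, not the model-zoo implementation:

import numpy as np

def multi_step_lr_sketch(base_lr, milestones, gamma, steps_per_epoch, max_epoch, warmup_epochs=0):
    """Per-step lr array: linear warmup, then multiply by `gamma` at every milestone epoch."""
    warmup_steps = steps_per_epoch * warmup_epochs
    lr_each_step = []
    for step in range(steps_per_epoch * max_epoch):
        if warmup_steps and step < warmup_steps:
            lr = base_lr * (step + 1) / warmup_steps
        else:
            epoch = step // steps_per_epoch
            lr = base_lr * gamma ** sum(epoch >= m for m in milestones)
        lr_each_step.append(lr)
    return np.array(lr_each_step, dtype=np.float32)

# e.g. lr = Tensor(multi_step_lr_sketch(0.1, [30, 60, 90], 0.1, args.steps_per_epoch, args.max_epoch))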
Example n. 15
0
def main():
    args_opt = get_args()
    rank = 0
    device_num = 1
    if args_opt.run_platform == "CPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    else:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=args_opt.run_platform,
                            device_id=args_opt.device_id)
        if args_opt.distribute:
            device_num = args_opt.device_num
            context.reset_auto_parallel_context()
            context.set_auto_parallel_context(
                parallel_mode=ParallelMode.DATA_PARALLEL,
                gradients_mean=True,
                device_num=device_num)
            init()
            if config.model == "ssd_resnet50_fpn":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[90, 183, 279])
            if config.model == "ssd_vgg16":
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[20, 41, 62])
            else:
                context.set_auto_parallel_context(
                    all_reduce_fusion_config=[29, 58, 89])
            rank = get_rank()

    mindrecord_file = create_mindrecord(args_opt.dataset, "ssd.mindrecord",
                                        True)

    if args_opt.only_create_dataset:
        return

    loss_scale = float(args_opt.loss_scale)
    if args_opt.run_platform == "CPU":
        loss_scale = 1.0

    # When creating MindDataset, use the first mindrecord file, such as ssd.mindrecord0.
    use_multiprocessing = (args_opt.run_platform != "CPU")
    dataset = create_ssd_dataset(mindrecord_file,
                                 repeat_num=1,
                                 batch_size=args_opt.batch_size,
                                 device_num=device_num,
                                 rank=rank,
                                 use_multiprocessing=use_multiprocessing)

    dataset_size = dataset.get_dataset_size()
    print(f"Create dataset done! dataset size is {dataset_size}")
    ssd = ssd_model_build(args_opt)
    if ("use_float16" in config
            and config.use_float16) or args_opt.run_platform == "GPU":
        ssd.to_float(dtype.float16)
    net = SSDWithLossCell(ssd, config)

    # checkpoint
    ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                   args_opt.save_checkpoint_epochs)
    save_ckpt_path = './ckpt_' + str(rank) + '/'
    ckpoint_cb = ModelCheckpoint(prefix="ssd",
                                 directory=save_ckpt_path,
                                 config=ckpt_config)

    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        if args_opt.filter_weight:
            filter_checkpoint_parameter_by_list(param_dict,
                                                config.checkpoint_filter_list)
        load_param_into_net(net, param_dict, True)

    lr = Tensor(
        get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
               lr_init=config.lr_init,
               lr_end=config.lr_end_rate * args_opt.lr,
               lr_max=args_opt.lr,
               warmup_epochs=config.warmup_epochs,
               total_epochs=args_opt.epoch_size,
               steps_per_epoch=dataset_size))

    if "use_global_norm" in config and config.use_global_norm:
        opt = nn.Momentum(
            filter(lambda x: x.requires_grad, net.get_parameters()), lr,
            config.momentum, config.weight_decay, 1.0)
        net = TrainingWrapper(net, opt, loss_scale, True)
    else:
        opt = nn.Momentum(
            filter(lambda x: x.requires_grad, net.get_parameters()), lr,
            config.momentum, config.weight_decay, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

    callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
    if args_opt.run_eval:
        eval_net = SsdInferWithDecoder(ssd, Tensor(default_boxes), config)
        eval_net.set_train(False)
        mindrecord_file = create_mindrecord(args_opt.dataset,
                                            "ssd_eval.mindrecord", False)
        eval_dataset = create_ssd_dataset(mindrecord_file,
                                          batch_size=args_opt.batch_size,
                                          repeat_num=1,
                                          is_training=False,
                                          use_multiprocessing=False)
        if args_opt.dataset == "coco":
            anno_json = os.path.join(
                config.coco_root,
                config.instances_set.format(config.val_data_type))
        elif args_opt.dataset == "voc":
            anno_json = os.path.join(config.voc_root, config.voc_json)
        else:
            raise ValueError(
                'SSD eval only supports the coco and voc datasets!')
        eval_param_dict = {
            "net": eval_net,
            "dataset": eval_dataset,
            "anno_json": anno_json
        }
        eval_cb = EvalCallBack(apply_eval,
                               eval_param_dict,
                               interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch,
                               save_best_ckpt=True,
                               ckpt_directory=save_ckpt_path,
                               besk_ckpt_name="best_map.ckpt",
                               metrics_name="mAP")
        callback.append(eval_cb)
    model = Model(net)
    dataset_sink_mode = False
    if args_opt.mode == "sink" and args_opt.run_platform != "CPU":
        print("In sink mode, one epoch return a loss.")
        dataset_sink_mode = True
    print(
        "Start train SSD, the first epoch will be slower because of the graph compilation."
    )
    model.train(args_opt.epoch_size,
                dataset,
                callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
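
The SSD script above calls `filter_checkpoint_parameter_by_list` before loading a pre-trained checkpoint; a helper of that kind typically drops parameters whose names match a filter list (e.g. the class-dependent heads). A hedged sketch, assuming `filter_list` holds name substrings, not the repository's exact code:

def filter_checkpoint_parameter_by_list_sketch(param_dict, filter_list):
    """Remove checkpoint entries whose names contain any of the given substrings (in place)."""
    for key in list(param_dict.keys()):
        if any(name in key for name in filter_list):
            print("Filtered parameter from checkpoint:", key)
            del param_dict[key]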
Example n. 16
0
def test(cloud_args=None):
    """test"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.device_target,
                        save_graphs=False)
    if os.getenv('DEVICE_ID',
                 "not_set").isdigit() and args.device_target == "Ascend":
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)

    if args.dataset == "cifar10":
        net = vgg16(num_classes=args.num_classes, args=args)
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                       0.01,
                       args.momentum,
                       weight_decay=args.weight_decay)
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

        param_dict = load_checkpoint(args.pre_trained)
        load_param_into_net(net, param_dict)
        net.set_train(False)
        dataset = vgg_create_dataset(args.data_path,
                                     args.image_size,
                                     args.per_batch_size,
                                     training=False)
        res = model.eval(dataset)
        print("result: ", res)
    else:
        # network
        args.logger.important_info('start create network')
        if os.path.isdir(args.pre_trained):
            models = list(glob.glob(os.path.join(args.pre_trained, '*.ckpt')))
            print(models)
            if args.graph_ckpt:
                f = lambda x: -1 * int(os.path.splitext(
                    os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
            else:
                f = lambda x: -1 * int(os.path.splitext(
                    os.path.split(x)[-1])[0].split('_')[-1])
            args.models = sorted(models, key=f)
        else:
            args.models = [
                args.pre_trained,
            ]

        for model in args.models:
            dataset = classification_dataset(args.data_path,
                                             args.image_size,
                                             args.per_batch_size,
                                             mode='eval')
            eval_dataloader = dataset.create_tuple_iterator(output_numpy=True,
                                                            num_epochs=1)
            network = vgg16(args.num_classes, args, phase="test")

            # pre_trained
            load_param_into_net(network, load_checkpoint(model))
            network.add_flags_recursive(fp16=True)

            img_tot = 0
            top1_correct = 0
            top5_correct = 0

            network.set_train(False)
            t_end = time.time()
            it = 0
            for data, gt_classes in eval_dataloader:
                output = network(Tensor(data, mstype.float32))
                output = output.asnumpy()

                top1_output = np.argmax(output, axis=-1)
                top5_output = np.argsort(output)[:, -5:]

                t1_correct = np.equal(top1_output, gt_classes).sum()
                top1_correct += t1_correct
                top5_correct += get_top5_acc(top5_output, gt_classes)
                img_tot += args.per_batch_size

                if args.rank == 0 and it == 0:
                    t_end = time.time()
                    it = 1
            if args.rank == 0:
                time_used = time.time() - t_end
                fps = (img_tot -
                       args.per_batch_size) * args.group_size / time_used
                args.logger.info(
                    'Inference Performance: {:.2f} img/sec'.format(fps))
            results = [[top1_correct], [top5_correct], [img_tot]]
            args.logger.info('before results={}'.format(results))
            results = np.array(results)

            args.logger.info('after results={}'.format(results))
            top1_correct = results[0, 0]
            top5_correct = results[1, 0]
            img_tot = results[2, 0]
            acc1 = 100.0 * top1_correct / img_tot
            acc5 = 100.0 * top5_correct / img_tot
            args.logger.info('after allreduce eval: top1_correct={}, tot={}, '
                             'acc={:.2f}%(TOP1)'.format(
                                 top1_correct, img_tot, acc1))
            args.logger.info('after allreduce eval: top5_correct={}, tot={}, '
                             'acc={:.2f}%(TOP5)'.format(
                                 top5_correct, img_tot, acc5))
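
`get_top5_acc` above is assumed, from its call site, to count how many ground-truth labels fall inside the per-sample top-5 predictions; a minimal numpy sketch:

import numpy as np

def get_top5_acc_sketch(top5_pred, gt_classes):
    """Number of samples whose true label appears among the five highest-scoring classes."""
    gt_classes = np.asarray(gt_classes).reshape(-1)
    return int(sum(gt in top5 for top5, gt in zip(top5_pred, gt_classes)))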
Example n. 17
0
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False,
                        enable_auto_mixed_precision=False)

    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            parameter_broadcast=True,
            device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset = create_transformer_dataset(
        epoch_count=1,
        rank_size=device_num,
        rank_id=rank_id,
        do_shuffle=args.do_shuffle,
        enable_data_sink=args.enable_data_sink,
        dataset_path=args.data_path)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(
        create_dynamic_lr(
            schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
            training_steps=dataset.get_dataset_size() * args.epoch_size,
            learning_rate=cfg.lr_schedule.learning_rate,
            warmup_steps=cfg.lr_schedule.warmup_steps,
            hidden_size=transformer_net_cfg.hidden_size,
            start_decay_step=cfg.lr_schedule.start_decay_step,
            min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(
                save_checkpoint_steps=args.save_checkpoint_steps,
                keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer',
                                         directory=args.save_checkpoint_path,
                                         config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(
            init_loss_scale=cfg.init_loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss,
                                                   optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)

    enable_sink = (args.enable_data_sink == "true")
    if enable_sink:
        sink_size = args.save_checkpoint_steps
        model.train(args.epoch_size * dataset.get_dataset_size() // sink_size,
                    dataset,
                    callbacks=callbacks,
                    dataset_sink_mode=enable_sink,
                    sink_size=sink_size)
    else:
        model.train(args.epoch_size,
                    dataset,
                    callbacks=callbacks,
                    dataset_sink_mode=enable_sink)
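
The schedule string "constant*rsqrt_hidden*linear_warmup*rsqrt_decay" passed to `create_dynamic_lr` names the usual Transformer learning-rate rule. As a hedged illustration only (not necessarily the exact helper), the per-step values can be generated like this:

import numpy as np

def transformer_lr_sketch(training_steps, learning_rate, warmup_steps, hidden_size, min_lr=0.0):
    """lr(step) = base * hidden^-0.5 * min(step/warmup, 1) * max(step, warmup)^-0.5 (one common form)."""
    lrs = []
    for step in range(1, training_steps + 1):
        lr = (learning_rate
              * hidden_size ** -0.5
              * min(step / warmup_steps, 1.0)        # linear_warmup
              * max(step, warmup_steps) ** -0.5)     # rsqrt_decay
        lrs.append(max(lr, min_lr))
    return np.array(lrs, dtype=np.float32)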
Example n. 18
0
                        help='checkpoint of inceptionV4')
    args_opt = parser.parse_args()
    return args_opt


if __name__ == '__main__':
    args = parse_args()

    if args.platform == 'Ascend':
        device_id = int(os.getenv('DEVICE_ID', '0'))  # fall back to device 0 when DEVICE_ID is unset
        context.set_context(device_id=device_id)

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    net = Inceptionv4(classes=config.num_classes)
    ckpt = load_checkpoint(args.checkpoint_path)
    load_param_into_net(net, ckpt)
    net.set_train(False)
    dataset = create_dataset(dataset_path=args.dataset_path,
                             do_train=False,
                             repeat_num=1,
                             batch_size=config.batch_size)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    eval_metrics = {
        'Loss': nn.Loss(),
        'Top1-Acc': nn.Top1CategoricalAccuracy(),
        'Top5-Acc': nn.Top5CategoricalAccuracy()
    }
    model = Model(net, loss, optimizer=None, metrics=eval_metrics)
    print('=' * 20, 'Evaluate start', '=' * 20)
    metrics = model.eval(dataset)
    print("metric: ", metrics)
Example n. 19
0
def test_train():
    '''
    finetune function
    '''
    devid = int(os.getenv('DEVICE_ID', '0'))  # fall back to device 0 when DEVICE_ID is unset
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=devid)
    #BertCLSTrain for classification
    #BertNERTrain for sequence labeling
    if cfg.task == 'NER':
        if cfg.use_crf:
            netwithloss = BertNER(bert_net_cfg,
                                  True,
                                  num_labels=len(tag_to_index),
                                  use_crf=True,
                                  tag_to_index=tag_to_index,
                                  dropout_prob=0.1)
        else:
            netwithloss = BertNER(bert_net_cfg,
                                  True,
                                  num_labels=cfg.num_labels,
                                  dropout_prob=0.1)
    elif cfg.task == 'SQUAD':
        netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)
    else:
        netwithloss = BertCLS(bert_net_cfg,
                              True,
                              num_labels=cfg.num_labels,
                              dropout_prob=0.1)
    if cfg.task == 'SQUAD':
        dataset = get_squad_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    else:
        dataset = get_dataset(bert_net_cfg.batch_size, cfg.epoch_num)
    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    if cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=steps_per_epoch * cfg.epoch_num,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            warmup_steps=int(steps_per_epoch * cfg.epoch_num * 0.1),
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps)
    elif cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=steps_per_epoch * cfg.epoch_num,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         weight_decay=cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * cfg.epoch_num *
                                          0.1),
                         decay_filter=cfg.Lamb.decay_filter)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported.")
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix,
                                 directory=cfg.ckpt_dir,
                                 config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    load_param_into_net(netwithloss, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    if cfg.task == 'SQUAD':
        netwithgrads = BertSquadCell(netwithloss,
                                     optimizer=optimizer,
                                     scale_update_cell=update_cell)
    else:
        netwithgrads = BertFinetuneCell(netwithloss,
                                        optimizer=optimizer,
                                        scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset, callbacks=[LossCallBack(), ckpoint_cb])
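
Several scripts above pass a `LossCallBack()` to `model.train` without showing its definition; a minimal sketch of such a callback, assuming the standard `mindspore.train.callback.Callback` interface:

from mindspore.train.callback import Callback

class LossCallBackSketch(Callback):
    """Print epoch, step and loss at the end of every training step (illustration only)."""
    def step_end(self, run_context):
        cb_params = run_context.original_args()
        print("epoch: {}, step: {}, loss: {}".format(
            cb_params.cur_epoch_num, cb_params.cur_step_num, cb_params.net_outputs))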
Example n. 20
0
def test_salt_and_pepper_attack_on_mnist():
    """
    Salt-and-Pepper-Attack test
    """
    # upload trained network
    ckpt_name = './trained_ckpt_file/checkpoint_lenet-10_1875.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_name)
    load_param_into_net(net, load_dict)

    # get test data
    data_list = "./MNIST_unzip/test"
    batch_size = 32
    ds = generate_mnist_dataset(data_list, batch_size=batch_size)

    # prediction accuracy before attack
    model = ModelToBeAttacked(net)
    batch_num = 3  # the number of batches of attacking samples
    test_images = []
    test_labels = []
    predict_labels = []
    i = 0
    for data in ds.create_tuple_iterator():
        i += 1
        images = data[0].astype(np.float32)
        labels = data[1]
        test_images.append(images)
        test_labels.append(labels)
        pred_labels = np.argmax(model.predict(images), axis=1)
        predict_labels.append(pred_labels)
        if i >= batch_num:
            break
    LOGGER.debug(
        TAG,
        'model input image shape is: {}'.format(np.array(test_images).shape))
    predict_labels = np.concatenate(predict_labels)
    true_labels = np.concatenate(test_labels)
    accuracy = np.mean(np.equal(predict_labels, true_labels))
    LOGGER.info(TAG, "prediction accuracy before attacking is : %g", accuracy)

    # attacking
    is_target = False
    attack = SaltAndPepperNoiseAttack(model=model,
                                      is_targeted=is_target,
                                      sparse=True)
    if is_target:
        targeted_labels = np.random.randint(0, 10, size=len(true_labels))
        for i in range(len(true_labels)):
            if targeted_labels[i] == true_labels[i]:
                targeted_labels[i] = (targeted_labels[i] + 1) % 10
    else:
        targeted_labels = true_labels
    LOGGER.debug(
        TAG, 'input shape is: {}'.format(np.concatenate(test_images).shape))
    success_list, adv_data, query_list = attack.generate(
        np.concatenate(test_images), targeted_labels)
    success_list = np.arange(success_list.shape[0])[success_list]
    LOGGER.info(TAG, 'success_list: %s', success_list)
    LOGGER.info(TAG, 'average of query times is : %s', np.mean(query_list))
    adv_preds = []
    for ite_data in adv_data:
        pred_logits_adv = model.predict(ite_data)
        # rescale predict confidences into (0, 1).
        pred_logits_adv = softmax(pred_logits_adv, axis=1)
        adv_preds.extend(pred_logits_adv)
    accuracy_adv = np.mean(np.equal(np.argmax(adv_preds, axis=1), true_labels))
    LOGGER.info(TAG, "prediction accuracy after attacking is : %g",
                accuracy_adv)
    test_labels_onehot = np.eye(10)[true_labels]
    attack_evaluate = AttackEvaluate(np.concatenate(test_images),
                                     test_labels_onehot,
                                     adv_data,
                                     adv_preds,
                                     targeted=is_target,
                                     target_label=targeted_labels)
    LOGGER.info(TAG, 'mis-classification rate of adversaries is : %s',
                attack_evaluate.mis_classification_rate())
    LOGGER.info(TAG, 'The average confidence of adversarial class is : %s',
                attack_evaluate.avg_conf_adv_class())
    LOGGER.info(TAG, 'The average confidence of true class is : %s',
                attack_evaluate.avg_conf_true_class())
    LOGGER.info(
        TAG, 'The average distance (l0, l2, linf) between original '
        'samples and adversarial samples are: %s',
        attack_evaluate.avg_lp_distance())
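
`softmax` in the attack example is presumably `scipy.special.softmax` or an equivalent helper; for reference, a tiny numerically stable numpy version (an assumption, not the example's import):

import numpy as np

def softmax_sketch(logits, axis=1):
    """Numerically stable softmax along the given axis."""
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)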
Example n. 21
0
        audio_conf=config.DataConfig.SpectConfig,
        bidirectional=True,
        device_target=args.device_target)

    loss_net = NetWithLossClass(deepspeech_net)
    weights = ParameterTuple(deepspeech_net.trainable_params())

    optimizer = Adam(weights,
                     learning_rate=config.OptimConfig.learning_rate,
                     eps=config.OptimConfig.epsilon,
                     loss_scale=config.OptimConfig.loss_scale)
    train_net = TrainOneStepCell(loss_net, optimizer)
    train_net.set_train(True)
    if args.pre_trained_model_path != '':
        param_dict = load_checkpoint(args.pre_trained_model_path)
        load_param_into_net(train_net, param_dict)
        print('Successfully loaded the pre-trained model')

    model = Model(train_net)
    callback_list = [TimeMonitor(steps_size), LossMonitor()]

    if args.is_distributed:
        config.CheckpointConfig.ckpt_file_name_prefix += str(get_rank())
        config.CheckpointConfig.ckpt_path = os.path.join(
            config.CheckpointConfig.ckpt_path, 'ckpt_' + str(get_rank()) + '/')
    config_ck = CheckpointConfig(
        save_checkpoint_steps=1,
        keep_checkpoint_max=config.CheckpointConfig.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(
        prefix=config.CheckpointConfig.ckpt_file_name_prefix,
Example n. 22
0
def _build_training_pipeline(pre_dataset):
    """
    Build training pipeline

    Args:
        pre_dataset: preprocessed dataset
    """
    net_with_loss = FastTextNetWithLoss(config.vocab_size,
                                        config.embedding_dims,
                                        config.num_class)
    net_with_loss.init_parameters_data()
    if config.pretrain_ckpt_dir:
        parameter_dict = load_checkpoint(config.pretrain_ckpt_dir)
        load_param_into_net(net_with_loss, parameter_dict)
    if pre_dataset is None:
        raise ValueError("pre-process dataset must be provided")

    #get learning rate
    update_steps = config.epoch * pre_dataset.get_dataset_size()
    decay_steps = pre_dataset.get_dataset_size()
    rank_size = os.getenv("RANK_SIZE")
    if rank_size is not None and not rank_size.isdigit():
        raise ValueError("RANK_SIZE must be an integer")
    if rank_size is not None and int(rank_size) > 1:
        base_lr = config.lr
    else:
        base_lr = config.lr / 10
    print("+++++++++++Total update steps ", update_steps)
    lr = Tensor(polynomial_decay_scheduler(
        lr=base_lr,
        min_lr=config.min_lr,
        decay_steps=decay_steps,
        total_update_num=update_steps,
        warmup_steps=config.warmup_steps,
        power=config.poly_lr_scheduler_power),
                dtype=mstype.float32)
    optimizer = Adam(net_with_loss.trainable_params(),
                     lr,
                     beta1=0.9,
                     beta2=0.999)

    net_with_grads = FastTextTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(rank_ids=rank_id)
    dataset_size = pre_dataset.get_dataset_size()
    time_monitor = TimeMonitor(data_size=dataset_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=decay_steps *
                                   config.epoch,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    callbacks = [time_monitor, loss_monitor]
    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(
            prefix='fasttext',
            directory=os.path.join('./',
                                   'ckpt_{}'.format(os.getenv("DEVICE_ID"))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
    if rank_size is not None and int(
            rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(
            prefix='fasttext',
            directory=os.path.join('./',
                                   'ckpt_{}'.format(os.getenv("DEVICE_ID"))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
    print("Prepare to Training....")
    epoch_size = pre_dataset.get_repeat_count()
    print("Epoch size ", epoch_size)
    if os.getenv("RANK_SIZE") is not None and int(os.getenv("RANK_SIZE")) > 1:
        print(f" | Rank {MultiAscend.get_rank()} Call model train.")
    model.train(epoch=config.epoch,
                train_dataset=pre_dataset,
                callbacks=callbacks,
                dataset_sink_mode=False)
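
`polynomial_decay_scheduler` above is expected to return a per-step array that warms up linearly and then decays polynomially towards `min_lr`; a hedged numpy sketch whose argument names mirror the call site (the body itself is an assumption):

import numpy as np

def polynomial_decay_scheduler_sketch(lr, min_lr, decay_steps, total_update_num,
                                      warmup_steps=1000, power=1.0):
    """Linear warmup followed by polynomial decay; `decay_steps` is unused in this simplified form."""
    lrs = []
    for step in range(total_update_num):
        if warmup_steps and step < warmup_steps:
            cur = lr * (step + 1) / warmup_steps
        else:
            ratio = min(step, total_update_num) / total_update_num
            cur = (lr - min_lr) * (1.0 - ratio) ** power + min_lr
        lrs.append(cur)
    return np.array(lrs, dtype=np.float32)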
Example n. 23
0
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt",
                        type=str,
                        default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale",
                        type=str,
                        default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--train_steps",
                        type=int,
                        default=-1,
                        help="Training Steps, default is -1, "
                        "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num",
                        type=int,
                        default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)

    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
                rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 for now; running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Optimizer {} is not supported; only [Lamb, Momentum, AdamWeightDecayDynamicLR] are supported"
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
Example n. 24
0
    network = quant.convert_quant_network(network,
                                          bn_fold=True,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    # define network loss
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    # define dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=False,
                             config=config_device_target,
                             device_target=args_opt.device_target,
                             batch_size=config_device_target.batch_size)
    step_size = dataset.get_dataset_size()

    # load checkpoint
    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        not_load_param = load_param_into_net(network, param_dict)
        if not_load_param:
            raise ValueError("Load param into net fail!")
    network.set_train(False)

    # define model
    model = Model(network, loss_fn=loss, metrics={'acc'})

    print("============== Starting Validation ==============")
    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)
    print("============== End Validation ==============")
Example n. 25
0
def val(args):
    '''eval'''
    print('=============yolov3 start evaluating==================')

    # logger
    args.batch_size = config.batch_size
    args.input_shape = config.input_shape
    args.result_path = config.result_path
    args.conf_thresh = config.conf_thresh
    args.nms_thresh = config.nms_thresh

    context.set_auto_parallel_context(parallel_mode=ParallelMode.STAND_ALONE,
                                      device_num=args.world_size,
                                      gradients_mean=True)
    mindrecord_path = args.mindrecord_path
    print('Loading data from {}'.format(mindrecord_path))

    num_classes = config.num_classes
    if num_classes > 1:
        raise NotImplementedError(
            'num_classes > 1: Yolov3 postprocess not implemented!')

    anchors = config.anchors
    anchors_mask = config.anchors_mask
    num_anchors_list = [len(x) for x in anchors_mask]

    reduction_0 = 64.0
    reduction_1 = 32.0
    reduction_2 = 16.0
    labels = ['face']
    classes = {0: 'face'}

    # dataloader
    ds = de.MindDataset(
        mindrecord_path + "0",
        columns_list=["image", "annotation", "image_name", "image_size"])

    single_scale_trans = SingleScaleTrans(resize=args.input_shape)

    ds = ds.batch(
        args.batch_size,
        per_batch_map=single_scale_trans,
        input_columns=["image", "annotation", "image_name", "image_size"],
        num_parallel_workers=8)

    args.steps_per_epoch = ds.get_dataset_size()

    # backbone
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        print('load model {} success'.format(args.pretrained))
    else:
        print('load model {} failed, please check the model path; evaluation aborted'
              .format(args.pretrained))
        exit(1)

    ds = ds.repeat(1)

    det = {}
    img_size = {}
    img_anno = {}

    model_name = args.pretrained.split('/')[-1].replace('.ckpt', '')
    result_path = os.path.join(args.result_path, model_name)
    if not os.path.isdir(result_path):
        os.makedirs(result_path, exist_ok=True)

    # result file
    ret_files_set = {
        'face': os.path.join(result_path, 'comp4_det_test_face_rm5050.txt'),
    }

    test_net = BuildTestNetwork(network, reduction_0, reduction_1, reduction_2,
                                anchors, anchors_mask, num_classes, args)

    print('conf_thresh:', args.conf_thresh)

    eval_times = 0

    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        batch_image_name = data[2]
        batch_image_size = data[3]
        eval_times += 1

        img_tensor = Tensor(batch_images, mstype.float32)

        dets = []
        tdets = []

        coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2, cls_scores_2 = test_net(
            img_tensor)

        boxes_0, boxes_1, boxes_2 = get_bounding_boxes(
            coords_0, cls_scores_0, coords_1, cls_scores_1, coords_2,
            cls_scores_2, args.conf_thresh, args.input_shape, num_classes)

        converted_boxes_0, converted_boxes_1, converted_boxes_2 = tensor_to_brambox(
            boxes_0, boxes_1, boxes_2, args.input_shape, labels)

        tdets.append(converted_boxes_0)
        tdets.append(converted_boxes_1)
        tdets.append(converted_boxes_2)

        batch = len(tdets[0])
        for b in range(batch):
            single_dets = []
            for op in range(3):
                single_dets.extend(tdets[op][b])
            dets.append(single_dets)

        det.update({
            batch_image_name[k].decode('UTF-8'): v
            for k, v in enumerate(dets)
        })
        img_size.update({
            batch_image_name[k].decode('UTF-8'): v
            for k, v in enumerate(batch_image_size)
        })
        img_anno.update({
            batch_image_name[k].decode('UTF-8'): v
            for k, v in enumerate(batch_labels)
        })

    print('eval times:', eval_times)
    print('batch size: ', args.batch_size)

    netw, neth = args.input_shape
    reorg_dets = voc_wrapper.reorg_detection(det, netw, neth, img_size)
    voc_wrapper.gen_results(reorg_dets, result_path, img_size, args.nms_thresh)

    # compute mAP
    ground_truth = parse_gt_from_anno(img_anno, classes)

    ret_list = parse_rets(ret_files_set)
    iou_thr = 0.5
    evaluate = calc_recall_precision_ap(ground_truth, ret_list, iou_thr)

    aps_str = ''
    for cls in evaluate:
        per_line, = plt.plot(evaluate[cls]['recall'],
                             evaluate[cls]['presicion'], 'b-')
        per_line.set_label('%s:AP=%.3f' % (cls, evaluate[cls]['ap']))
        aps_str += '_%s_AP_%.3f' % (cls, evaluate[cls]['ap'])
        plt.plot([i / 1000.0 for i in range(1, 1001)],
                 [i / 1000.0 for i in range(1, 1001)], 'y--')
        plt.axis([0, 1.2, 0, 1.2])
        plt.xlabel('recall')
        plt.ylabel('precision')
        plt.grid()

        plt.legend()
        plt.title('PR')

    # save mAP
    ap_save_path = os.path.join(
        result_path,
        result_path.replace('/', '_') + aps_str + '.png')
    print('Saving {}'.format(ap_save_path))
    plt.savefig(ap_save_path)

    print('=============yolov3 evaluating finished==================')
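
`calc_recall_precision_ap` matches detections to ground truth at an IoU threshold of 0.5; for reference, a minimal IoU between two axis-aligned boxes in [x1, y1, x2, y2] form (an illustration, not the repository's helper):

def box_iou_sketch(box_a, box_b):
    """Intersection-over-union of two boxes given as [x1, y1, x2, y2]."""
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-10)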
Example n. 26
0
def train():
    args = parse_args()
    cfg = FCN8s_VOC2012_cfg
    device_num = int(os.environ.get("DEVICE_NUM", 1))
    # init multicards training
    if device_num > 1:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=device_num)
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True, save_graphs=False,
                        device_target="Ascend", device_id=args.device_id)

    # dataset
    dataset = data_generator.SegDataset(image_mean=cfg.image_mean,
                                        image_std=cfg.image_std,
                                        data_file=cfg.data_file,
                                        batch_size=cfg.batch_size,
                                        crop_size=cfg.crop_size,
                                        max_scale=cfg.max_scale,
                                        min_scale=cfg.min_scale,
                                        ignore_label=cfg.ignore_label,
                                        num_classes=cfg.num_classes,
                                        num_readers=2,
                                        num_parallel_calls=4,
                                        shard_id=args.rank,
                                        shard_num=args.group_size)
    dataset = dataset.get_dataset(repeat=1)

    net = FCN8s(n_class=cfg.num_classes)
    loss_ = loss.SoftmaxCrossEntropyLoss(cfg.num_classes, cfg.ignore_label)

    # load pretrained vgg16 parameters to init FCN8s
    if cfg.ckpt_vgg16:
        param_vgg = load_checkpoint(cfg.ckpt_vgg16)
        param_dict = {}
        for layer_id in range(1, 6):
            sub_layer_num = 2 if layer_id < 3 else 3
            for sub_layer_id in range(sub_layer_num):
                # conv param
                y_weight = 'conv{}.{}.weight'.format(layer_id, 3 * sub_layer_id)
                x_weight = 'vgg16_feature_extractor.conv{}_{}.0.weight'.format(layer_id, sub_layer_id + 1)
                param_dict[y_weight] = param_vgg[x_weight]
                # BatchNorm param
                y_gamma = 'conv{}.{}.gamma'.format(layer_id, 3 * sub_layer_id + 1)
                y_beta = 'conv{}.{}.beta'.format(layer_id, 3 * sub_layer_id + 1)
                x_gamma = 'vgg16_feature_extractor.conv{}_{}.1.gamma'.format(layer_id, sub_layer_id + 1)
                x_beta = 'vgg16_feature_extractor.conv{}_{}.1.beta'.format(layer_id, sub_layer_id + 1)
                param_dict[y_gamma] = param_vgg[x_gamma]
                param_dict[y_beta] = param_vgg[x_beta]
        load_param_into_net(net, param_dict)
    # load pretrained FCN8s
    elif cfg.ckpt_pre_trained:
        param_dict = load_checkpoint(cfg.ckpt_pre_trained)
        load_param_into_net(net, param_dict)

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()

    lr_scheduler = CosineAnnealingLR(cfg.base_lr,
                                     cfg.train_epochs,
                                     iters_per_epoch,
                                     cfg.train_epochs,
                                     warmup_epochs=0,
                                     eta_min=0)
    lr = Tensor(lr_scheduler.get_lr())

    # loss scale
    manager_loss_scale = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False)

    optimizer = nn.Momentum(params=net.trainable_params(), learning_rate=lr, momentum=0.9, weight_decay=0.0001,
                            loss_scale=cfg.loss_scale)

    model = Model(net, loss_fn=loss_, loss_scale_manager=manager_loss_scale, optimizer=optimizer, amp_level="O3")

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]

    if args.rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_steps,
                                     keep_checkpoint_max=cfg.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=cfg.model, directory=cfg.train_dir, config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(cfg.train_epochs, dataset, callbacks=cbs)
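
`CosineAnnealingLR` here (and in the DenseNet example earlier) is expected to return one learning-rate value per step following cosine annealing, optionally after a linear warmup. A hedged numpy sketch whose argument order mirrors the call sites (the body is an assumption):

import math
import numpy as np

def cosine_annealing_lr_sketch(base_lr, t_max, steps_per_epoch, max_epoch,
                               warmup_epochs=0, eta_min=0.0):
    """Per-step lr: linear warmup, then eta_min + (base_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2."""
    warmup_steps = steps_per_epoch * warmup_epochs
    lrs = []
    for step in range(steps_per_epoch * max_epoch):
        if warmup_steps and step < warmup_steps:
            lrs.append(base_lr * (step + 1) / warmup_steps)
        else:
            epoch = step // steps_per_epoch
            cos_out = (1 + math.cos(math.pi * (epoch % t_max) / t_max)) / 2
            lrs.append(eta_min + (base_lr - eta_min) * cos_out)
    return np.array(lrs, dtype=np.float32)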
Example n. 27
0
    args.logger.info('Creating Network....')
    network = YOLOV4CspDarkNet53(is_training=False)

    args.logger.info(args.pretrained)
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.pretrained))
    else:
        args.logger.info('{} does not exist or is not a pre-trained checkpoint file'.format(
            args.pretrained))
        raise FileNotFoundError(
            '{} does not exist or is not a pre-trained checkpoint file'.format(args.pretrained))

    data_root = args.data_root
    ann_file = args.ann_file

    config = ConfigYOLOV4CspDarkNet53()
    if args.testing_shape:
        config.test_img_shape = convert_testing_shape(args.testing_shape)
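
`convert_testing_shape` presumably turns the `--testing_shape` argument into a [height, width] pair; a minimal sketch of that behaviour (an assumption, not the repository's helper):

def convert_testing_shape_sketch(testing_shape):
    """Accept a single size (e.g. 608) or an 'h,w' string and return [h, w]."""
    if isinstance(testing_shape, str) and ',' in testing_shape:
        height, width = (int(v) for v in testing_shape.split(','))
        return [height, width]
    size = int(testing_shape)
    return [size, size]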
Example n. 28
0
def test_nad_method():
    """
    NAD-Defense test.
    """
    # 1. load trained network
    ckpt_name = './trained_ckpt_file/checkpoint_lenet-10_1875.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_name)
    load_param_into_net(net, load_dict)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=False)
    opt = nn.Momentum(net.trainable_params(), 0.01, 0.09)

    nad = NaturalAdversarialDefense(net,
                                    loss_fn=loss,
                                    optimizer=opt,
                                    bounds=(0.0, 1.0),
                                    eps=0.3)

    # 2. get test data
    data_list = "./MNIST_unzip/test"
    batch_size = 32
    ds_test = generate_mnist_dataset(data_list,
                                     batch_size=batch_size,
                                     sparse=False)
    inputs = []
    labels = []
    for data in ds_test.create_tuple_iterator():
        inputs.append(data[0].astype(np.float32))
        labels.append(data[1])
    inputs = np.concatenate(inputs)
    labels = np.concatenate(labels)

    # 3. get accuracy of test data on original model
    net.set_train(False)
    acc_list = []
    batchs = inputs.shape[0] // batch_size
    for i in range(batchs):
        batch_inputs = inputs[i * batch_size:(i + 1) * batch_size]
        batch_labels = np.argmax(labels[i * batch_size:(i + 1) * batch_size],
                                 axis=1)
        logits = net(Tensor(batch_inputs)).asnumpy()
        label_pred = np.argmax(logits, axis=1)
        acc_list.append(np.mean(batch_labels == label_pred))

    LOGGER.debug(TAG, 'accuracy of TEST data on original model is : %s',
                 np.mean(acc_list))

    # 4. get adv of test data
    attack = FastGradientSignMethod(net, eps=0.3)
    adv_data = attack.batch_generate(inputs, labels)
    LOGGER.debug(TAG, 'adv_data.shape is : %s', adv_data.shape)

    # 5. get accuracy of adv data on original model
    net.set_train(False)
    acc_list = []
    batchs = adv_data.shape[0] // batch_size
    for i in range(batchs):
        batch_inputs = adv_data[i * batch_size:(i + 1) * batch_size]
        batch_labels = np.argmax(labels[i * batch_size:(i + 1) * batch_size],
                                 axis=1)
        logits = net(Tensor(batch_inputs)).asnumpy()
        label_pred = np.argmax(logits, axis=1)
        acc_list.append(np.mean(batch_labels == label_pred))

    LOGGER.debug(TAG, 'accuracy of adv data on original model is : %s',
                 np.mean(acc_list))

    # 6. defense
    net.set_train()
    nad.batch_defense(inputs, labels, batch_size=32, epochs=10)

    # 7. get accuracy of test data on defensed model
    net.set_train(False)
    acc_list = []
    batchs = inputs.shape[0] // batch_size
    for i in range(batchs):
        batch_inputs = inputs[i * batch_size:(i + 1) * batch_size]
        batch_labels = np.argmax(labels[i * batch_size:(i + 1) * batch_size],
                                 axis=1)
        logits = net(Tensor(batch_inputs)).asnumpy()
        label_pred = np.argmax(logits, axis=1)
        acc_list.append(np.mean(batch_labels == label_pred))

    LOGGER.debug(TAG, 'accuracy of TEST data on defensed model is : %s',
                 np.mean(acc_list))

    # 8. get accuracy of adv data on defensed model
    acc_list = []
    batchs = adv_data.shape[0] // batch_size
    for i in range(batchs):
        batch_inputs = adv_data[i * batch_size:(i + 1) * batch_size]
        batch_labels = np.argmax(labels[i * batch_size:(i + 1) * batch_size],
                                 axis=1)
        logits = net(Tensor(batch_inputs)).asnumpy()
        label_pred = np.argmax(logits, axis=1)
        acc_list.append(np.mean(batch_labels == label_pred))

    LOGGER.debug(TAG, 'accuracy of adv data on defensed model is : %s',
                 np.mean(acc_list))
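
The NAD example evaluates batched top-1 accuracy four times with the same loop; purely as an illustration, that loop can be factored into a small helper (a sketch, not part of the original script):

import numpy as np
from mindspore import Tensor

def batched_accuracy_sketch(net, inputs, onehot_labels, batch_size=32):
    """Average top-1 accuracy of `net` over `inputs`, evaluated batch by batch."""
    acc_list = []
    for i in range(inputs.shape[0] // batch_size):
        batch_inputs = inputs[i * batch_size:(i + 1) * batch_size]
        batch_labels = np.argmax(onehot_labels[i * batch_size:(i + 1) * batch_size], axis=1)
        logits = net(Tensor(batch_inputs)).asnumpy()
        acc_list.append(np.mean(np.argmax(logits, axis=1) == batch_labels))
    return float(np.mean(acc_list))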
Example n. 29
0
def do_train(dataset=None,
             network=None,
             load_checkpoint_path="",
             save_checkpoint_path="",
             epoch_num=1):
    """
    Do train
    Args:
        dataset: the train dataset.
        network:  the network with loss
        load_checkpoint_path: the file path which saved pretrain model checkpoint.
        save_checkpoint_path:  the file path which will save finetune model checkpoint.
        epoch_num: the number of epoch
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrain model missed, finetune task must load pretrain model!")

    steps_per_epoch = dataset.get_dataset_size()

    # optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()

        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }]
        optimizer = AdamWeightDecay(group_params,
                                    lr_schedule,
                                    eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.Lamb.learning_rate,
            end_learning_rate=cfg.Lamb.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise ValueError(
            "Optimizer not supported. Supported optimizers: [AdamWeightDecay, Lamb, Momentum]"
        )

    # checkpoint saving callback
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    prefix_name = "gpt2_summarization_" + str(cfg.gpt2_network) + "_" + str(cfg.optimizer) + "_" \
                  + str(epoch_num) + "_bs" + str(gpt2_net_cfg.batch_size)
    ckpoint_cb = ModelCheckpoint(
        prefix=prefix_name,
        directory=None if save_checkpoint_path == "" else save_checkpoint_path,
        config=ckpt_config)

    # load the pretrained checkpoint and remap its parameter names onto the finetune network
    param_dict = load_checkpoint(load_checkpoint_path)

    final_param_dict = {}
    for name, value in param_dict.items():
        final_param_dict['gpt2.gpt2.' + name] = value
    # reuse the embedding table as the LM head weight (weight tying)
    final_param_dict['gpt2.lm_head.weight'] = param_dict[
        'gpt2_embedding_lookup.embedding_table']

    load_param_into_net(network, final_param_dict)
    print("Loaded pretrained parameters successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network,
                                    optimizer=optimizer,
                                    scale_update_cell=update_cell)
    netwithgrads.set_train(True)

    loss_cb = LossMonitor(per_print_times=1)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]

    print("============== Starting Finetuning ==============")
    model.train(epoch_num,
                dataset,
                callbacks=callbacks,
                dataset_sink_mode=False)
    print("============== Finetuning Success ==============")
Example no. 30
0
        layers=hparams.layers,
        stacks=hparams.stacks,
        residual_channels=hparams.residual_channels,
        gate_channels=hparams.gate_channels,
        skip_out_channels=hparams.skip_out_channels,
        cin_channels=hparams.cin_channels,
        gin_channels=hparams.gin_channels,
        n_speakers=hparams.n_speakers,
        dropout=hparams.dropout,
        kernel_size=hparams.kernel_size,
        cin_pad=hparams.cin_pad,
        upsample_conditional_features=hparams.upsample_conditional_features,
        upsample_params=upsample_params,
        scalar_input=is_scalar_input(hparams.input_type),
        output_distribution=hparams.output_distribution,
    )

    Net = PredictNet(model)
    Net.set_train(False)
    receptive_field = model.receptive_field
    print("Receptive field (samples / ms): {} / {}".format(receptive_field, receptive_field / fs * 1000))
    param_dict = load_checkpoint(args.pretrain_ckpt)
    load_param_into_net(model, param_dict)
    print('Successfully loaded the pre-trained model')

    # dummy inputs used only to build the export graph:
    # x - (possibly one-hot) waveform input, c - local conditioning features, g - global conditioning (speaker) ids
    x = np.array(np.random.random((2, 256, 10240)), dtype=np.float32)
    c = np.array(np.random.random((2, 80, 44)), dtype=np.float32)
    g = np.array([0, 0], dtype=np.int64)

    export(Net, Tensor(x), Tensor(c), Tensor(g), file_name="WaveNet", file_format='MINDIR')
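
For completeness, the exported MINDIR file can usually be loaded back and executed with mindspore.load and nn.GraphCell; the snippet below is a sketch, assuming the same dummy input shapes used for export above.

import numpy as np
import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor

# Load the exported graph and wrap it so it can be called like a normal cell.
graph = ms.load("WaveNet.mindir")
wavenet_infer = nn.GraphCell(graph)

x = Tensor(np.random.random((2, 256, 10240)).astype(np.float32))
c = Tensor(np.random.random((2, 80, 44)).astype(np.float32))
g = Tensor(np.array([0, 0], dtype=np.int64))
output = wavenet_infer(x, c, g)
print(output.shape)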