def __init__(self, network, optimizer, sens=1.0):
        super(TransformerTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: {}".format(self.parallel_mode))
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
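# Hedged sketch (not in the original snippet): a construct() in the usual
# MindSpore one-step-cell style, showing how the members initialised above
# combine. F is assumed to be mindspore.ops.functional, and GRADIENT_CLIP_TYPE
# / GRADIENT_CLIP_VALUE are assumed module-level constants.
def construct(self, *inputs):
        loss = self.network(*inputs)
        # Back-propagate a constant sensitivity instead of a tensor of ones.
        sens = self.cast(F.tuple_to_array((self.sens,)), F.dtype(loss))
        grads = self.grad(self.network, self.weights)(*inputs, sens)
        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
        if self.reducer_flag:
            grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)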
Example #2
def network_init(args):
    devid = int(os.getenv('DEVICE_ID', '0'))
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=True, device_id=devid)
    # init distributed
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
    # choose whether only the master rank saves checkpoints or every rank does (compatible with model parallel)
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1
    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    args.logger.save_args(args)
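# Hedged usage sketch (not from the original source): the attribute bag that
# network_init() expects. Field names are read off the function body above;
# the checkpoint path is a placeholder.
from types import SimpleNamespace

args = SimpleNamespace(device_target='Ascend', is_distributed=False,
                       is_save_on_master=True, rank=0, group_size=1,
                       ckpt_path='./ckpt')
network_init(args)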
Example #3
 def __init__(self, network, optimizer, sens=1.0):
     super(BertEvaluationCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.sens = sens
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.hyper_map = C.HyperMap()
Example #4
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertEvaluationWithLossScaleCell,
           self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_status = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(
             Tensor(scale_update_cell.get_loss_scale(),
                    dtype=mstype.float32))
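# Hedged note: in MindSpore's loss-scale cells the status ops above are
# typically combined in construct() along these lines (pattern sketch, not
# the author's exact code):
#     init = self.alloc_status()
#     init = F.depend(init, loss)              # run the check after the forward pass
#     clear_status = self.clear_status(init)
#     ... compute and reduce grads ...
#     init = F.depend(init, grads)
#     get_status = self.get_status(init)
#     init = F.depend(init, get_status)
#     flag_sum = self.reduce_sum(init, (0,))
#     if self.is_distributed:
#         flag_sum = self.allreduce(flag_sum)
#     overflow = self.less_equal(self.base, flag_sum)   # True => overflow detected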
Example #5
 def __init__(self, D, optimizer, sens=1.0):
     super(TrainOneStepD, self).__init__(auto_prefix=False)
     self.optimizer = optimizer
     self.D = D
     self.D.set_grad()
     self.D.set_train()
     self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.weights = ms.ParameterTuple(D.trainable_params())
     self.reducer_flag = False
     self.grad_reducer = None
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         if auto_parallel_context().get_device_num_is_set():
             degree = context.get_auto_parallel_context("device_num")
         else:
             degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
Example #6
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=True,
                 device_num_each_group=1):
        super(_BatchNorm, self).__init__()
        if num_features < 1:
            raise ValueError("num_features must be at least 1")

        if momentum < 0 or momentum > 1:
            raise ValueError(
                "momentum should be a number in range [0, 1], but got {}".
                format(momentum))

        self.use_batch_statistics = use_batch_statistics
        self.num_features = num_features
        self.eps = eps
        self.moving_mean = Parameter(initializer(moving_mean_init,
                                                 num_features),
                                     name="mean",
                                     requires_grad=False)
        self.moving_variance = Parameter(initializer(moving_var_init,
                                                     num_features),
                                         name="variance",
                                         requires_grad=False)
        self.gamma = Parameter(initializer(gamma_init, num_features),
                               name="gamma",
                               requires_grad=affine)
        self.beta = Parameter(initializer(beta_init, num_features),
                              name="beta",
                              requires_grad=affine)
        self.group = check_int_positive(device_num_each_group)
        self.is_global = False
        if self.group != 1:
            self.rank_id = get_rank()
            self.rank_size = get_group_size()
            self.device_list = list(range(self.rank_size))
            self.rank_list = self.list_group(self.device_list, self.group)
            self.rank_list_idx = len(self.rank_list)
            for i in range(self.rank_list_idx):
                if self.rank_id in self.rank_list[i] and self.group != 1:
                    self.is_global = True
                    management.create_group('group' + str(i),
                                            self.rank_list[i])
                    self.all_reduce = P.AllReduce(
                        P.ReduceOp.SUM,
                        'group' + str(i)).add_prim_attr('fusion', 1)
        self.shape = P.Shape()
        self.reduce_mean = P.ReduceMean(keep_dims=True)
        self.square = P.Square()
        self.sqrt = P.Sqrt()
        self.cast = P.Cast()
        self.dtype = P.DType()
        self.reshape = P.Reshape()
        self.is_ascend = context.get_context("device_target") == "Ascend"

        if context.get_context("enable_ge"):
            self.is_ge_backend = True
            self.momentum = Tensor(1.0 - momentum, mstype.float32)
        else:
            self.is_ge_backend = False
            self.momentum = 1.0 - momentum
        if self.is_ge_backend or self.is_ascend:
            self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps)
        else:
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
        self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps)

        data_parallel_strategy = ((1, ), (1, ))
        data_parallel_strategy_one = ((1, ), ())
        self.sub_mean = P.Sub().set_strategy(data_parallel_strategy)
        self.sub_var = P.Sub().set_strategy(data_parallel_strategy)
        self.mul_mean = P.Mul().set_strategy(data_parallel_strategy_one)
        self.mul_var = P.Mul().set_strategy(data_parallel_strategy_one)
        self.assign_sub_mean = P.AssignSub().set_strategy(
            data_parallel_strategy)
        self.assign_sub_var = P.AssignSub().set_strategy(
            data_parallel_strategy)
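# Hedged reconstruction of the list_group helper referenced above (its body
# is not part of this snippet; this mirrors the chunking behaviour implied by
# the caller):
#     def list_group(self, world_rank, group_size):
#         # Split the flat device list into consecutive groups of group_size ranks.
#         return [world_rank[i:i + group_size]
#                 for i in range(0, len(world_rank), group_size)]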
Example #7
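# Assumed imports for this snippet (hedged; aliases follow the MindSpore 1.x
# conventions used throughout these examples):
# import mindspore.dataset as ds
# import mindspore.dataset.vision.c_transforms as C
# import mindspore.dataset.transforms.c_transforms as C2
# import mindspore.common.dtype as mstype
# from mindspore.communication.management import init, get_rank, get_group_size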
def create_dataset2(dataset_path,
                    do_train,
                    repeat_num=1,
                    batch_size=32,
                    target="Ascend",
                    distribute=False):
    """
    create a train or eval imagenet2012 dataset for resnet50

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        if distribute:
            init()
            rank_id = get_rank()
            device_num = get_group_size()
        else:
            device_num = 1

    if device_num == 1:
        data_set = ds.ImageFolderDataset(dataset_path,
                                         num_parallel_workers=8,
                                         shuffle=True)
    else:
        data_set = ds.ImageFolderDataset(dataset_path,
                                         num_parallel_workers=8,
                                         shuffle=True,
                                         num_shards=device_num,
                                         shard_id=rank_id)

    image_size = 224
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size,
                                     scale=(0.08, 1.0),
                                     ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(256),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=trans,
                            input_columns="image",
                            num_parallel_workers=8)
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="label",
                            num_parallel_workers=8)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
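# Hedged usage sketch (the dataset path is a placeholder): build a single-GPU
# training pipeline and pull one batch.
train_ds = create_dataset2("/path/to/imagenet/train", do_train=True,
                           batch_size=32, target="GPU", distribute=False)
for batch in train_ds.create_dict_iterator(num_epochs=1):
    print(batch["image"].shape, batch["label"].shape)
    break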
Example #8
import numpy as np

import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.ops import operations as P

context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

init()
rank = get_rank()
size = get_group_size()
x = np.ones([3, 1, 3, 3]).astype(np.float32) * 0.01 * (rank + 1)


class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
        self.x1 = Parameter(initializer(Tensor(x), x.shape), name='x1')
        self.x2 = Parameter(initializer(Tensor(x), x.shape), name='x2')
        self.x3 = Parameter(initializer(Tensor(x), x.shape), name='x3')

        self.broadcast1 = P.Broadcast(0)
        self.broadcast2 = P.Broadcast(1)
        self.broadcast3 = P.Broadcast(2)

    def construct(self):
        # Body reconstructed (hedged): broadcast the three parameters from
        # root ranks 0, 1 and 2 respectively.
        return (self.broadcast1((self.x1,)),
                self.broadcast2((self.x2,)),
                self.broadcast3((self.x3,)))
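# Hedged driver (not in the original snippet): run the broadcast net once on
# every rank; each output is a one-element tuple holding the tensor received
# from the corresponding root rank.
net = Net()
out1, out2, out3 = net()
print("rank", rank, "received from root 0:", out1[0].asnumpy().mean())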
Example #9
parser.add_argument(
    '--speaker_id',
    type=str,
    default='',
    help='Use a specific speaker of the data, for multi-speaker datasets.')
parser.add_argument('--is_distributed',
                    action="store_true",
                    default=False,
                    help='Distributed training')
args = parser.parse_args()

if __name__ == '__main__':
    if args.is_distributed:
        init('nccl')
        rank_id = get_rank()
        group_size = get_group_size()
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="GPU",
                            save_graphs=False)
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
    else:
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="GPU",
                            save_graphs=False)
        rank_id = 0
        group_size = 1
Example #10
    def __init__(self, config):
        super(WideDeepModel, self).__init__()
        self.batch_size = config.batch_size
        host_device_mix = bool(config.host_device_mix)
        parameter_server = bool(config.parameter_server)
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
        if is_auto_parallel:
            self.batch_size = self.batch_size * get_group_size()
        is_field_slice = config.field_slice
        self.field_size = config.field_size
        self.vocab_size = config.vocab_size
        self.emb_dim = config.emb_dim
        self.deep_layer_dims_list = config.deep_layer_dim
        self.deep_layer_act = config.deep_layer_act
        self.init_args = config.init_args
        self.weight_init, self.bias_init = config.weight_bias_init
        self.weight_bias_init = config.weight_bias_init
        self.emb_init = config.emb_init
        self.drop_out = config.dropout_flag
        self.keep_prob = config.keep_prob
        self.deep_input_dims = self.field_size * self.emb_dim
        self.layer_dims = self.deep_layer_dims_list + [1]
        self.all_dim_list = [self.deep_input_dims] + self.layer_dims

        init_acts = [('Wide_b', [1], self.emb_init)]
        var_map = init_var_dict(self.init_args, init_acts)
        self.wide_b = var_map["Wide_b"]
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True, drop_out=config.dropout_flag)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True, drop_out=config.dropout_flag)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True, drop_out=config.dropout_flag)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True, drop_out=config.dropout_flag)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        use_activation=False, convert_dtype=True, drop_out=config.dropout_flag)
        self.wide_mul = P.Mul()
        self.deep_mul = P.Mul()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.reshape = P.Reshape()
        self.deep_reshape = P.Reshape()
        self.square = P.Square()
        self.shape = P.Shape()
        self.tile = P.Tile()
        self.concat = P.Concat(axis=1)
        self.cast = P.Cast()
        if is_auto_parallel and host_device_mix and not is_field_slice:
            self.dense_layer_1.dropout.dropout_do_mask.set_strategy(((1, get_group_size()),))
            self.dense_layer_1.dropout.dropout.set_strategy(((1, get_group_size()),))
            self.dense_layer_1.matmul.set_strategy(((1, get_group_size()), (get_group_size(), 1)))
            self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim,
                                                           slice_mode=nn.EmbeddingLookUpSplitMode.TABLE_COLUMN_SLICE)
            self.wide_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, 1,
                                                           slice_mode=nn.EmbeddingLookUpSplitMode.TABLE_ROW_SLICE)
            self.deep_mul.set_strategy(((1, 1, get_group_size()), (1, 1, 1)))
            self.deep_reshape.add_prim_attr("skip_redistribution", True)
            self.reduce_sum.add_prim_attr("cross_batch", True)
            self.embedding_table = self.deep_embeddinglookup.embedding_table
        elif is_auto_parallel and host_device_mix and is_field_slice and config.full_batch and config.manual_shape:
            manual_shapes = tuple((s[0] for s in config.manual_shape))
            self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim,
                                                           slice_mode=nn.EmbeddingLookUpSplitMode.FIELD_SLICE,
                                                           manual_shapes=manual_shapes)
            self.wide_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, 1,
                                                           slice_mode=nn.EmbeddingLookUpSplitMode.FIELD_SLICE,
                                                           manual_shapes=manual_shapes)
            self.deep_mul.set_strategy(((1, get_group_size(), 1), (1, get_group_size(), 1)))
            self.wide_mul.set_strategy(((1, get_group_size(), 1), (1, get_group_size(), 1)))
            self.reduce_sum.set_strategy(((1, get_group_size(), 1),))
            self.dense_layer_1.dropout.dropout_do_mask.set_strategy(((1, get_group_size()),))
            self.dense_layer_1.dropout.dropout.set_strategy(((1, get_group_size()),))
            self.dense_layer_1.matmul.set_strategy(((1, get_group_size()), (get_group_size(), 1)))
            self.embedding_table = self.deep_embeddinglookup.embedding_table
        elif parameter_server:
            self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim)
            self.wide_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, 1)
            self.embedding_table = self.deep_embeddinglookup.embedding_table
            self.deep_embeddinglookup.embedding_table.set_param_ps()
            self.wide_embeddinglookup.embedding_table.set_param_ps()
        else:
            self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target='DEVICE')
            self.wide_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, 1, target='DEVICE')
            self.embedding_table = self.deep_embeddinglookup.embedding_table
Example #11
def create_dataset(dataset_path,
                   do_train,
                   config,
                   device_target,
                   repeat_num=1,
                   batch_size=32):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1.
        batch_size(int): the batch size of dataset. Default: 32.

    Returns:
        dataset
    """
    if device_target == "Ascend":
        rank_size = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
        columns_list = ['image', 'label']
        if config.data_load_mode == "mindrecord":
            load_func = partial(de.MindDataset, dataset_path, columns_list)
        else:
            load_func = partial(de.ImageFolderDataset, dataset_path)
        if do_train:
            if rank_size == 1:
                ds = load_func(num_parallel_workers=8, shuffle=True)
            else:
                ds = load_func(num_parallel_workers=8,
                               shuffle=True,
                               num_shards=rank_size,
                               shard_id=rank_id)
        else:
            ds = load_func(num_parallel_workers=8, shuffle=False)
    elif device_target == "GPU":
        if do_train:
            from mindspore.communication.management import get_rank, get_group_size
            ds = de.ImageFolderDataset(dataset_path,
                                       num_parallel_workers=8,
                                       shuffle=True,
                                       num_shards=get_group_size(),
                                       shard_id=get_rank())
        else:
            ds = de.ImageFolderDataset(dataset_path,
                                       num_parallel_workers=8,
                                       shuffle=True)
    else:
        raise ValueError("Unsupported device_target.")

    resize_height = config.image_height

    if do_train:
        buffer_size = 20480
        # apply shuffle operations
        ds = ds.shuffle(buffer_size=buffer_size)

    # define map operations
    decode_op = C.Decode()
    resize_crop_decode_op = C.RandomCropDecodeResize(resize_height,
                                                     scale=(0.08, 1.0),
                                                     ratio=(0.75, 1.333))
    horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)

    resize_op = C.Resize(256)
    center_crop = C.CenterCrop(resize_height)
    normalize_op = C.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                               std=[0.229 * 255, 0.224 * 255, 0.225 * 255])
    change_swap_op = C.HWC2CHW()

    if do_train:
        trans = [
            resize_crop_decode_op, horizontal_flip_op, normalize_op,
            change_swap_op
        ]
    else:
        trans = [
            decode_op, resize_op, center_crop, normalize_op, change_swap_op
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=16)
    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
Example #12
def create_dataset(dataset_path,
                   do_train,
                   repeat_num=1,
                   batch_size=32,
                   target="Ascend"):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
    else:
        init()
        rank_id = get_rank()
        device_num = get_group_size()

    columns_list = ['image', 'label']
    if config.data_load_mode == "mindrecord":
        load_func = partial(ds.MindDataset, dataset_path, columns_list)
    else:
        load_func = partial(ds.ImageFolderDataset, dataset_path)
    if device_num == 1:
        data_set = load_func(num_parallel_workers=8, shuffle=True)
    else:
        data_set = load_func(num_parallel_workers=8,
                             shuffle=True,
                             num_shards=device_num,
                             shard_id=rank_id)

    image_size = 224
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size,
                                     scale=(0.08, 1.0),
                                     ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(256),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=trans,
                            input_columns="image",
                            num_parallel_workers=8)
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="label",
                            num_parallel_workers=8)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
Example #13
def create_dataset_py(dataset_path,
                      do_train,
                      repeat_num=1,
                      batch_size=32,
                      target="Ascend"):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
    else:
        init()
        rank_id = get_rank()
        device_num = get_group_size()

    if do_train:
        if device_num == 1:
            data_set = ds.ImageFolderDataset(dataset_path,
                                             num_parallel_workers=8,
                                             shuffle=True)
        else:
            data_set = ds.ImageFolderDataset(dataset_path,
                                             num_parallel_workers=8,
                                             shuffle=True,
                                             num_shards=device_num,
                                             shard_id=rank_id)
    else:
        data_set = ds.ImageFolderDataset(dataset_path,
                                         num_parallel_workers=8,
                                         shuffle=False)

    image_size = 224

    # define map operations
    decode_op = P.Decode()
    resize_crop_op = P.RandomResizedCrop(image_size,
                                         scale=(0.08, 1.0),
                                         ratio=(0.75, 1.333))
    horizontal_flip_op = P.RandomHorizontalFlip(prob=0.5)

    resize_op = P.Resize(256)
    center_crop = P.CenterCrop(image_size)
    to_tensor = P.ToTensor()
    normalize_op = P.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])

    # define map operations
    if do_train:
        trans = [
            decode_op, resize_crop_op, horizontal_flip_op, to_tensor,
            normalize_op
        ]
    else:
        trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op]

    compose = P2.Compose(trans)
    data_set = data_set.map(operations=compose,
                            input_columns="image",
                            num_parallel_workers=8,
                            python_multiprocessing=True)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
Example #14
def parse_args(cloud_args=None):
    """parse_args"""
    parser = argparse.ArgumentParser('mindspore classification test')
    parser.add_argument('--platform',
                        type=str,
                        default='Ascend',
                        choices=('Ascend', 'GPU'),
                        help='run platform')

    # dataset related
    parser.add_argument('--data_dir',
                        type=str,
                        default='/opt/npu/datasets/classification/val',
                        help='eval data dir')
    parser.add_argument('--per_batch_size',
                        default=32,
                        type=int,
                        help='batch size for per npu')
    # network related
    parser.add_argument('--graph_ckpt',
                        type=int,
                        default=1,
                        help='graph ckpt or feed ckpt')
    parser.add_argument('--pretrained',
                        default='',
                        type=str,
                        help='full path of the pretrained model to load. '
                        'If it is a directory, all checkpoints in it will be tested')

    # logging related
    parser.add_argument('--log_path',
                        type=str,
                        default='outputs/',
                        help='path to save log')
    parser.add_argument('--is_distributed',
                        type=int,
                        default=0,
                        help='whether to run on multiple devices')

    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.backbone = config.backbone
    args.rank = config.rank
    args.group_size = config.group_size

    args.image_size = list(map(int, args.image_size.split(',')))

    # init distributed
    if args.is_distributed:
        if args.platform == "Ascend":
            init()
        elif args.platform == "GPU":
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    args.outputs_dir = os.path.join(
        args.log_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
Example #15
def dpn_train(args):
    # init context
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend", save_graphs=False, device_id=device_id)
    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
        context.set_auto_parallel_context(device_num=args.group_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)

    # choose whether only the master rank saves checkpoints or every rank does (compatible with model parallel)
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1
    # create dataset
    args.train_dir = os.path.join(args.data_dir, 'train')
    args.eval_dir = os.path.join(args.data_dir, 'val')
    train_dataset = classification_dataset(args.train_dir,
                                           image_size=args.image_size,
                                           per_batch_size=args.batch_size,
                                           max_epoch=1,
                                           num_parallel_workers=args.num_parallel_workers,
                                           shuffle=True,
                                           rank=args.rank,
                                           group_size=args.group_size)
    if args.eval_each_epoch:
        print("create eval_dataset")
        eval_dataset = classification_dataset(args.eval_dir,
                                              image_size=args.image_size,
                                              per_batch_size=args.batch_size,
                                              max_epoch=1,
                                              num_parallel_workers=args.num_parallel_workers,
                                              shuffle=False,
                                              rank=args.rank,
                                              group_size=args.group_size,
                                              mode='eval')
    train_step_size = train_dataset.get_dataset_size()

    # choose net
    net = dpns[args.backbone](num_classes=args.num_classes)

    # load checkpoint
    if os.path.isfile(args.pretrained):
        print("load ckpt")
        load_param_into_net(net, load_checkpoint(args.pretrained))
    # learning rate schedule
    if args.lr_schedule == 'drop':
        print("lr_schedule:drop")
        lr = Tensor(get_lr_drop(global_step=args.global_step,
                                total_epochs=args.epoch_size,
                                steps_per_epoch=train_step_size,
                                lr_init=args.lr_init,
                                factor=args.factor))
    elif args.lr_schedule == 'warmup':
        print("lr_schedule:warmup")
        lr = Tensor(get_lr_warmup(global_step=args.global_step,
                                  total_epochs=args.epoch_size,
                                  steps_per_epoch=train_step_size,
                                  lr_init=args.lr_init,
                                  lr_max=args.lr_max,
                                  warmup_epochs=args.warmup_epochs))

    # optimizer
    opt = SGD(net.trainable_params(),
              lr,
              momentum=args.momentum,
              weight_decay=args.weight_decay,
              loss_scale=args.loss_scale_num)
    # loss scale
    loss_scale = FixedLossScaleManager(args.loss_scale_num, False)
    # loss function
    if args.dataset == "imagenet-1K":
        print("Use SoftmaxCrossEntropyWithLogits")
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        print("Use Label_smooth CrossEntropy")
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)
    # create model
    model = Model(net, amp_level="O2",
                  keep_batchnorm_fp32=False,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'top_1_accuracy', 'top_5_accuracy'})

    # loss/time monitor & ckpt save callback
    loss_cb = LossMonitor()
    time_cb = TimeMonitor(data_size=train_step_size)
    cb = [loss_cb, time_cb]
    if args.rank_save_ckpt_flag:
        if args.eval_each_epoch:
            save_cb = SaveCallback(model, eval_dataset, args.ckpt_path)
            cb += [save_cb]
        else:
            config_ck = CheckpointConfig(save_checkpoint_steps=train_step_size,
                                         keep_checkpoint_max=args.keep_checkpoint_max)
            ckpoint_cb = ModelCheckpoint(prefix="dpn", directory=args.ckpt_path, config=config_ck)
            cb.append(ckpoint_cb)
    # train model
    model.train(args.epoch_size, train_dataset, callbacks=cb)
Example #16
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 field_size,
                 param_init='normal',
                 target='CPU',
                 slice_mode='batch_slice',
                 feature_num_list=None,
                 max_norm=None,
                 sparse=True,
                 operator='SUM'):
        super(MultiFieldEmbeddingLookup,
              self).__init__(vocab_size, embedding_size, param_init, target,
                             slice_mode, feature_num_list, max_norm, sparse)
        self.field_size = validator.check_positive_int(field_size,
                                                       'field_size')
        self.operator = operator

        self.mul = P.Mul()
        self.inf_mask_mul = P.Mul()
        self.bias_add = P.Add()
        self.inf_add = P.Add()
        self.merge_op = None
        self.count_op = P.UnsortedSegmentSum()
        self.abs = P.Abs()
        self.equal = P.Equal()
        self.add = P.Add()
        self.cast = P.Cast()
        self.div_no_nan = P.DivNoNan()
        self.expand = P.ExpandDims()
        self.max_mask_mul = P.Mul()
        self.max_no_equal = P.NotEqual()

        if operator == MultiFieldEmbeddingLookup.OPERATOR_SUM:
            self.merge_op = P.UnsortedSegmentSum()
        elif operator == MultiFieldEmbeddingLookup.OPERATOR_MAX:
            self.merge_op = P.UnsortedSegmentMax()
        elif operator == MultiFieldEmbeddingLookup.OPERATOR_MEAN:
            self.merge_op = P.UnsortedSegmentSum()
        else:
            raise ValueError(
                "The operator supports ['SUM', 'MAX', 'MEAN'], but found: " +
                str(operator))

        parallel_mode = _get_parallel_mode()
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                             ParallelMode.AUTO_PARALLEL)
        if slice_mode in ["table_row_slice", "batch_slice"
                          ] and is_auto_parallel:
            self.merge_op.shard(
                ((get_group_size(), 1, 1), (get_group_size(), 1)))
            self.expand.shard(((get_group_size(), ), ))
            self.bias_add.shard(((1, 1), (1, 1)))
            self.mul.shard(
                ((get_group_size(), 1, 1), (get_group_size(), 1, 1)))
            self.count_op.shard(((get_group_size(), 1), (get_group_size(), 1)))
            self.add.shard(((get_group_size(), ), (get_group_size(), )))
            self.div_no_nan.shard(
                ((get_group_size(), 1), (get_group_size(), 1)))
            self.max_mask_mul.shard(
                ((get_group_size(), 1), (get_group_size(), 1)))
            self.max_no_equal.shard(((1, ), ()))
            if operator == MultiFieldEmbeddingLookup.OPERATOR_MAX:
                self.equal.shard(((get_group_size(), 1, 1), ()))
                self.inf_mask_mul.shard(((get_group_size(), 1, 1), ()))
                self.merge_op.shard(
                    ((get_group_size(), 1), (get_group_size(), )))
                self.count_op.shard(
                    ((get_group_size(), ), (get_group_size(), )))
                self.inf_add.shard(
                    ((get_group_size(), 1, 1), (get_group_size(), 1, 1)))
        elif slice_mode == "table_column_slice" and is_auto_parallel:
            self.merge_op.shard(((1, 1, get_group_size()), (1, 1)))
            self.div_no_nan.shard(((1, get_group_size()), (1, 1)))
            self.bias_add.shard(((1, 1), (1, 1)))
            self.mul.shard(((1, 1, 1), (1, 1, get_group_size())))
            self.count_op.shard(((1, 1), (1, 1)))
            self.add.shard(((1, ), (1, )))
            self.max_mask_mul.shard(((1, get_group_size()), (1, 1)))
            self.expand.shard(((1, ), ))
            self.max_no_equal.shard(((1, ), ()))
            if operator == MultiFieldEmbeddingLookup.OPERATOR_MAX:
                self.equal.shard(((1, 1, 1), ()))
                self.inf_mask_mul.shard(((1, 1, 1), ()))
                self.merge_op.shard(((1, get_group_size()), (1, )))
                self.count_op.shard(((1, ), (1, )))
                self.inf_add.shard(((1, 1, get_group_size()), (1, 1, 1)))
        else:
            if is_auto_parallel:
                raise ValueError(
                    "slice_mode should be one of ['table_row_slice', "
                    "'batch_slice', 'table_column_slice'], but got " + str(slice_mode))

        # Min value for fp32
        self.negative_inf_value = -3.402823466E+38
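# Hedged usage sketch, following the mindspore.nn.MultiFieldEmbeddingLookup
# docs: indices/values/field_ids share shape (batch, seq_length); the output
# has shape (batch, field_size, embedding_size).
import numpy as np
from mindspore import Tensor
import mindspore.common.dtype as mstype

net = MultiFieldEmbeddingLookup(vocab_size=10, embedding_size=2, field_size=2,
                                operator='SUM')
indices = Tensor(np.array([[2, 4, 6, 0], [1, 3, 5, 0]]), mstype.int32)
values = Tensor(np.ones((2, 4)), mstype.float32)
field_ids = Tensor(np.array([[0, 1, 1, 0], [0, 0, 1, 0]]), mstype.int32)
out = net(indices, values, field_ids)   # shape: (2, 2, 2)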
Example #17
def train():
    # set args
    dev = "GPU"
    epoch_size = int(args_opt.epoch_size)
    total_batch = int(args_opt.batch_size)
    print_per_steps = int(args_opt.print_per_steps)
    compute_type = str(args_opt.dtype).lower()
    ckpt_save_dir = str(args_opt.ckpt_path)
    save_ckpt = bool(args_opt.save_ckpt)
    device_num = 1
    # init context
    if args_opt.mode == "GRAPH":
        mode = context.GRAPH_MODE
        all_reduce_fusion_config = [85, 160]
    else:
        mode = context.PYNATIVE_MODE
        all_reduce_fusion_config = [30, 90, 160]
    context.set_context(mode=mode, device_target=dev, save_graphs=False)
    if args_opt.run_distribute:
        init()
        device_num = get_group_size()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            all_reduce_fusion_config=all_reduce_fusion_config)
        ckpt_save_dir = ckpt_save_dir + "ckpt_" + str(get_rank()) + "/"

    # create dataset
    dataset = create_dataset(dataset_path=args_opt.dataset_path,
                             do_train=True,
                             repeat_num=1,
                             batch_size=total_batch,
                             target=dev,
                             dtype=compute_type,
                             device_num=device_num)
    step_size = dataset.get_dataset_size()
    if print_per_steps > step_size or print_per_steps < 1:
        print("Arg: print_per_steps should be less than or equal to dataset size",
              step_size)
        print("Change to default: 20")
        print_per_steps = 20
    # define net
    net = resnet(class_num=1001, dtype=compute_type)

    # init weight
    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Conv2d):
            cell.weight.set_data(
                weight_init.initializer(weight_init.XavierUniform(),
                                        cell.weight.shape, cell.weight.dtype))
        if isinstance(cell, nn.Dense):
            cell.weight.set_data(
                weight_init.initializer(weight_init.TruncatedNormal(),
                                        cell.weight.shape, cell.weight.dtype))

    # init lr
    lr = get_liner_lr(lr_init=0,
                      lr_end=0,
                      lr_max=0.8,
                      warmup_epochs=0,
                      total_epochs=epoch_size,
                      steps_per_epoch=step_size)
    lr = Tensor(lr)

    # define opt
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    # define loss, model
    loss = CrossEntropySmooth(sparse=True,
                              reduction='mean',
                              smooth_factor=0.1,
                              num_classes=1001)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                   0.9, 1e-4)
    loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
    # Mixed precision
    if compute_type == "fp16":
        if mode == context.PYNATIVE_MODE:
            opt = MomentumWeightDecay(
                filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                0.9, 1e-4, 1024)
        else:
            opt = Momentum(
                filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                0.9, 1e-4, 1024)
        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale,
                      metrics={'acc'},
                      amp_level="O2",
                      keep_batchnorm_fp32=False)
    # define callbacks
    if mode == context.PYNATIVE_MODE:
        print_per_steps = 1
    time_cb = MyTimeMonitor(total_batch, print_per_steps, step_size, mode)
    cb = [time_cb]
    if save_ckpt:
        config_ck = CheckpointConfig(save_checkpoint_steps=5 * step_size,
                                     keep_checkpoint_max=5)
        ckpt_cb = ModelCheckpoint(prefix="resnet_benchmark",
                                  directory=ckpt_save_dir,
                                  config=config_ck)
        cb += [ckpt_cb]
    # train model
    print("========START RESNET50 GPU BENCHMARK========")
    if mode == context.GRAPH_MODE:
        model.train(int(epoch_size * step_size / print_per_steps),
                    dataset,
                    callbacks=cb,
                    sink_size=print_per_steps)
    else:
        model.train(epoch_size, dataset, callbacks=cb)
Example #18
def train_and_eval(config):
    """
    test_train_eval
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    host_device_mix = bool(config.host_device_mix)
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        if config.field_slice:
            compute_manual_shape(config, get_group_size())
            ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                      batch_size=batch_size*get_group_size(), data_type=dataset_type,
                                      manual_shape=config.manual_shape, target_column=config.field_size)
            ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                     batch_size=batch_size*get_group_size(), data_type=dataset_type,
                                     manual_shape=config.manual_shape, target_column=config.field_size)
        else:
            ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                      batch_size=batch_size*get_group_size(), data_type=dataset_type)
            ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                     batch_size=batch_size*get_group_size(), data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(
        model, ds_eval, auc_metric, config, host_device_mix=host_device_mix)

    callback = LossCallBack(config=config, per_print_times=20)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs,
                                  keep_checkpoint_max=5, integrated_save=False)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [TimeMonitor(
        ds_train.get_dataset_size()), eval_callback, callback]
    if not host_device_mix:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                dataset_sink_mode=(not host_device_mix))
Example #19
def run_pretrain(args_opt):
    """pre-train bert"""
    global device_id
    global device_num
    global rank_id
    global job_id
    args_opt.device_id = device_id
    args_opt.device_num = device_num
    sync_dataset(args_opt.data_url)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
                rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 for now; running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Unsupported optimizer {}; only [Lamb, Momentum, AdamWeightDecayDynamicLR] are supported"
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    print("Enable save checkpoint: ", args_opt.enable_save_ckpt)
    print("Rank ID: ", rank_id)
    if args_opt.enable_save_ckpt == "true" and rank_id % device_num == 0:
        print("Enable save checkpoint")
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
Example #20
def test():
    """test method"""

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    # logger
    args.outputs_dir = os.path.join(args.log_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    args.logger = get_logger(args.outputs_dir, args.rank)

    context.reset_auto_parallel_context()
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode, gradients_mean=True, device_num=1)

    args.logger.info('Creating Network....')
    network = YOLOV4CspDarkNet53(is_training=False)

    args.logger.info(args.pretrained)
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('yolo_network.'):
                param_dict_new[key[13:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load_model {} success'.format(args.pretrained))
    else:
        args.logger.info('{} does not exist or is not a pre-trained checkpoint'.format(args.pretrained))
        raise FileNotFoundError('{} does not exist or is not a pre-trained checkpoint'.format(args.pretrained))

    data_root = args.data_root
    # annFile = args.annFile

    config = ConfigYOLOV4CspDarkNet53()
    if args.testing_shape:
        config.test_img_shape = conver_testing_shape(args)

    data_txt = os.path.join(args.data_dir, 'testdev2017.txt')
    ds, data_size = create_yolo_datasetv2(data_root, data_txt=data_txt, batch_size=args.per_batch_size,
                                          max_epoch=1, device_num=args.group_size, rank=args.rank, shuffle=False,
                                          config=config)

    args.logger.info('testing shape : {}'.format(config.test_img_shape))
    args.logger.info('total {} images to eval'.format(data_size))

    network.set_train(False)

    # init detection engine
    detection = DetectionEngine(args)

    input_shape = Tensor(tuple(config.test_img_shape), ms.float32)
    args.logger.info('Start inference....')
    for i, data in enumerate(ds.create_dict_iterator()):
        image = Tensor(data["image"])

        image_shape = Tensor(data["image_shape"])
        image_id = Tensor(data["img_id"])

        prediction = network(image, input_shape)
        output_big, output_me, output_small = prediction
        output_big = output_big.asnumpy()
        output_me = output_me.asnumpy()
        output_small = output_small.asnumpy()
        image_id = image_id.asnumpy()
        image_shape = image_shape.asnumpy()

        detection.detect([output_small, output_me, output_big], args.per_batch_size, image_shape, image_id)
        if i % 1000 == 0:
            args.logger.info('Processing... {:.2f}% '.format(i * args.per_batch_size / data_size * 100))

    args.logger.info('Calculating mAP...')
    detection.do_nms_for_results()
    result_file_path = detection.write_result()
    args.logger.info('result file path: {}'.format(result_file_path))
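
# A pure-Python sketch of the checkpoint key remapping used above: optimizer
# state ('moments.*') is dropped and the 'yolo_network.' prefix is stripped so
# the remaining keys match the bare backbone's parameter names. The toy dict
# stands in for a real load_checkpoint() result.
_ckpt_sketch = {
    'moments.conv1.weight': 'optimizer-state',     # dropped
    'yolo_network.conv1.weight': 'weight-tensor',  # renamed to 'conv1.weight'
    'global_step': 'step-tensor',                  # kept unchanged
}
_remapped = {}
for _key, _val in _ckpt_sketch.items():
    if _key.startswith('moments.'):
        continue
    if _key.startswith('yolo_network.'):
        _remapped[_key[len('yolo_network.'):]] = _val
    else:
        _remapped[_key] = _val
assert sorted(_remapped) == ['conv1.weight', 'global_step']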
Beispiel #21
0
if args_opt.platform == "Ascend":
    device_id = int(os.getenv('DEVICE_ID'))
    rank_id = int(os.getenv('RANK_ID'))
    rank_size = int(os.getenv('RANK_SIZE'))
    run_distribute = rank_size > 1
    device_id = int(os.getenv('DEVICE_ID'))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=device_id,
                        save_graphs=False)
elif args_opt.platform == "GPU":
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="GPU",
                        save_graphs=False)
    init("nccl")
    context.set_auto_parallel_context(device_num=get_group_size(),
                                      parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True)
else:
    raise ValueError("Unsupported device target.")


class CrossEntropyWithLabelSmooth(_Loss):
    """
    CrossEntropyWithLabelSmooth.

    Args:
        smooth_factor (float): Label-smoothing factor. Default: 0.
        num_classes (int): Number of classes.

    Returns:
Beispiel #22
0
 def __init__(self,
              vocab_size,
              embedding_size,
              param_init='normal',
              target='CPU',
              slice_mode='batch_slice',
              manual_shapes=None,
              max_norm=None):
     super(EmbeddingLookup, self).__init__()
     self.target = target
     if target not in ('CPU', 'DEVICE'):
          raise ValueError(
              "Attr 'target' of 'EmbeddingLookup' must be one of "
              "('CPU', 'DEVICE'), but got " + str(target) + '.')
     self.gatherv2 = P.GatherV2()
     self.embeddinglookup = P.EmbeddingLookup().add_prim_attr(
         'primitive_target', 'CPU')
     self.vocab_size = validator.check_value_type('vocab_size', vocab_size,
                                                  [int], self.cls_name)
     self.embedding_size = validator.check_value_type(
         'embedding_size', embedding_size, [int], self.cls_name)
     self.embedding_table = Parameter(initializer(
         param_init, [self.vocab_size, self.embedding_size]),
                                      name='embedding_table')
     parallel_mode = _get_parallel_mode()
     is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                          ParallelMode.AUTO_PARALLEL)
     if slice_mode == "field_slice" and is_auto_parallel:
         if not manual_shapes:
              raise ValueError(
                  "manual_shapes must not be None in field_slice mode")
         if not isinstance(manual_shapes, tuple):
              raise TypeError(
                  "manual_shapes must be a tuple of int, but got {}!".format(
                      type(manual_shapes)))
         for dim in manual_shapes:
             validator.check_positive_int(dim, 'manual shape dim',
                                          self.cls_name)
         self.gatherv2.add_prim_attr("manual_split", manual_shapes)
         self.embeddinglookup.add_prim_attr("manual_split", manual_shapes)
         self.gatherv2.shard(((get_group_size(), 1), (1, get_group_size())))
         self.embeddinglookup.shard(
             ((get_group_size(), 1), (1, get_group_size())))
     elif slice_mode == "table_row_slice" and is_auto_parallel:
         self.gatherv2.shard(((get_group_size(), 1), (1, 1)))
         self.embeddinglookup.shard(((get_group_size(), 1), (1, 1)))
     elif slice_mode == "table_column_slice" and is_auto_parallel:
         self.gatherv2.shard(((1, get_group_size()), (1, 1)))
         self.embeddinglookup.shard(((1, get_group_size()), (1, 1)))
     elif slice_mode == "batch_slice" and is_auto_parallel:
         self.gatherv2.shard(((1, 1), (get_group_size(), 1)))
         self.embeddinglookup.shard(((1, 1), (get_group_size(), 1)))
     else:
         if is_auto_parallel:
              raise ValueError(
                  "unsupported slice_mode in nn.EmbeddingLookup: " +
                  str(slice_mode))
     self.max_norm = max_norm
     if self.max_norm is not None:
         self.max_norm = validator.check_positive_float(
             self.max_norm, 'max_norm', self.cls_name)
         self.max_norm = Tensor(self.max_norm, dtype=mstype.float32)
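
# A minimal usage sketch for the EmbeddingLookup cell above, assuming a
# standard MindSpore install; on a single device slice_mode has no effect and
# the lookup simply gathers rows of the [vocab_size, embedding_size] table.
import numpy as np
import mindspore as ms
import mindspore.nn as nn

_lookup = nn.EmbeddingLookup(vocab_size=8, embedding_size=4,
                             target='CPU', slice_mode='batch_slice')
_ids = ms.Tensor(np.array([[1, 3], [5, 7]], dtype=np.int32))
print(_lookup(_ids).shape)  # (2, 2, 4): one embedding row per index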
Beispiel #23
0
def train():
    """Train function."""

    args.outputs_dir = params['save_model_path']

    if args.group_size > 1:
        init()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        args.outputs_dir = os.path.join(args.outputs_dir,
                                        "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

    if args.group_size > 1:
        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'],
                          vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset
    if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \
            and os.path.exists(args.maskpath_train):
        print('start create dataset')
    else:
        print('Error: json, image or mask train data path does not exist')
        return 0

    num_worker = 20 if args.group_size > 1 else 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # optimizer
    if params['group_params']:
        vgg19_base_params = list(
            filter(lambda x: 'base.vgg_base' in x.name,
                   train_net.trainable_params()))
        base_params = list(
            filter(lambda x: 'base.conv' in x.name,
                   train_net.trainable_params()))
        stages_params = list(
            filter(lambda x: 'base' not in x.name,
                   train_net.trainable_params()))

        group_params = [{
            'params': vgg19_base_params,
            'lr': lr_vgg
        }, {
            'params': base_params,
            'lr': lr_base
        }, {
            'params': stages_params,
            'lr': lr_stage
        }]

        if params['optimizer'] == "Momentum":
            opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(group_params)
        else:
            raise ValueError("optimizer not support.")
    else:
        if params['optimizer'] == "Momentum":
            opt = Momentum(train_net.trainable_params(),
                           learning_rate=lr_stage,
                           momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(train_net.trainable_params(), learning_rate=lr_stage)
        else:
            raise ValueError("optimizer not support.")

    # callback
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    if args.rank == 0:
        callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]
    else:
        callback_list = [MyLossMonitor(), time_cb]

    # train
    if params['train_type'] == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net,
                                                     opt,
                                                     sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    elif params['train_type'] == 'fix_loss_scale':
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Type {} is not support.".format(
            params['train_type']))

    print("============== Starting Training ==============")
    model.train(args.max_epoch,
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)
    return 0
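
# A toy illustration of the parameter grouping used in train(): names under
# 'base.vgg_base' take lr_vgg, other 'base.conv' layers take lr_base, and the
# remaining stage parameters take lr_stage. Plain strings stand in for the
# Parameter objects that trainable_params() would return.
_names = ['base.vgg_base.0.weight', 'base.conv1.weight', 'stage1.conv.weight']
_vgg = [n for n in _names if 'base.vgg_base' in n]
_base = [n for n in _names if 'base.conv' in n]
_stages = [n for n in _names if 'base' not in n]
assert (_vgg, _base, _stages) == (['base.vgg_base.0.weight'],
                                  ['base.conv1.weight'],
                                  ['stage1.conv.weight'])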
Beispiel #24
0
def _setup_parallel_env():
    context.reset_auto_parallel_context()
    MultiAscend.init()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      device_num=MultiAscend.get_group_size(),
                                      gradients_mean=True)
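
# Illustration only: with gradients_mean=True the gradient reducer averages
# (rather than sums) gradients across the data-parallel group, so the update
# matches single-device training on the combined batch. Pure numpy, no
# communication involved.
import numpy as np

_per_device_grads = [np.array([0.2, -0.4]), np.array([0.6, 0.0])]
_mean_grad = sum(_per_device_grads) / len(_per_device_grads)
print(_mean_grad)  # [ 0.4 -0.2]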
Beispiel #25
0
def create_dataset1(dataset_path,
                    do_train,
                    repeat_num=1,
                    batch_size=32,
                    target="Ascend",
                    distribute=False):
    """
    create a train or evaluate cifar10 dataset for resnet50
    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
        distribute(bool): data for distribute or not. Default: False

    Returns:
        dataset
    """
    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        if distribute:
            init()
            rank_id = get_rank()
            device_num = get_group_size()
        else:
            device_num = 1
    if device_num == 1:
        data_set = ds.Cifar10Dataset(dataset_path,
                                     num_parallel_workers=8,
                                     shuffle=True)
    else:
        data_set = ds.Cifar10Dataset(dataset_path,
                                     num_parallel_workers=8,
                                     shuffle=True,
                                     num_shards=device_num,
                                     shard_id=rank_id)

    # define map operations
    trans = []
    if do_train:
        trans += [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]

    trans += [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op,
                            input_columns="label",
                            num_parallel_workers=8)
    data_set = data_set.map(operations=trans,
                            input_columns="image",
                            num_parallel_workers=8)

    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
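
# A single-device usage sketch for create_dataset1; the dataset path is a
# hypothetical placeholder. With distribute=False the CIFAR-10 data is not
# sharded, and drop_remainder=True keeps every batch at a fixed size.
_train_ds = create_dataset1("/path/to/cifar10/train", do_train=True,
                            repeat_num=1, batch_size=32, target="GPU",
                            distribute=False)
print(_train_ds.get_dataset_size())  # number of batches per epoch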
Beispiel #26
0
def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    sparse = config.sparse
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(),
                              rank_size=get_group_size(),
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             rank_id=get_rank(),
                             rank_size=get_group_size(),
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size(),
                dataset_sink_mode=(not sparse))
Beispiel #27
0
    return args_opt


if __name__ == '__main__':
    args = parse_args()

    device_num = int(os.environ.get("DEVICE_NUM", 1))
    if args.is_distributed:
        if args.device_target == "Ascend":
            init()
            context.set_context(device_id=args.device_id)
        elif args.device_target == "GPU":
            init("nccl")

        args.rank = get_rank()
        args.group_size = get_group_size()
        device_num = args.group_size
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          parameter_broadcast=True, mirror_mean=True)
    else:
        context.set_context(device_id=args.device_id)
    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

    # select for master rank save ckpt or all rank save, compatible for model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1
Beispiel #28
0
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None,
                 device_num_each_group=1,
                 input_dims='2d',
                 data_format='NCHW'):
        super(_BatchNorm, self).__init__()
        if num_features < 1:
            raise ValueError("num_features must be at least 1")

        if momentum < 0 or momentum > 1:
            raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))
        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
        if context.get_context("device_target") != "GPU" and self.format == "NHWC":
            raise ValueError("NHWC format only support in GPU target.")
        self.use_batch_statistics = use_batch_statistics
        self.num_features = num_features
        self.eps = eps
        self.input_dims = input_dims
        self.moving_mean = Parameter(initializer(
            moving_mean_init, num_features), name="mean", requires_grad=False)
        self.moving_variance = Parameter(initializer(
            moving_var_init, num_features), name="variance", requires_grad=False)
        self.gamma = Parameter(initializer(
            gamma_init, num_features), name="gamma", requires_grad=affine)
        self.beta = Parameter(initializer(
            beta_init, num_features), name="beta", requires_grad=affine)
        self.group = validator.check_positive_int(device_num_each_group)
        self.is_global = False
        if self.group != 1:
            self.rank_id = get_rank()
            self.rank_size = get_group_size()
            self.device_list = [i for i in range(0, self.rank_size)]
            self.rank_list = self.list_group(self.device_list, self.group)
            self.rank_list_idx = len(self.rank_list)
            for i in range(self.rank_list_idx):
                if self.rank_id in self.rank_list[i] and self.group != 1:
                    self.is_global = True
                    management.create_group('group' + str(i), self.rank_list[i])
                    self.all_reduce = P.AllReduce(P.ReduceOp.SUM, 'group' + str(i)).add_prim_attr('fusion', 1)
        self.shape = P.Shape()
        self.reduce_mean = P.ReduceMean(keep_dims=True)
        self.square = P.Square()
        self.sqrt = P.Sqrt()
        self.cast = P.Cast()
        self.dtype = P.DType()
        self.reshape = P.Reshape()
        self.is_ascend = context.get_context("device_target") == "Ascend"
        self.is_gpu = context.get_context("device_target") == "GPU"
        self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
        self.momentum = 1.0 - momentum
        if context.get_context("enable_ge"):
            self.is_ge_backend = True
        else:
            self.is_ge_backend = False

        if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
            self.bn_train = P.BatchNorm(is_training=True,
                                        epsilon=self.eps)
        elif self.is_gpu:
            self.bn_train = P.FusedBatchNormEx(mode=1,
                                               epsilon=self.eps,
                                               momentum=self.momentum,
                                               data_format=self.format)
        else:
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
        self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps, data_format=self.format)
        self.enable_global_sync = self.is_global and (self.is_ge_backend or (self.is_graph_mode and self.is_ascend))
        self.enable_default_train = self.is_graph_mode and not self.is_global and \
                                    (self.is_ge_backend or self.is_ascend)

        data_parallel_strategy = ((1,), (1,))
        data_parallel_strategy_one = ((1,), ())
        self.sub_mean = P.Sub().shard(data_parallel_strategy)
        self.sub_var = P.Sub().shard(data_parallel_strategy)
        self.mul_mean = P.Mul().shard(data_parallel_strategy_one)
        self.mul_var = P.Mul().shard(data_parallel_strategy_one)
        self.assign_sub_mean = P.AssignSub().shard(data_parallel_strategy)
        self.assign_sub_var = P.AssignSub().shard(data_parallel_strategy)
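
# A sketch of the device grouping above, assuming list_group splits the
# device list into consecutive chunks of `group` ranks (one AllReduce
# communication group per chunk), which is what the per-group 'group<i>'
# creation relies on. The exact list_group implementation is not shown in
# this snippet, so this is an assumption.
_device_list = list(range(8))   # rank_size = 8
_group = 4                      # device_num_each_group
_rank_list = [_device_list[i:i + _group]
              for i in range(0, len(_device_list), _group)]
print(_rank_list)  # [[0, 1, 2, 3], [4, 5, 6, 7]]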
Beispiel #29
0
def parse_args(cloud_args=None):
    """parameters"""
    parser = argparse.ArgumentParser('mindspore classification training')
    parser.add_argument('--platform',
                        type=str,
                        default='Ascend',
                        choices=('Ascend', 'GPU'),
                        help='run platform')

    # dataset related
    parser.add_argument('--data_dir',
                        type=str,
                        default='',
                        help='train data dir')
    parser.add_argument('--per_batch_size',
                        default=128,
                        type=int,
                        help='batch size for per gpu')
    # network related
    parser.add_argument('--pretrained',
                        default='',
                        type=str,
                        help='model_path, local pretrained model to load')

    # distributed related
    parser.add_argument('--is_distributed',
                        type=int,
                        default=1,
                        help='if multi device')
    # roma obs
    parser.add_argument('--train_url', type=str, default="", help='train url')

    args, _ = parser.parse_known_args()
    args = merge_args(args, cloud_args)
    args.image_size = config.image_size
    args.num_classes = config.num_classes
    args.lr = config.lr
    args.lr_scheduler = config.lr_scheduler
    args.lr_epochs = config.lr_epochs
    args.lr_gamma = config.lr_gamma
    args.eta_min = config.eta_min
    args.T_max = config.T_max
    args.max_epoch = config.max_epoch
    args.backbone = config.backbone
    args.warmup_epochs = config.warmup_epochs
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.is_dynamic_loss_scale = config.is_dynamic_loss_scale
    args.loss_scale = config.loss_scale
    args.label_smooth = config.label_smooth
    args.label_smooth_factor = config.label_smooth_factor
    args.ckpt_interval = config.ckpt_interval
    args.ckpt_save_max = config.ckpt_save_max
    args.ckpt_path = config.ckpt_path
    args.is_save_on_master = config.is_save_on_master
    args.rank = config.rank
    args.group_size = config.group_size
    args.lr_epochs = list(map(int, args.lr_epochs.split(',')))
    args.image_size = list(map(int, args.image_size.split(',')))

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scaling, a fixed loss scale must not be set in the optimizer

    # select for master rank save ckpt or all rank save, compatible for model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)
    return args
Beispiel #30
0
 def __init__(self,
              vocab_size,
              embedding_size,
              param_init='normal',
              target='CPU',
              slice_mode='batch_slice',
              manual_shapes=None,
              max_norm=None,
              sparse=True,
              vocab_cache_size=0):
     super(EmbeddingLookup, self).__init__()
     validator.check_value_type('sparse', sparse, [bool], self.cls_name)
     self.vocab_size = validator.check_positive_int(vocab_size,
                                                    'vocab_size')
     self.vocab_cache_size = validator.check_non_negative_int(
         vocab_cache_size, 'vocab_cache_size')
     self.target = target
     self.sparse = sparse
     self.cache_enable = self.vocab_cache_size > 0
     self.forward_unique = False
     if target not in ('CPU', 'DEVICE'):
          raise ValueError(
              "Attr 'target' of 'EmbeddingLookup' must be one of "
              "('CPU', 'DEVICE'), but got " + str(target) + '.')
     if not sparse and target == 'CPU':
         raise ValueError(
             'When target is CPU, embedding_lookup must be sparse.')
     if sparse:
         self.gatherv2 = P.SparseGatherV2()
     else:
         self.gatherv2 = P.Gather()
     self.embeddinglookup = P.EmbeddingLookup().add_prim_attr(
         'primitive_target', 'CPU')
     enable_ps = _get_ps_context("enable_ps")
     if enable_ps:
         self._process_vocab_cache(slice_mode)
     self.embedding_size = validator.check_positive_int(
         embedding_size, 'embedding_size')
     self.embedding_table = Parameter(initializer(
         param_init, [self.vocab_size, self.embedding_size]),
                                      name='embedding_table')
     parallel_mode = _get_parallel_mode()
     is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                          ParallelMode.AUTO_PARALLEL)
     self.gather_revert = P.Gather()
     self.reshape_first = P.Reshape()
     self.reshape = P.Reshape()
     self.unique = P.Unique()
     self.shape = P.Shape()
     if is_auto_parallel:
         self.unique = P.Unique().shard(((1, ), ))
     if self.cache_enable and enable_ps:
         self._set_voacb_cache_enable_for_ps(vocab_cache_size,
                                             embedding_size, vocab_size)
         if is_auto_parallel:
             self.unique.add_prim_attr('cache_enable', True)
     indices_shape_size = 2
     if slice_mode == "field_slice" and is_auto_parallel:
         if not manual_shapes:
              raise ValueError(
                  "manual_shapes must not be None in field_slice mode")
         if not isinstance(manual_shapes, tuple):
              raise TypeError(
                  "manual_shapes must be a tuple of int, but got {}!".format(
                      type(manual_shapes)))
         for dim in manual_shapes:
             validator.check_positive_int(dim, 'manual shape dim',
                                          self.cls_name)
         self.gatherv2.add_prim_attr("manual_split", manual_shapes)
         self.embeddinglookup.add_prim_attr("manual_split", manual_shapes)
         self.gatherv2.shard(((get_group_size(), 1), (1, get_group_size())))
         self.embeddinglookup.shard(
             ((get_group_size(), 1), (1, get_group_size())))
     elif slice_mode == "table_row_slice" and is_auto_parallel:
         full_batch = _get_full_batch()
         if (target == 'DEVICE'
                 and not full_batch) or (self.cache_enable and enable_ps
                                         and sparse):
             indices_shape_size = 1
             self.gather_revert.shard(((1, 1), (get_group_size(), )))
             self.forward_unique = True
         indices_strategy = (1, ) * indices_shape_size
         self.gatherv2.shard(((get_group_size(), 1), indices_strategy))
         self.embeddinglookup.shard(
             ((get_group_size(), 1), indices_strategy))
     elif slice_mode == "table_column_slice" and is_auto_parallel:
         if target == 'DEVICE':
             indices_shape_size = 1
             self.gather_revert.shard(((1, get_group_size()), (1, )))
             self.forward_unique = True
         indices_strategy = (1, ) * indices_shape_size
         self.gatherv2.shard(((1, get_group_size()), indices_strategy))
         self.embeddinglookup.shard(
             ((1, get_group_size()), indices_strategy))
     elif slice_mode == "batch_slice" and is_auto_parallel:
         indices_strategy = [get_group_size()]
         indices_strategy.extend([1] * (indices_shape_size - 1))
         indices_strategy = tuple(indices_strategy)
         self.gatherv2.shard(((1, 1), indices_strategy))
         self.embeddinglookup.shard(((1, 1), indices_strategy))
     else:
         if is_auto_parallel:
              raise ValueError(
                  "unsupported slice_mode in nn.EmbeddingLookup: " +
                  str(slice_mode))
     if self.cache_enable and not enable_ps:
         if parallel_mode != ParallelMode.STAND_ALONE:
              raise ValueError(
                  "parallel mode does not support cache enable yet.")
         self._set_cache_enable()
     self.embedding_table.unique = self.forward_unique
     self.max_norm = max_norm
     if self.max_norm is not None:
         self.max_norm = validator.check_positive_float(
             self.max_norm, 'max_norm', self.cls_name)
         self.max_norm = Tensor(self.max_norm, dtype=mstype.float32)
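
# A pure-numpy sketch of the forward_unique path above: flatten the indices,
# deduplicate them, gather each unique row once, then restore the original
# order and shape (the gather_revert step). This mirrors the dataflow, not
# the actual sharded MindSpore graph.
import numpy as np

_table = np.arange(20, dtype=np.float32).reshape(10, 2)   # [vocab, emb]
_indices = np.array([[3, 1], [3, 7]])
_flat = _indices.reshape(-1)                              # reshape_first
_uniq, _revert = np.unique(_flat, return_inverse=True)    # P.Unique analogue
_gathered = _table[_uniq]                                 # one gather per unique id
_out = _gathered[_revert].reshape(_indices.shape + (2,))  # gather_revert
assert np.array_equal(_out, _table[_indices])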