Example #1
import pytest
from mindspore.ops import operations as P


def test_control_depend_check():
    # depend_mode must be an int, so a float is rejected
    with pytest.raises(TypeError):
        depend = P.ControlDepend(0.0)
    # only the values 0 and 1 are accepted
    with pytest.raises(ValueError):
        depend = P.ControlDepend(2)
    # a tuple is not a valid depend_mode either
    with pytest.raises(TypeError):
        depend = P.ControlDepend((2,))
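
The rejected values above imply the accepted ones: depend_mode must be the int 0 or 1. A minimal sketch of the valid constructions, assuming the pre-1.2 MindSpore API in which ControlDepend had not yet been replaced by Depend:

def build_control_depends():
    depend_output = P.ControlDepend()             # depend_mode defaults to 0: depend via the output
    depend_user = P.ControlDepend(depend_mode=1)  # 1 extends the dependency to users of a Parameter input
    return depend_output, depend_user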
Example #2
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertSquadCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = None
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("mirror_mean")
         degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
Example #3
 def __init__(self, in_channel, x):
     super().__init__()
     # self._save_graphs(save_graph_flag=True, save_graph_path=".")
     self.biasadd = P.BiasAdd()
     self.equal = P.Equal()
     self.addn = P.AddN()
     self.conv = Conv2d(in_channels=in_channel,
                        out_channels=in_channel,
                        kernel_size=1,
                        stride=1,
                        has_bias=False,
                        weight_init='ones',
                        pad_mode='same')
     self.bn = BatchNorm2d(num_features=in_channel)
     self.controldepend = P.ControlDepend()
     self.assignadd = P.AssignAdd()
     self.assign = P.Assign()
     self.relu = ReLU()
     self.mean = P.ReduceMean(keep_dims=False)
     self.bias = Parameter(Tensor(
         np.random.randint(2, size=(3,)).astype(np.float32)),
                           name="bias")
     self.bias2 = Parameter(Tensor(np.ones([3]).astype(np.float32)),
                            name="bias2")
     self.parameterupdate = ParameterUpdate(self.bias)
     self.value = Tensor(np.random.randn(3), ms.float32)
     self.x = x
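
This cell collects side-effecting ops (Assign, AssignAdd, ParameterUpdate) next to ControlDepend, the combination used to force a write to land before a read. A hypothetical construct for this cell, not the original (F here is mindspore.ops.functional):

 def construct(self, x):
     written = self.assign(self.bias, self.value)  # side-effecting write to the bias parameter
     out = self.biasadd(x, self.bias)              # read that must observe the write
     dep = self.controldepend(written, out)        # order the write before the read
     return F.depend(out, dep)                     # keep the dependency edge in the graph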
Example #4
 def __init__(self, network, optimizer, scale_update_cell=None, enable_global_norm=False):
     super(GPTTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.enable_global_norm = enable_global_norm
     self.grad = C.GradOperation(get_by_list=True,
                                 sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
Example #5
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       False, self.degree)
        self.clip_type = gradient_cfg.clip_type
        self.clip_value = gradient_cfg.clip_value
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))

        self.saved_params = self.weights.clone(prefix='saved')
        self.length = len(self.weights)
        self.quant_embedding_list = []
        self.quant_weight_list = []
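        # Record which saved parameters must be re-quantized every step:
        # embedding tables, and every dense weight except the dense_1 head.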
        for i, key in enumerate(self.saved_params):
            if 'embedding_lookup' in key.name:
                self.quant_embedding_list.append(i)
            elif 'weight' in key.name and 'dense_1' not in key.name:
                self.quant_weight_list.append(i)
        self.quant_embedding_list_length = len(self.quant_embedding_list)
        self.quant_weight_list_length = len(self.quant_weight_list)

        self.quantize_embedding = QuantizeWeightCell(
            num_bits=network.embedding_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
        self.quantize_weight = QuantizeWeightCell(
            num_bits=network.weight_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
Example #6
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TransformerTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.all_reduce = P.AllReduce()

        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
        self.add_flags(has_effect=True)
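
The GPU branch exists because the NPU float-status primitives are Ascend-only; on GPU, overflow is detected by reducing FloatStatus over the gradients instead. A sketch of the corresponding check in construct, assuming the usual body for this cell (_grad_overflow is its per-gradient overflow helper):

        if not self.gpu_target:
            self.get_status(init)                    # write inf/nan flags into the status buffer
            flag_sum = self.reduce_sum(init, (0,))   # non-zero means an overflow occurred
        else:
            flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
            flag_sum = self.addn(flag_sum)
            flag_sum = self.reshape(flag_sum, (()))  # reduce to a scalar flag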
Example #7
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.grad_reducer = F.identity
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
Example #8

from mindspore.ops import Primitive
from mindspore.ops import operations as P

all_reduce = P.AllReduce()
broadcast = P.Broadcast(1)
memcpy_async = Primitive('memcpy_async')
make_tuple = Primitive('make_tuple')
tuple_getitem = Primitive('tuple_getitem')
apply_momentum = P.ApplyMomentum()
control_depend = P.ControlDepend()
relu = P.ReLU()


class FnDict:
    def __init__(self):
        self.fnDict = {}

    def __call__(self, fn):
        self.fnDict[fn.__name__] = fn

    def __getitem__(self, name):
        return self.fnDict[name]
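
FnDict registers each decorated function under its __name__, so a pass test can build several candidate graphs and select one by tag. A hypothetical usage sketch:

def fn_dict_usage():
    fns = FnDict()

    @fns
    def before(x):
        return relu(x)

    # the decorator returns None, but the function stays retrievable by name
    return fns['before']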


def test_insert_memcpy_async_for_hccl_op_cond1(tag):
Example #9
 def __init__(self):
     super(Net2, self).__init__()
     self.relu1 = P.ReLU()
     self.relu2 = P.ReLU().add_prim_attr("primitive_target", "CPU")
     self.mul = P.Mul()
     self.control = P.ControlDepend()
Example #10
 def __init__(self):
     super(Net1, self).__init__()
     self.relu1 = P.ReLU()
     self.relu2 = P.ReLU()
     self.mul = P.Mul()
     self.control = P.ControlDepend()
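
Both nets exercise ControlDepend between otherwise independent ops; in Net2, relu2 is pinned to the CPU, so the dependency crosses device targets. A hypothetical construct consistent with these attributes (F is mindspore.ops.functional):

 def construct(self, x, y):
     a = self.relu1(x)
     b = self.relu2(y)
     out = self.mul(a, b)
     dep = self.control(a, b)   # force relu1 to execute before relu2
     return F.depend(out, dep)  # keep the dependency node alive in the graph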