def record_point_info(self, point_info, output_path):
    """
    Write the forward-start / backward-end point names to a JSON file.

    Args:
        point_info (dict): The point info about tag id and relative op name.
        output_path (str): The output path for saving point info.

    Returns:
        dict, with keys 'fp_start' and 'bp_end'; a tag missing from
        `point_info` is recorded as an empty string.

    Raises:
        ProfilerIOException: If the file cannot be written or its mode
            cannot be changed.
    """
    parsed = {}
    parsed['fp_start'] = point_info.get(self._fp_tag, '')
    parsed['bp_end'] = point_info.get(self._bp_tag, '')

    try:
        with open(output_path, 'w') as handle:
            json.dump(parsed, handle)
        # Leave the result read-only once it has been written.
        os.chmod(output_path, stat.S_IREAD)
    except (IOError, OSError) as err:
        log.warning('Failed to save point info. %s', err)
        raise ProfilerIOException

    return parsed
Exemple #2
0
    def _load_inputs(self, *inputs):
        """
        Slice inputs tensors by parallel strategies.

        Args:
            inputs (Function or Cell): inputs of construct method.
        """
        parallel_inputs_run = []
        if len(inputs) > self._construct_inputs_num:
            raise ValueError(
                'Len of inputs: {} is bigger than self._construct_inputs_num: {}.'
                .format(len(inputs), self._construct_inputs_num))
        for i, tensor in enumerate(inputs):
            key = self._construct_inputs_names[i]
            # if input is not used, self.parameter_layout_dict may not contain the key
            if key not in self.parameter_layout_dict:
                logger.warning("layout dict does not contain the key %s", key)
                parallel_inputs_run.append(tensor)
            else:
                layout = self.parameter_layout_dict[key]
                new_tensor = _load_tensor_by_layout(tensor, layout)
                parallel_inputs_run.append(new_tensor)
        return tuple(parallel_inputs_run)
def _fill_scalar_summary(tag: str, np_value, summary):
    """
    Store a scalar value on the summary message.

    Args:
        tag (str): Tag describing the summary.
        np_value (Object): Value exposing `size`, `item()`, `flat`, `ndim` and `shape`
            (presumably a numpy array; confirm against callers).
        summary: Object whose `scalar_value` attribute receives the value.

    Returns:
        bool, True when a value was written to `summary`, False when `np_value`
        contains no element.
    """
    logger.debug(f"Set({tag}) the scalar summary value")

    if np_value.size == 1:
        # is scalar
        scalar = np_value.item()
    elif np_value.size > 1:
        # More than one element: warn and keep only the first one.
        logger.warning(
            f"The tensor is not a single scalar, tag = {tag}, ndim = {np_value.ndim}, shape = {np_value.shape}")
        scalar = next(np_value.flat).item()
    else:
        logger.error(f"There no values inside tensor, tag = {tag}, size = {np_value.size}")
        return False

    summary.scalar_value = scalar
    return True
def _load_dismatch_prefix_params(net, parameter_dict, param_not_load):
    """
    When some net parameter did not load, try to continue load.

    For a not-yet-loaded parameter name, search `parameter_dict` for a key that ends with
    that name and take the leading part of the key as a prefix. Every parameter of `net`
    whose prefixed name exists in `parameter_dict` is then updated through `_update_param`
    and removed from `param_not_load`. The search repeats until a round finds no prefix or
    `param_not_load` becomes empty.

    Args:
        net: Network object providing `parameters_and_names()`.
        parameter_dict (dict): Loaded parameters keyed by name.
        param_not_load (list): Names of the parameters that were not loaded. Index 0 is read
            unconditionally, so it must not be empty; it is modified in place.
    """
    prefix_name = ""
    # Sentinel: a search round found nothing when prefix_name still equals this value afterwards.
    longest_name = param_not_load[0]
    while prefix_name != longest_name and param_not_load:
        logger.debug("Count: {} parameters has not been loaded, try to load continue.".format(len(param_not_load)))
        prefix_name = longest_name
        for net_param_name in param_not_load:
            for dict_name in parameter_dict:
                if dict_name.endswith(net_param_name):
                    # Everything in front of the matching suffix is the candidate prefix.
                    prefix_name = dict_name[:-len(net_param_name)]
                    break
            # The inner break only leaves the dict scan; stop the outer scan too once a prefix is found.
            if prefix_name != longest_name:
                break

        if prefix_name != longest_name:
            logger.warning("Remove parameter prefix name: {}, continue to load.".format(prefix_name))
            for _, param in net.parameters_and_names():
                new_param_name = prefix_name + param.name
                if param.name in param_not_load and new_param_name in parameter_dict:
                    new_param = parameter_dict[new_param_name]
                    _update_param(param, new_param)
                    param_not_load.remove(param.name)
Exemple #5
0
    def set_category_field(self, category_field):
        """
        Select the category field used for reading (deprecated entry point).

        Note:
            The field must be one of the candidate category fields.

        Args:
            category_field (str): String of category field name.

        Returns:
            The value returned by the underlying segment's `set_category_field`
            (MSRStatus, SUCCESS or FAILED).

        Raises:
            ParamTypeError: If `category_field` is empty or not a str.
            MRMDefineCategoryError: If `category_field` is not a candidate field.
        """
        logger.warning(
            "WARN_DEPRECATED: The usage of set_category_field is deprecated."
            " Please use category_field")

        if not (category_field and isinstance(category_field, str)):
            raise ParamTypeError('category_fields', 'str')

        is_candidate = category_field in self._candidate_fields
        if not is_candidate:
            raise MRMDefineCategoryError(
                f"Field '{category_field}' is not a candidate category field.")

        status = self._segment.set_category_field(category_field)
        return status
Exemple #6
0
def _check_target_specific_cfgs(device, arg_key):
    """Checking whether a config is sutable for a specified device"""
    device_cfgs = {
        'enable_auto_mixed_precision': ['Ascend'],
        'enable_dump': ['Ascend'],
        'save_dump_path': ['Ascend'],
        'enable_graph_kernel': ['Ascend', 'GPU'],
        'enable_reduce_precision': ['Ascend'],
        'enable_profiling': ['Ascend'],
        'profiling_options': ['Ascend'],
        'print_file_path': ['Ascend'],
        'variable_memory_max_size': ['Ascend'],
        'max_device_memory': ['GPU']
    }
    # configs not in map device_cfgs are supposed to be suitable for all devices
    if not arg_key in device_cfgs:
        return True
    supported_devices = device_cfgs[arg_key]
    if device in supported_devices:
        return True
    logger.warning(f"Config '{arg_key}' only supports devices in {supported_devices}, current device is '{device}'"
                   ", ignore it.")
    return False
    def parse(self):
        """
        Load the minddata pipeline file and hand its content to `_parse_and_save`.

        Raises:
            ProfilerRawFileException: If fails to parse the raw file of
                minddata pipeline or the file is empty.
        """
        with open(self._pipeline_path, 'r') as pipeline_file:
            try:
                pipeline_info = json.loads(pipeline_file.read())
            except (json.JSONDecodeError, TypeError) as err:
                logger.warning(err)
                raise ProfilerRawFileException(
                    'Fail to parse minddata pipeline file.'
                )

        if pipeline_info:
            self._parse_and_save(pipeline_info)
            return

        # The file decoded to an empty/falsy value: nothing to parse.
        logger.warning('The minddata pipeline file is empty.')
        raise ProfilerRawFileException(
            'The minddata pipeline file is empty.'
        )
Exemple #8
0
    def _parse(self, source_files):
        """
        Parse source step trace files.

        Each file is read in binary mode and every step trace produced by
        `_get_next_step_trace` is recorded, except the first one when
        `self._skip_first_step` is set (the flag is cleared after skipping).
        The averaged info is recorded once all files are processed.

        Args:
            source_files (Iterable[str]): Paths of the step trace files.

        Raises:
            ProfilerIOException: If an IOError/OSError occurs while a file is processed.
        """
        log.info("Start to parse step trace file.")
        event_info = {}

        self._get_step_end_tag_id(source_files)

        for source_file in source_files:
            source_file = validate_and_normalize_path(source_file)
            try:
                with open(source_file, 'rb') as handler:
                    content = handler.read()
                    for step_trace in self._get_next_step_trace(content, event_info):
                        if self._skip_first_step:
                            self._skip_first_step = False
                            continue
                        self._record_trace_event(step_trace)
            except (IOError, OSError) as err:
                # Lazy %-formatting with one placeholder per argument; passing `err` without
                # a placeholder makes the logging module fail to format the record.
                log.warning('Failed to read %s. %s', source_file, err)
                raise ProfilerIOException from err

        self._record_average_info()
        log.info("Finish to parse step trace file.")
Exemple #9
0
    def _preprocess_single_lr(self, learning_rate):
        """
        Check lr value, and convert lr to a float, a Tensor or a LearningRateSchedule.

        A float/int or a 0-dim Tensor is returned as a fixed learning rate. Any other
        input sets `self.dynamic_lr` to True before it is converted or validated.

        Args:
            learning_rate (Union[float, int, Tensor, Iterable, LearningRateSchedule]): The
                learning rate to check.

        Returns:
            float, Tensor or LearningRateSchedule, the processed learning rate.

        Raises:
            ValueError: If `learning_rate` is a Tensor with more than one dimension.
            TypeError: If `learning_rate` is of none of the supported types.
        """
        if isinstance(learning_rate, (float, int)):
            learning_rate = float(learning_rate)
            validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
            return learning_rate
        if isinstance(learning_rate, Tensor) and learning_rate.dim() == 0:
            return learning_rate

        # Reaching this point means the learning rate is not a single fixed value.
        self.dynamic_lr = True
        # Iterable is tested before Tensor on purpose: keep this order.
        if isinstance(learning_rate, Iterable):
            return Tensor(np.array(list(learning_rate)).astype(np.float32))
        if isinstance(learning_rate, Tensor):
            if learning_rate.dim() > 1:
                # Trailing spaces matter: adjacent literals are joined without a separator.
                raise ValueError("The dim of `Tensor` type Learning rate should be a 0 or 1, "
                                 f"but got {learning_rate.dim()}.")
            if learning_rate.dim() == 1 and learning_rate.size() < 2:
                logger.warning("If use `Tensor` type dynamic learning rate, please make sure that the number "
                               "of elements in the tensor passed is greater than 1.")
            return learning_rate
        if isinstance(learning_rate, LearningRateSchedule):
            return learning_rate
        raise TypeError("Learning rate should be int, float, Tensor, Iterable or LearningRateSchedule.")
    def record_point_info(self, source_file, output_path):
        """
        Record point info into json.

        The first whitespace-separated token of the first line of `source_file` is used
        as 'fp_start' and that of the second line as 'bp_end'. 'bp_end' is only included
        in the result when `self._is_training_mode` is true. If `output_path` already
        exists it is left untouched and only the parsed points are returned.

        Args:
            source_file (str): The file path of step trace original data.
            output_path (str): The output path for saving point info.

        Returns:
            dict, parsed point info.

        Raises:
            ProfilerIOException: If `source_file` cannot be read or `output_path`
                cannot be written.
        """
        fp_start, bp_end = 0, 1
        try:
            with open(source_file, 'r') as f:
                lines = f.readlines()
                fp_start_name = lines[fp_start].split()[0]
                bp_end_name = lines[bp_end].split()[0]
        except (IOError, OSError) as err:
            # Lazy %-formatting with one placeholder per argument; passing `err` without
            # a placeholder makes the logging module fail to format the record.
            log.warning('Failed to read %s. %s', source_file, err)
            raise ProfilerIOException from err

        if self._is_training_mode:
            points = {'fp_start': fp_start_name, 'bp_end': bp_end_name}
        else:
            points = {
                'fp_start': fp_start_name,
            }
        # An existing output file is never overwritten.
        if os.path.exists(output_path):
            return points
        try:
            with open(output_path, 'w') as json_file:
                json.dump(points, json_file)
            # Restrict the saved file to owner read-only.
            os.chmod(output_path, stat.S_IRUSR)
        except (IOError, OSError) as err:
            log.warning('Failed to save point info. %s', err)
            raise ProfilerIOException from err
        return points
Exemple #11
0
def test_tfrecord_to_mindrecord_with_special_field_name():
    """
    Test transforming a tfrecord file with special field names to mindrecord.

    The test returns early (with a warning) when tensorflow is unavailable or older
    than `SupportedTensorFlowVersion`. Generated files are removed even when an
    assertion fails, so a failed run does not leave artifacts behind.
    """
    # NOTE(review): if both operands are plain strings this comparison is lexicographic
    # (e.g. "1.9" > "1.13"); confirm the type of SupportedTensorFlowVersion.
    if not tf or tf.__version__ < SupportedTensorFlowVersion:
        # skip the test
        logger.warning(
            "Module tensorflow is not found or version wrong, "
            "please use pip install it / reinstall version >= {}.".format(
                SupportedTensorFlowVersion))
        return

    tfrecord_path = os.path.join(TFRECORD_DATA_DIR, TFRECORD_FILE_NAME)
    mindrecord_paths = [MINDRECORD_FILE_NAME, MINDRECORD_FILE_NAME + ".db"]

    def _remove_if_exists(paths):
        """Delete every path in `paths` that currently exists."""
        for path in paths:
            if os.path.exists(path):
                os.remove(path)

    try:
        generate_tfrecord_with_special_field_name()
        assert os.path.exists(tfrecord_path)

        feature_dict = {
            "image/class/label": tf.io.FixedLenFeature([], tf.int64),
            "image/encoded": tf.io.FixedLenFeature([], tf.string),
        }

        # Drop stale outputs of a previous run before transforming.
        _remove_if_exists(mindrecord_paths)

        tfrecord_transformer = TFRecordToMR(
            tfrecord_path, MINDRECORD_FILE_NAME, feature_dict, ["image/encoded"])
        tfrecord_transformer.transform()

        assert os.path.exists(MINDRECORD_FILE_NAME)
        assert os.path.exists(MINDRECORD_FILE_NAME + ".db")

        fr_mindrecord = FileReader(MINDRECORD_FILE_NAME)
        verify_data(tfrecord_transformer, fr_mindrecord)
    finally:
        _remove_if_exists(mindrecord_paths + [tfrecord_path])
 def _save(self):
     """
     Save step trace file as CSV.

     Nothing is written when `self._header` is empty. In non-training mode the
     second-to-last header column is renamed to 'fp', the column at index 5 and the
     last column are dropped, and in every row the last value is added to the
     second-to-last one before the same columns are dropped. The saved file is made
     owner read-only.

     Note:
         In non-training mode `self._header` is replaced and the rows of
         `self._result` are modified in place.

     Raises:
         ProfilerIOException: If the file cannot be written or its mode changed.
     """
     BP_POINT, TAIL, FP_DURATION = 5, -1, -2
     log.info("Start to save step trace file.")
     if not self._header:
         return
     try:
         # newline='' lets the csv writer control line endings itself; without it
         # rows end in '\r\r\n' on platforms that translate '\n' on write.
         with open(self._output_path, 'w', newline='') as file_handle:
             csv_writer = csv.writer(file_handle)
             if not self._is_training_mode:
                 self._header[FP_DURATION] = 'fp'
                 self._header = self._header[:BP_POINT] + self._header[
                     BP_POINT + 1:TAIL]
             csv_writer.writerow(self._header)
             for row_data in self._result:
                 if not self._is_training_mode:
                     row_data[FP_DURATION] += row_data[TAIL]
                     row_data = row_data[:BP_POINT] + row_data[BP_POINT +
                                                               1:TAIL]
                 csv_writer.writerow(row_data)
         os.chmod(self._output_path, stat.S_IRUSR)
     except (IOError, OSError) as err:
         log.warning('Failed to save step trace raw info. %s', err)
         raise ProfilerIOException from err
Exemple #13
0
 def _get_single_lr(self, learning_rate):
     """
     Convert a learning rate to Tensor type.

     Args:
         learning_rate (Union[float, Iterable, Tensor]): The learning rate to convert.

     Returns:
         Tensor, a float32 Tensor built from a float or an Iterable, or the given
         Tensor itself.

     Raises:
         ValueError: If `learning_rate` is a Tensor with more than one dimension.
         TypeError: If `learning_rate` is neither float, Iterable nor Tensor.
     """
     if isinstance(learning_rate, float):
         validator.check_number_range("learning rate", learning_rate, 0.0,
                                      float("inf"), Rel.INC_LEFT,
                                      self.cls_name)
         return Tensor(learning_rate, mstype.float32)

     # Iterable is tested before Tensor, as the type dispatch depends on this order.
     if isinstance(learning_rate, Iterable):
         values = np.array(list(learning_rate))
         return Tensor(values.astype(np.float32))

     if not isinstance(learning_rate, Tensor):
         raise TypeError(
             "Learning rate should be float, Tensor or Iterable.")

     if learning_rate.dim() > 1:
         raise ValueError(
             "Learning rate should be a 0 or 1 dim `Tensor`,"
             f"but got {learning_rate.dim()}.")
     if learning_rate.dim() == 1 and learning_rate.size() < 2:
         logger.warning(
             "If want to use the dynamic learning rate, please make sure that the number "
             "of elements in the list, tuple or tensor passed is greater than 1."
         )
     return learning_rate
Exemple #14
0
    def calc_faithfulness(self, inputs: _Array, model: _Module,
                          targets: _Label, saliency: _Array) -> np.ndarray:
        """
        Calculate naive faithfulness.

        The correlation between the feature importance derived from the saliency map
        and the model predictions on the perturbed inputs is rescaled with
        `(correlation + 1) / 2`. For an all-zero saliency map the correlation is taken
        as 0, which gives a score of 0.5.

        Args:
            inputs (_Array): sample to calculate faithfulness score
            model (_Module): model to explanation
            targets (_Label): label to explanation on.
            saliency (_Array): Saliency map of given inputs and targets from the
                explainer.

        Return:
            - faithfulness (np.ndarray): one-element float64 array holding the
              normalized faithfulness score

        """
        if not np.count_nonzero(saliency):
            log.warning(
                "The saliency map is zero everywhere. The correlation will be set to zero."
            )
            correlation = 0
            normalized_faithfulness = (correlation + 1) / 2
            # `np.float` (an alias of the builtin float) was removed from NumPy;
            # np.float64 yields the same dtype.
            return np.array([normalized_faithfulness], np.float64)
        reference = self._get_reference(inputs)
        perturbations, masks = self._perturb(inputs,
                                             saliency,
                                             reference,
                                             return_mask=True)
        feature_importance = _calc_feature_importance(saliency, masks)

        perturbations = ms.Tensor(perturbations, dtype=ms.float32)
        # Keep only the prediction column(s) selected by `targets`.
        predictions = model(perturbations).asnumpy()[:, targets]

        faithfulness = calc_correlation(feature_importance, predictions)
        normalized_faithfulness = (faithfulness + 1) / 2
        return np.array([normalized_faithfulness], np.float64)
    def _verify_data(self):
        """Verify dataset and labels."""
        next_element = next(self._dataset.create_tuple_iterator())

        if len(next_element) not in [1, 2, 3]:
            raise ValueError("The dataset should provide [images] or [images, labels], [images, labels, bboxes]"
                             " as columns.")

        if len(next_element) == 3:
            inputs, labels, bboxes = next_element
            if bboxes.shape[-1] != 4:
                raise ValueError("The third element of dataset should be bounding boxes with shape of "
                                 "[batch_size, num_ground_truth, 4].")
        else:
            if self._benchmarkers is not None:
                if any([isinstance(bench, Localization) for bench in self._benchmarkers]):
                    raise ValueError("The dataset must provide bboxes if Localization is to be computed.")

            if len(next_element) == 2:
                inputs, labels = next_element
            if len(next_element) == 1:
                inputs = next_element[0]

        if len(inputs.shape) > 4 or len(inputs.shape) < 3 or inputs.shape[-3] not in [1, 3, 4]:
            raise ValueError(
                "Image shape {} is unrecognizable: the dimension of image can only be CHW or NCHW.".format(
                    inputs.shape))
        if len(inputs.shape) == 3:
            log.warning(
                "Image shape {} is 3-dimensional. All the data will be automatically unsqueezed at the 0-th"
                " dimension as batch data.".format(inputs.shape))
        if len(next_element) > 1:
            if len(labels.shape) > 2 and (np.array(labels.shape[1:]) > 1).sum() > 1:
                raise ValueError(
                    "Labels shape {} is unrecognizable: outputs should not have more than two dimensions"
                    " with length greater than 1.".format(labels.shape))
    def _collect_input_data(self, cb_params):
        """Only support to collect image data."""
        if not self._collect_specified_data.get('collect_input_data'):
            return

        input_data = getattr(cb_params, 'train_dataset_element', None)
        if input_data is None:
            self._collect_specified_data['collect_input_data'] = False
            logger.info(
                "The 'train_dataset_element' in cb_params is None, maybe there is dataset sink mode."
            )
            return

        if isinstance(input_data, (list, tuple)):
            input_data = input_data[0]
        try:
            self._record.add_value(PluginEnum.IMAGE.value, 'input_data/auto',
                                   input_data)
        except ValueError:
            logger.warning(
                'The input data of network are not image, so will not collect by SummaryCollector.'
            )
            self._collect_specified_data['collect_input_data'] = False
            return
Exemple #17
0
    def _package_parameter(key, value, message):
        """
        Package parameters in operation.

        Args:
            key (str): Operation name.
            value (Union[str, bool, int, float, list, None]): Operation args.
            message (OperationParameter): Operation proto message.
        """
        if isinstance(value, str):
            message.mapStr[key] = value
        elif isinstance(value, bool):
            message.mapBool[key] = value
        elif isinstance(value, int):
            message.mapInt[key] = value
        elif isinstance(value, float):
            message.mapDouble[key] = value
        elif isinstance(value, (list, tuple)) and key != "operations":
            if value:
                replace_value_list = list(
                    map(lambda x: "" if x is None else x, value))
                message.mapStrList[key].strValue.extend(replace_value_list)
        elif isinstance(value, dict):
            try:
                message.mapStr[key] = json.dumps(value)
            except TypeError as exo:
                logger.warning(
                    "Transform the value of parameter %r to string failed. Detail: %s.",
                    key, str(exo))
        elif value is None:
            message.mapStr[key] = "None"
        else:
            logger.warning(
                "The parameter %r is not recorded, because its type is not supported in event package. "
                "Its type is %r.", key,
                type(value).__name__)
Exemple #18
0
    def _get_imagenet_as_dict(self):
        """
        Get data from imagenet as dict.

        Yields:
            data (dict of list): imagenet data list which contains dict.
        """
        if not os.path.exists(self.map_file):
            raise IOError("map file {} not exists".format(self.map_file))

        label_dict = {}
        with open(self.map_file) as fp:
            line = fp.readline()
            while line:
                labels = line.split(" ")
                label_dict[labels[1]] = labels[0]
                line = fp.readline()

        # get all the dir which are n02087046, n02094114, n02109525
        dir_paths = {}
        for item in label_dict:
            real_path = os.path.join(self.image_dir, label_dict[item])
            if not os.path.isdir(real_path):
                logger.warning("{} dir is not exist".format(real_path))
                continue
            dir_paths[item] = real_path

        if not dir_paths:
            raise PathNotExistsError("not valid image dir in {}".format(
                self.image_dir))

        # get the filename, label and image binary as a dict
        for label in dir_paths:
            for item in os.listdir(dir_paths[label]):
                file_name = os.path.join(dir_paths[label], item)
                if not item.endswith("JPEG") and not item.endswith("jpg"):
                    logger.warning(
                        "{} file is not suffix with JPEG/jpg, skip it.".format(
                            file_name))
                    continue
                data = {}
                data["file_name"] = str(file_name)
                data["label"] = int(label)

                # get the image data
                image_file = open(file_name, "rb")
                image_bytes = image_file.read()
                image_file.close()
                if not image_bytes:
                    logger.warning(
                        "The image file: {} is invalid.".format(file_name))
                    continue
                data["image"] = image_bytes
                yield data
Exemple #19
0
    def __init__(self, **kwargs):
        """
        Prepare the profiling output directory and start device-specific profiling.

        Args:
            **kwargs: Optional settings popped from the keyword arguments:
                output_path (str): Base directory of the results; a "profiler"
                    sub-directory is used inside it. Default: "./data".
                optypes_not_deal (str): Comma-separated op type names, read on Ascend
                    only. Default: "Variable".
                ascend_job_id (str): Job id, read on Ascend only. Default: "".
                Any other keyword only triggers a warning.

        Raises:
            TypeError: If `optypes_not_deal` is not a str (Ascend only).
        """
        # get device_id and device_target
        self._get_devid_and_devtarget()
        output_path = kwargs.pop("output_path", "./data")
        self._output_path = validate_and_normalize_path(output_path)
        self._output_path = os.path.join(self._output_path, "profiler")
        if not os.path.exists(self._output_path):
            os.makedirs(self._output_path, exist_ok=True)
        else:
            logger.warning(
                "The target dir already exists. "
                "There may be some old profiling data, and they will be rewrote in the end."
            )

        if self._device_target and self._device_target == "GPU":
            # Imported here rather than at module level, so it is only needed on the GPU path.
            from mindspore._c_expression import GPUProfiler
            self._gpu_profiler = GPUProfiler.get_instance()
            self._gpu_profiler.init(self._output_path)
            self._gpu_profiler.step_profiling_enable(True)

            # Only output_path has been consumed so far; anything left is unsupported on GPU.
            if kwargs:
                logger.warning("Params not be supported yet on GPU.")
        elif self._device_target and self._device_target == "Ascend":
            optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
            if not isinstance(optypes_not_deal, str):
                raise TypeError("The parameter optypes_not_deal must be str.")
            job_id = kwargs.pop("ascend_job_id", "")
            # Whatever remains after popping the known keys is not recognised.
            if kwargs:
                logger.warning("There are invalid params which don't work.")

            # These environment variables are set before profiling is enabled through the context.
            os.environ['PROFILING_MODE'] = 'true'
            os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
            os.environ['DEVICE_ID'] = self._dev_id
            os.environ['AICPU_PROFILING_MODE'] = 'true'

            # use context interface to open profiling, for the new mindspore version(after 2020.5.21)
            context.set_context(enable_profiling=True,
                                profiling_options="training_trace:task_trace")

            self._container_path = os.path.join(
                self._base_profiling_container_path, self._dev_id)
            data_path = os.path.join(self._container_path, "data")
            data_path = validate_and_normalize_path(data_path)
            if not os.path.exists(data_path):
                os.makedirs(data_path, exist_ok=True)

            # An empty optypes_not_deal string yields an empty filter list.
            self._filt_optype_names = optypes_not_deal.split(
                ",") if optypes_not_deal else []
            self._profiling_job_id = job_id
            # add job id env through user input later
            self._job_id_env = 0
            # time.time() is in seconds, so this timestamp is in units of 1e-7 second.
            self._start_time = int(time.time() * 10000000)
            logger.info("Profiling: profiling start time: %d",
                        self._start_time)
Exemple #20
0
    def _collect_input_data(self, cb_params):
        """Only support to collect image data."""
        if not self._collect_specified_data.get('collect_input_data'):
            return

        input_data = getattr(cb_params, 'train_dataset_element', None)
        if not isinstance(input_data, (Tensor, list, tuple)):
            self._collect_specified_data['collect_input_data'] = False
            logger.warning("The type of input data is not Tensor/list/tuple, "
                           "so SummaryCollector will not collect input data.")
            return

        if not isinstance(input_data, Tensor) and not input_data:
            self._collect_specified_data['collect_input_data'] = False
            logger.warning(
                "The 'train_dataset_element' in cb_params is empty, "
                "so SummaryCollector will not record the input data.")

            if self._dataset_sink_mode and context.get_context(
                    'device_target') == 'Ascend':
                logger.warning(
                    'On Ascend device, SummaryCollector is not supported to record input data '
                    'in dataset sink mode.')
            return

        if isinstance(input_data, (list, tuple)) and input_data:
            input_data = input_data[0]
        try:
            self._record.add_value(PluginEnum.IMAGE.value, 'input_data/auto',
                                   input_data)
        except (TypeError, ValueError):
            logger.warning(
                'The input data of network are not image, so will not collect by SummaryCollector.'
            )
            self._collect_specified_data['collect_input_data'] = False
            return
Exemple #21
0
 def to_tensor(self, slice_index=None, shape=None, opt_shard_group=None):
     """
     Deprecated alias of `init_data`.

     Logs a deprecation warning, then forwards all arguments positionally to
     `init_data` and returns its result.
     """
     deprecation_notice = ("WARN_DEPRECATED: The usage of to_tensor is deprecated."
                           " Please use init_data")
     logger.warning(deprecation_notice)
     forwarded = (slice_index, shape, opt_shard_group)
     return self.init_data(*forwarded)
Exemple #22
0
    def eval(self, valid_dataset, callbacks=None, dataset_sink_mode=True):
        """
        Evaluation API where the iteration is controlled by python front-end.

        Configure to pynative mode or CPU, the evaluating process will be performed with dataset non-sink mode.

        Note:
            If dataset_sink_mode is True, data will be sent to device. If device is Ascend, features
            of data will be transferred one by one. The limitation of data transmission per time is 256M.

        Args:
            valid_dataset (Dataset): Dataset to evaluate the model.
            callbacks (Optional[list(Callback)]): List of callback objects which should be executed
                while training. Default: None.
            dataset_sink_mode (bool): Determines whether to pass the data through dataset channel.
                Default: True.

        Returns:
            Dict, which returns the loss value and metrics values for the model in the test mode.

        Raises:
            ValueError: If no metric function is configured, or if sink mode is requested
                while the evaluation network is an `nn.GraphCell`.

        Examples:
            >>> from mindspore import Model, nn
            >>>
            >>> # For details about how to build the dataset, please refer to the tutorial
            >>> # document on the official website.
            >>> dataset = create_custom_dataset()
            >>> net = Net()
            >>> loss = nn.SoftmaxCrossEntropyWithLogits()
            >>> model = Model(net, loss_fn=loss, optimizer=None, metrics={'acc'})
            >>> acc = model.eval(dataset, dataset_sink_mode=False)
        """
        dataset_sink_mode = Validator.check_bool(dataset_sink_mode)

        _device_number_check(self._parallel_mode, self._device_number)
        if not self._metric_fns:
            raise ValueError("metric fn can not be None or empty.")
        if isinstance(self._eval_network,
                      nn.GraphCell) and dataset_sink_mode is True:
            raise ValueError(
                "Sink mode is currently not supported when evaluating with a GraphCell."
            )

        # Collect everything the callbacks may need into one parameter object.
        cb_params = _InternalCallbackParam()
        cb_params.eval_network = self._eval_network
        cb_params.valid_dataset = valid_dataset
        cb_params.batch_num = valid_dataset.get_dataset_size()
        cb_params.mode = "eval"
        cb_params.cur_step_num = 0
        cb_params.list_callback = self._transform_callbacks(callbacks)
        cb_params.network = self._network

        self._clear_metrics()

        # On CPU a sink-mode request is downgraded to non-sink mode with a warning.
        if context.get_context("device_target") == "CPU" and dataset_sink_mode:
            dataset_sink_mode = False
            logger.warning(
                "CPU cannot support dataset sink mode currently."
                "So the evaluating process will be performed with dataset non-sink mode."
            )

        with _CallbackManager(callbacks) as list_callback:
            if dataset_sink_mode:
                return self._eval_dataset_sink_process(valid_dataset,
                                                       list_callback,
                                                       cb_params)
            return self._eval_process(valid_dataset, list_callback, cb_params)
Exemple #23
0
    def _train(self,
               epoch,
               train_dataset,
               callbacks=None,
               dataset_sink_mode=True,
               sink_size=-1):
        """
        Training.

        Builds the callback parameters, then runs either the non-sink training process
        or the dataset-sink training process. On CPU a sink-mode request falls back to
        the non-sink process with a warning.

        Args:
            epoch (int): Total number of iterations on the data.
            train_dataset (Dataset): A training dataset iterator. If there is no
                                     loss_fn, a tuple with multiple data (data1, data2, data3, ...) will be
                                     returned and passed to the network. Otherwise, a tuple (data, label) will
                                     be returned. The data and label would be passed to the network and loss
                                     function respectively.
            callbacks (list): List of callback objects which should be executed while training. Default: None.
            dataset_sink_mode (bool): Determine whether the data should be passed through the dataset channel.
                                      Default: True.
                                      Configure pynative mode or CPU, the training process will be performed with
                                      dataset not sink.
            sink_size (int): Control the amount of data in each sink. Default: -1.
        """
        epoch = Validator.check_positive_int(epoch)
        if self._parameter_broadcast:
            self._train_network.set_broadcast_flag()

        # Collect everything the callbacks may need into one parameter object.
        cb_params = _InternalCallbackParam()
        cb_params.train_network = self._train_network
        cb_params.epoch_num = epoch
        # With a positive sink_size in sink mode, sink_size is reported as the batch number;
        # otherwise the dataset size is used.
        if dataset_sink_mode and sink_size > 0:
            cb_params.batch_num = sink_size
        else:
            cb_params.batch_num = train_dataset.get_dataset_size()
        cb_params.mode = "train"
        cb_params.loss_fn = self._loss_fn
        cb_params.optimizer = self._optimizer
        cb_params.parallel_mode = self._parallel_mode
        cb_params.device_number = self._device_number
        cb_params.train_dataset = train_dataset
        cb_params.list_callback = self._transform_callbacks(callbacks)
        # In PyNative mode a _StepSync callback is put first, and the extended list
        # replaces the user-supplied callbacks.
        if context.get_context("mode") == context.PYNATIVE_MODE:
            cb_params.list_callback.insert(0, _StepSync())
            callbacks = cb_params.list_callback
        cb_params.train_dataset_element = None
        cb_params.network = self._network
        # Parameter-server and scheduler roles run a single epoch.
        if _is_role_pserver() or _is_role_sched():
            epoch = 1

        # build callback list
        with _CallbackManager(callbacks) as list_callback:
            self._check_reuse_dataset(train_dataset)
            if not dataset_sink_mode:
                self._train_process(epoch, train_dataset, list_callback,
                                    cb_params)
            elif context.get_context("device_target") == "CPU":
                logger.warning(
                    "The CPU cannot support dataset sink mode currently."
                    "So the training process will be performed with dataset not sink."
                )
                self._train_process(epoch, train_dataset, list_callback,
                                    cb_params)
            else:
                self._train_dataset_sink_process(epoch, train_dataset,
                                                 list_callback, cb_params,
                                                 sink_size)
Exemple #24
0
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 pad_mode,
                 padding,
                 dilation,
                 group,
                 has_bias,
                 weight_init,
                 bias_init,
                 transposed=False):
        """
        Validate the convolution settings and create the weight and bias parameters.

        Args:
            in_channels (int): Input channel count; must be divisible by `group`.
            out_channels (int): Output channel count; must be divisible by `group`.
            kernel_size (tuple): Kernel size; items 0 and 1 must be int and >= 1.
            stride (tuple): Stride; items 0 and 1 must be int and >= 1.
            pad_mode: Padding mode, stored on the instance unchanged.
            padding (Union[int, tuple]): An int, or a tuple of ints, each checked
                against 0 with `Rel.GE`.
            dilation (tuple): Dilation; items 0 and 1 must be int and >= 1.
            group (int): Group count.
            has_bias (bool): Whether a bias parameter is created.
            weight_init: Initializer specification for the weight.
            bias_init: Initializer specification for the bias. Unused when
                `has_bias` is False; a warning is logged if it is not 'zeros'.
            transposed (bool): If True the weight shape starts with
                [in_channels, out_channels // group], otherwise with
                [out_channels, in_channels // group]. Default: False.

        Raises:
            TypeError: If `padding` is neither an int nor a tuple.
            ValueError: If `kernel_size`, `stride` or `dilation` fail the checks
                above, or a channel count is not divisible by `group`.
        """
        super(_Conv, self).__init__()
        self.in_channels = check_int_positive(in_channels)
        self.out_channels = check_int_positive(out_channels)
        self.kernel_size = kernel_size
        self.stride = stride
        self.pad_mode = pad_mode
        self.weight_init = weight_init
        self.bias_init = bias_init

        # Reject unsupported padding types first, then validate the value(s).
        if not isinstance(padding, (int, tuple)):
            raise TypeError("padding type must be int/tuple(int) cannot be {}!".format(type(padding)))
        if isinstance(padding, int):
            Validator.check_integer('padding', padding, 0, Rel.GE, self.cls_name)
        else:
            for pad_item in padding:
                Validator.check_integer('padding item', pad_item, 0, Rel.GE, self.cls_name)
        self.padding = padding

        self.dilation = dilation
        self.group = check_int_positive(group)
        self.has_bias = has_bias

        # The type test short-circuits before any numeric comparison is attempted.
        kernel_is_int = isinstance(kernel_size[0], int) and isinstance(kernel_size[1], int)
        if not kernel_is_int or kernel_size[0] < 1 or kernel_size[1] < 1:
            raise ValueError(
                "Attr 'kernel_size' of 'Conv2D' Op passed {!s}, should be a int or tuple and equal to or "
                "greater than 1.".format(self.kernel_size))

        stride_is_int = isinstance(stride[0], int) and isinstance(stride[1], int)
        if not stride_is_int or stride[0] < 1 or stride[1] < 1:
            raise ValueError(
                "Attr 'stride' of 'Conv2D' Op passed {!s}, should be a int or tuple and equal to or "
                "greater than 1.".format(self.stride))

        dilation_is_int = isinstance(dilation[0], int) and isinstance(dilation[1], int)
        if not dilation_is_int or dilation[0] < 1 or dilation[1] < 1:
            raise ValueError(
                "Attr 'dilation' of 'Conv2D' Op passed {!s}, should equal to or greater than 1.".format(
                    self.dilation))

        # Both channel counts must split evenly across the groups.
        for attr_name, channels in (('in_channels', in_channels), ('out_channels', out_channels)):
            if channels % group != 0:
                raise ValueError("Attr '{}' of 'Conv2D' Op must be divisible by "
                                 "attr 'group' of 'Conv2D' Op.".format(attr_name))

        # A transposed convolution swaps the roles of the two channel dimensions.
        first_dim, grouped_dim = (in_channels, out_channels) if transposed else (out_channels, in_channels)
        weight_shape = [first_dim, grouped_dim // group, *kernel_size]
        self.weight = Parameter(initializer(self.weight_init, weight_shape), name='weight')

        if not check_bool(has_bias):
            if self.bias_init != 'zeros':
                logger.warning("Value of 'has_bias' is False, value of 'bias_init' will be ignored.")
            self.bias = None
        else:
            self.bias = Parameter(initializer(self.bias_init, [out_channels]), name='bias')
Exemple #25
0
import os
from string import punctuation

import numpy as np
import pytest
from mindspore import log as logger
from mindspore.mindrecord import FileReader
from mindspore.mindrecord import TFRecordToMR

SupportedTensorFlowVersion = '1.13.0-rc1'

# TensorFlow is an optional dependency: it is only used to convert tfrecord to
# mindrecord. A guarded import statement is used so that no importlib helper has
# to be in scope; when the package is missing, `tf` is set to None.
try:
    import tensorflow as tf
except ModuleNotFoundError:
    logger.warning("tensorflow module not found.")
    tf = None

TFRECORD_DATA_DIR = "../data/mindrecord/testTFRecordData"
TFRECORD_FILE_NAME = "test.tfrecord"
MINDRECORD_FILE_NAME = "test.mindrecord"
PARTITION_NUM = 1


def cast_name(key):
    """
    Cast schema names which containing special characters to valid names.

    Here special characters means any characters in
    '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~
    Valid names can only contain a-z, A-Z, and 0-9 and _
def run_pretrain(args_opt):
    """
    Pre-train BERT with the options in `args_opt`.

    Configures the execution context (and the data-parallel context when
    `args_opt.distribute == "true"`), builds the dataset, the network with loss,
    the optimizer, the callbacks and the train-one-step cell, then calls
    `Model.train`.

    Args:
        args_opt: Options object. Its `device_id` and `device_num` attributes are
            overwritten from the module-level globals of the same name. Other
            attributes read here: data_url, device_target, save_checkpoint_path,
            distribute, epoch_size, do_shuffle, enable_data_sink, data_sink_steps,
            data_dir, schema_dir, train_steps, enable_save_ckpt,
            save_checkpoint_steps, save_checkpoint_num, load_checkpoint_path and
            enable_lossscale.

    Raises:
        ValueError: If `cfg.optimizer` is not 'Lamb', 'Momentum' or
            'AdamWeightDecayDynamicLR'.

    Note:
        `device_num` is declared global, so every assignment to it below rebinds
        the module-level variable. `job_id` is declared global but is not used in
        this function.
    """
    global device_id
    global device_num
    global rank_id
    global job_id
    # Copy the module-level device settings onto the options before they are used.
    args_opt.device_id = device_id
    args_opt.device_num = device_num
    sync_dataset(args_opt.data_url)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            # Only this (non-Ascend) branch writes checkpoints to a per-rank sub-directory.
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
                rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        # All-reduce fusion split indices are hard-coded per (layer count,
        # relative-position setting); other layer counts leave them unset.
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        # Single-device run.
        rank = 0
        device_num = 1

    # On GPU the compute type is forced to float32 (this mutates the shared config).
    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)
    # A positive train_steps caps the repeat count at train_steps // data_sink_steps.
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    # Build the optimizer selected by cfg.optimizer; hyper-parameters come from
    # the matching cfg sub-section.
    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]"
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    print("Enable save checkpoint: ", args_opt.enable_save_ckpt)
    print("Rank ID: ", rank_id)
    # The checkpoint callback is gated on the global rank_id, not the local `rank`.
    if args_opt.enable_save_ckpt == "true" and rank_id % device_num == 0:
        print("Enable save checkpoint")
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    # Optionally load parameters from an existing checkpoint before training.
    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    # Wrap the network in a train-one-step cell, with dynamic loss scaling if enabled.
    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
Exemple #27
0
def run_squad():
    """
    Run the SQuAD fine-tuning and/or evaluation task from command-line options.

    The options are parsed from `sys.argv`. Training runs when `--do_train` is
    "true" and evaluation runs when `--do_eval` is "true" (both compared
    case-insensitively). When both are requested, evaluation uses the newest
    checkpoint found in the fine-tune save directory (or `_cur_dir` when no save
    path was given).

    Raises:
        ValueError: If neither `--do_train` nor `--do_eval` is "true", or if a
            path option required by the requested phase is empty.
        Exception: If `--device_target` is neither "Ascend" nor "GPU".
    """
    parser = argparse.ArgumentParser(description="run classifier")
    parser.add_argument("--device_target", type=str, default="Ascend", help="Device type, default is Ascend")
    parser.add_argument("--do_train", type=str, default="false", help="Enable train, default is false")
    parser.add_argument("--do_eval", type=str, default="false", help="Enable eval, default is false")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--epoch_num", type=int, default=1, help="Epoch number, default is 1.")
    parser.add_argument("--num_class", type=int, default=2, help="The number of class, default is 2.")
    parser.add_argument("--train_data_shuffle", type=str, default="true",
                        help="Enable train data shuffle, default is true")
    parser.add_argument("--eval_data_shuffle", type=str, default="false",
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--vocab_file_path", type=str, default="", help="Vocab file path")
    parser.add_argument("--eval_json_path", type=str, default="", help="Evaluation json file path, can be eval.json")
    parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--train_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_file_path", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_file_path", type=str, default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()
    epoch_num = args_opt.epoch_num
    load_pretrain_checkpoint_path = args_opt.load_pretrain_checkpoint_path
    save_finetune_checkpoint_path = args_opt.save_finetune_checkpoint_path
    load_finetune_checkpoint_path = args_opt.load_finetune_checkpoint_path

    # Only the literal "true" enables a phase, so validate against exactly that:
    # any other value (not just "false") leaves the phase disabled.
    train_enabled = args_opt.do_train.lower() == "true"
    eval_enabled = args_opt.do_eval.lower() == "true"

    if not train_enabled and not eval_enabled:
        raise ValueError("At least one of 'do_train' or 'do_eval' must be true")
    if train_enabled and args_opt.train_data_file_path == "":
        raise ValueError("'train_data_file_path' must be set when do finetune task")
    if eval_enabled:
        if args_opt.eval_data_file_path == "":
            raise ValueError("'eval_data_file_path' must be set when do evaluation task")
        if args_opt.vocab_file_path == "":
            raise ValueError("'vocab_file_path' must be set when do evaluation task")
        if args_opt.eval_json_path == "":
            raise ValueError("'eval_json_path' must be set when do evaluation task")

    target = args_opt.device_target
    if target == "Ascend":
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    elif target == "GPU":
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        # On GPU the compute type is forced to float32 (this mutates the shared config).
        if bert_net_cfg.compute_type != mstype.float32:
            logger.warning('GPU only support fp32 temporarily, run with fp32.')
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")

    netwithloss = BertSquad(bert_net_cfg, True, 2, dropout_prob=0.1)

    if train_enabled:
        ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                  data_file_path=args_opt.train_data_file_path,
                                  schema_file_path=args_opt.schema_file_path,
                                  do_shuffle=(args_opt.train_data_shuffle.lower() == "true"))
        do_train(ds, netwithloss, load_pretrain_checkpoint_path, save_finetune_checkpoint_path, epoch_num)
        if eval_enabled:
            # Evaluate the checkpoint just produced rather than the one passed on the command line.
            if save_finetune_checkpoint_path == "":
                load_finetune_checkpoint_dir = _cur_dir
            else:
                load_finetune_checkpoint_dir = make_directory(save_finetune_checkpoint_path)
            load_finetune_checkpoint_path = LoadNewestCkpt(load_finetune_checkpoint_dir,
                                                           ds.get_dataset_size(), epoch_num, "squad")

    if eval_enabled:
        ds = create_squad_dataset(batch_size=bert_net_cfg.batch_size, repeat_count=1,
                                  data_file_path=args_opt.eval_data_file_path,
                                  schema_file_path=args_opt.schema_file_path, is_training=False,
                                  do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, args_opt.vocab_file_path, args_opt.eval_json_path,
                load_finetune_checkpoint_path, bert_net_cfg.seq_length)
Exemple #28
0
def parse_args():
    """
    Parse and validate the command-line options of the TinyBERT task-distill script.

    Returns:
        argparse.Namespace, the parsed options.

    Raises:
        ValueError: If neither `--do_train` nor `--do_eval` is "true", or if
            `--task_type` does not match the kind of dataset named by `--task_name`.

    A warning is logged (no error) when the configured teacher/student vocab
    sizes differ from the size noted for the language of the chosen task.
    """
    parser = argparse.ArgumentParser(description='tinybert task distill')
    parser.add_argument(
        "--device_target",
        type=str,
        default="Ascend",
        choices=['Ascend', 'GPU', 'CPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--do_train",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Do train task, default is true.")
    parser.add_argument("--do_eval",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Do eval task, default is true.")
    parser.add_argument("--td_phase1_epoch_size",
                        type=int,
                        default=10,
                        help="Epoch size for td phase 1, default is 10.")
    parser.add_argument("--td_phase2_epoch_size",
                        type=int,
                        default=3,
                        help="Epoch size for td phase 2, default is 3.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--save_ckpt_step",
                        type=int,
                        default=100,
                        help="Steps between checkpoint saves, default is 100.")
    parser.add_argument("--max_ckpt_num",
                        type=int,
                        default=1,
                        help="Maximum number of checkpoints to keep, default is 1.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--load_teacher_ckpt_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_gd_ckpt_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--load_td1_ckpt_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--train_data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--eval_data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    parser.add_argument("--task_type",
                        type=str,
                        default="classification",
                        choices=["classification", "ner"],
                        help="The type of the task to train.")
    parser.add_argument("--task_name",
                        type=str,
                        default="",
                        choices=["SST-2", "QNLI", "MNLI", "TNEWS", "CLUENER"],
                        help="The name of the task to train.")
    parser.add_argument(
        "--assessment_method",
        type=str,
        default="accuracy",
        choices=["accuracy", "bf1", "mf1"],
        help="assessment_method include: [accuracy, bf1, mf1], default is accuracy")
    parser.add_argument(
        "--dataset_type",
        type=str,
        default="tfrecord",
        help="dataset type tfrecord/mindrecord, default is tfrecord")
    args = parser.parse_args()

    if args.do_train.lower() != "true" and args.do_eval.lower() != "true":
        raise ValueError(
            "do train or do eval must have one be true, please confirm your config"
        )

    # The task type must agree with the kind of dataset selected by --task_name.
    if args.task_name in ["SST-2", "QNLI", "MNLI", "TNEWS"] and args.task_type != "classification":
        raise ValueError(
            f"{args.task_name} is a classification dataset, please set --task_type=classification"
        )
    if args.task_name in ["CLUENER"] and args.task_type != "ner":
        raise ValueError(
            f"{args.task_name} is a ner dataset, please set --task_type=ner")

    # Vocab-size mismatches are only reported, never rejected.
    if args.task_name in ["SST-2", "QNLI", "MNLI"] and \
            (td_teacher_net_cfg.vocab_size != 30522 or td_student_net_cfg.vocab_size != 30522):
        logger.warning(f"{args.task_name} is an English dataset. Usually, we use 21128 for CN vocabs and 30522 for "
                       "EN vocabs according to the origin paper.")
    if args.task_name in ["TNEWS", "CLUENER"] and \
            (td_teacher_net_cfg.vocab_size != 21128 or td_student_net_cfg.vocab_size != 21128):
        logger.warning(f"{args.task_name} is a Chinese dataset. Usually, we use 21128 for CN vocabs and 30522 for "
                       "EN vocabs according to the origin paper.")
    return args
Exemple #29
0
    print("==============================================================")
    eval_result_print(args_opt.assessment_method, callback)
    print("==============================================================")


if __name__ == '__main__':
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        reserve_class_name_in_scope=False)

    # Loss scaling is switched off on the backends that are handled with
    # float32 below (GPU and CPU) and stays on everywhere else.
    enable_loss_scale = args_opt.device_target not in ("GPU", "CPU")

    if args_opt.device_target == "Ascend":
        context.set_context(device_id=args_opt.device_id)
    elif args_opt.device_target == "GPU":
        # Backward of the network are calculated using fp32,
        # and the loss scale is not necessary
        if td_student_net_cfg.compute_type != mstype.float32:
            logger.warning(
                'Compute about the student only support float32 temporarily, run with float32.'
            )
            td_student_net_cfg.compute_type = mstype.float32
    elif args_opt.device_target == "CPU":
        logger.warning(
            'CPU only support float32 temporarily, run with float32.')
        # Both the teacher and the student configs are switched to float32.
        td_teacher_net_cfg.dtype = mstype.float32
        td_teacher_net_cfg.compute_type = mstype.float32
        td_student_net_cfg.dtype = mstype.float32
        td_student_net_cfg.compute_type = mstype.float32
Exemple #30
0
 def check_node_type(self, node):
     """
     Log a warning when `node` is a shuffle, repeat or batch dataset.

     Args:
         node: Dataset node to inspect; it is tested against
             `de.ShuffleDataset`, `de.RepeatDataset` and `de.BatchDataset`.

     NOTE(review): only the warning branch is visible in this excerpt; confirm
     whether the method does anything else before relying on its return value.
     """
     if isinstance(node,
                   (de.ShuffleDataset, de.RepeatDataset, de.BatchDataset)):
         logger.warning("Used shuffle, repeat, batch before save operator.")