コード例 #1
0
ファイル: profiling.py プロジェクト: kungfu-ml/mindspore
    def _get_profiling_job_id(self):
        """Get profiling job id, which was generated by ada service.

        Entries of the output path whose name contains ``JOB`` are examined
        from the most recently modified to the oldest. An entry is selected
        when its host start log reports this process's device id and a start
        time that is not earlier than the training start time.

        Returns:
            str, profiling job id.

        Raises:
            RuntimeError: If no suitable job directory is found.
        """

        if self._profiling_job_id:
            return self._profiling_job_id

        # List the directory in-process instead of through an
        # ``ls | grep | awk`` shell pipeline: the output path is never handed
        # to a shell, so spaces or shell metacharacters in it are harmless,
        # and entry names containing whitespace are kept intact.
        try:
            # Hidden entries are skipped, as a plain ``ls`` would skip them.
            job_dirs = sorted(
                name for name in os.listdir(self._output_path)
                if "JOB" in name and not name.startswith("."))
        except OSError:
            # A missing or unreadable output path yields no candidates and
            # ends in the RuntimeError raised below.
            job_dirs = []

        def _modified_ns(name):
            """Return the entry's modification time in ns, or 0 if unavailable."""
            try:
                return os.stat(os.path.join(self._output_path, name)).st_mtime_ns
            except OSError:
                return 0

        # Newest first. The sort is stable, so entries sharing a timestamp
        # keep the alphabetical order established above.
        job_dirs.sort(key=_modified_ns, reverse=True)

        job_id = ""
        for item in job_dirs:
            path = os.path.join(self._output_path, item)
            log_file = get_file_names(path, "host_start.log")
            if not log_file:
                logger.error(
                    "Profiling: job path %s, host_start.log not exist.", path)
                continue

            log_file = os.path.join(path, log_file[0])
            item_dict = self._parse_host_start_log(log_file)

            if not item_dict:
                logger.error(
                    "Profiling: job path %s, fail to get job start info.",
                    path)
                continue

            if self._dev_id != item_dict["device_id"]:
                logger.info(
                    "Profiling: job path %s, dev id %s, training device id %s.",
                    path, item_dict["device_id"], self._dev_id)
                continue

            if self._start_time > int(item_dict["start_time"]):
                logger.info(
                    "Profiling: job path %s, start_time %s, training start_time %d.",
                    path, item_dict["start_time"], self._start_time)
                # Entries are visited newest first, so the remaining ones are
                # presumably older still; stop searching.
                break

            job_id = item
            break

        if not job_id:
            msg = "Fail to get profiling job, please check whether job dir was generated"
            raise RuntimeError(msg)

        return job_id
コード例 #2
0
ファイル: profiling.py プロジェクト: peixinhou/mindspore
    def _get_profiling_job_id(self):
        """Find the profiling job directory that belongs to this process's device.

        Entries of the output path whose name starts with ``JOB`` are scanned.
        The device id of a job is taken from the suffix after the last ``.``
        in the name of its ``host_start.log`` file. The first job whose device
        id equals ``self._dev_id`` ends the scan: it is selected if its start
        log can be parsed, otherwise nothing is selected.

        Returns:
            str, profiling job id.

        Raises:
            RuntimeError: If no job directory was selected.
        """

        found_job = ""

        for entry in os.listdir(self._output_path):
            if not entry.startswith('JOB'):
                continue

            job_dir = os.path.join(self._output_path, entry)
            start_logs = get_file_names(job_dir, "host_start.log")
            if not start_logs:
                logger.error(
                    "Profiling: job path %s, host_start.log not exist.",
                    job_dir)
                continue

            log_device_id = start_logs[0].split('.')[-1]
            if self._dev_id != log_device_id:
                # Job of another device: keep scanning.
                logger.info(
                    "Profiling: job path %s, dev id %s, training device id %s.",
                    job_dir, log_device_id, self._dev_id)
                continue

            job_start_time = self._parse_host_start_log(
                os.path.join(job_dir, start_logs[0]))
            if not job_start_time:
                logger.error(
                    "Profiling: job path %s, fail to get job start info.",
                    job_dir)
                break

            found_job = entry
            # A job that started before training is only reported; it is
            # still the one selected.
            if self._start_time > int(job_start_time):
                logger.info(
                    "Profiling: job path %s, start_time %s, training start_time %d.",
                    job_dir, job_start_time, self._start_time)
            break

        if not found_job:
            raise RuntimeError(
                "Fail to get profiling job, please check whether job dir was generated, "
                "or may be the device id from job dir dismatch the device_id in current process.")

        return found_job
コード例 #3
0
ファイル: profiling.py プロジェクト: xiaoxiugege/mindspore
    def _get_profiling_job_id(self):
        """Return the name of the first ``JOB`` directory in the output path.

        Only the first entry whose name starts with ``JOB`` is examined. It is
        accepted when its ``host_start.log`` exists and can be parsed. A
        device id mismatch or a job start time earlier than the training
        start time is logged but does not reject the job.

        Returns:
            str, profiling job id.

        Raises:
            RuntimeError: If there is no ``JOB`` entry, or its start log is
                missing or cannot be parsed.
        """

        failure_msg = "Fail to get profiling job, please check whether job dir was generated"

        first_job = next(
            (name for name in os.listdir(self._output_path)
             if name.startswith('JOB')),
            None)
        if first_job is None:
            raise RuntimeError(failure_msg)

        job_dir = os.path.join(self._output_path, first_job)
        start_logs = get_file_names(job_dir, "host_start.log")
        if not start_logs:
            logger.error(
                "Profiling: job path %s, host_start.log not exist.",
                job_dir)
            raise RuntimeError(failure_msg)

        start_info = self._parse_host_start_log(
            os.path.join(job_dir, start_logs[0]))
        if not start_info:
            logger.error(
                "Profiling: job path %s, fail to get job start info.",
                job_dir)
            raise RuntimeError(failure_msg)

        # The two checks below are informational only.
        if self._dev_id != start_info["device_id"]:
            logger.info(
                "Profiling: job path %s, dev id %s, training device id %s.",
                job_dir, start_info["device_id"], self._dev_id)

        if self._start_time > int(start_info["start_time"]):
            logger.info(
                "Profiling: job path %s, start_time %s, training start_time %d.",
                job_dir, start_info["start_time"], self._start_time)

        return first_job