Ejemplo n.º 1
0
    def _main(self) -> Tuple[str, int]:
        """
        Resolve the CPU model and its power for the constant mode.

        The model comes from `detect_cpu_model()` and its power is looked up
        with `self._get_cpu_power_from_registry`.

        :return: model name (str), power in Watt (int). The power is None
                 when the lookup yields nothing, and ("Unknown", None) is
                 returned when no model could be detected.
        """
        detected_model = detect_cpu_model()

        # Guard clause: without a model there is nothing to look up.
        if not detected_model:
            logger.warning(
                "We were unable to detect your CPU using the `cpuinfo` package."
                + " Resorting to a default power consumption of 85W.")
            return "Unknown", None

        tdp = self._get_cpu_power_from_registry(detected_model)
        if not tdp:
            logger.warning(
                f"We saw that you have a {detected_model} but we don't know it."
                + " Please contact us.")
            return detected_model, None

        logger.debug(
            f"CPU : We detect a {detected_model} with a TDP of {tdp} W"
        )
        return detected_model, tdp
Ejemplo n.º 2
0
    def _log_values(self):
        """
        Run the Intel Power Gadget command line so that it logs its output
        to the file at `self._log_file_path`.

        Only values of `self._system` starting with "win" or "darwin" are
        handled; for any other value nothing is executed. A non-zero return
        code, or a failure to launch the tool, is reported as a warning.

        :return: None
        """
        cli_args = [
            self._cli,
            "-duration",
            str(self._duration),
            "-resolution",
            str(self._resolution),
            "-file",
            str(self._log_file_path),
        ]

        if self._system.startswith("win"):
            # The tool's console output is not needed. DEVNULL is used rather
            # than PIPE because `subprocess.call` never reads the pipes, so a
            # child that writes enough output could block on a full buffer.
            returncode = subprocess.call(
                cli_args,
                shell=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        elif self._system.startswith("darwin"):
            # An argument list without a shell keeps paths containing spaces
            # or shell metacharacters intact; stdout is discarded as the
            # former `> /dev/null` redirection did.
            try:
                returncode = subprocess.call(cli_args,
                                             stdout=subprocess.DEVNULL)
            except OSError as error:
                # Without a shell, a missing or non-executable binary raises
                # instead of yielding a non-zero return code.
                logger.warning("Could not run Intel Power Gadget: " +
                               f"{error}")
                return
        else:
            return None

        if returncode != 0:
            logger.warning("Returncode while logging power values using " +
                           f"Intel Power Gadget: {returncode}")
        return
Ejemplo n.º 3
0
    def get_country_emissions(self, energy: Energy, geo: GeoMetadata) -> float:
        """
        Computes emissions for a country on private infra, from the energy
        consumed and the mix of energy sources of that country.

        When the country ISO code is absent from the global energy mix data,
        the "world_average" carbon intensity is applied instead.

        :param energy: Energy consumed by the process (its `kWh` value is used)
        :param geo: Country and region metadata
        :return: CO2 emissions in kg
        """
        energy_mix = self._data_source.get_global_energy_mix_data()

        if geo.country_iso_code not in energy_mix:
            logger.warning(
                f"We do not have data for {geo.country_iso_code}, using world average."
            )
            # Use the data source this object was given rather than building
            # a new one, so both lookups read from the same source.
            carbon_intensity_per_source = (
                self._data_source.get_carbon_intensity_per_source_data())
            world_average = EmissionsPerKWh.from_g_per_kWh(
                carbon_intensity_per_source.get("world_average"))
            return world_average.kgs_per_kWh * energy.kWh  # kgs

        country_energy_mix: Dict = energy_mix[geo.country_iso_code]

        emissions_per_kWh = self._global_energy_mix_to_emissions_rate(
            country_energy_mix)
        logger.debug(
            f"We apply an energy mix of {emissions_per_kWh.kgs_per_kWh*1000:.0f}"
            + f" g.CO2eq/kWh for {geo.country_name}")

        return emissions_per_kWh.kgs_per_kWh * energy.kWh  # kgs
Ejemplo n.º 4
0
    def get_private_infra_emissions(self, energy: Energy,
                                    geo: GeoMetadata) -> float:
        """
        Computes emissions for private infra.

        Sources are tried in order: the CO2 Signal API (only when a token is
        set), regional data (only when a region is known and the country is
        "USA" or "CAN"), then country-level data. A failure in one of the
        first two is logged and the next source is used.

        :param energy: Energy consumed by the process
        :param geo: Country and region metadata
        :return: CO2 emissions in kg
        """
        api_token = self._co2_signal_api_token
        if api_token:
            try:
                return co2_signal.get_emissions(energy, geo, api_token)
            except Exception as e:
                logger.error("co2_signal.get_emissions: " + str(e) +
                             " >>> Using CodeCarbon's data.")

        # The country check is only evaluated when a region is available.
        if geo.region is not None and geo.country_iso_code.upper() in [
                "USA", "CAN"
        ]:
            try:
                return self.get_region_emissions(energy, geo)
            except Exception as e:
                logger.error(e, exc_info=True)
                logger.warning("Regional emissions retrieval failed." +
                               " Falling back on country emissions.")

        return self.get_country_emissions(energy, geo)
Ejemplo n.º 5
0
    def out(self, data: EmissionsData):
        """
        Write the emissions data to the CSV file at `self.save_file_path`.

        - If the file does not exist, or its headers are not valid for `data`
          (in which case the old file is backed up first), a new file holding
          a single row is written.
        - If `self.on_csv_write` is "append", a new row is appended.
        - Otherwise the row whose `run_id` equals `data.run_id` is updated in
          place; when zero or several rows match, a new row is appended.

        :param data: Emissions data; `data.values` provides the column/value
                     mapping and `data.run_id` the identifier of the run
        """
        file_exists: bool = os.path.isfile(self.save_file_path)
        if file_exists and not self.has_valid_headers(data):
            logger.info("Backing up old emission file")
            backup(self.save_file_path)
            file_exists = False

        new_row = pd.DataFrame.from_records([dict(data.values)])

        if not file_exists:
            df = pd.concat(
                [pd.DataFrame(columns=data.values.keys()), new_row])
        else:
            df = pd.read_csv(self.save_file_path)
            if self.on_csv_write == "append":
                df = pd.concat([df, new_row])
            else:
                run_mask = df.run_id == data.run_id
                matching_rows = int(run_mask.sum())
                if matching_rows == 1:
                    # `.loc` is the accessor that takes a boolean mask; `.at`
                    # is meant for a single scalar row/column label pair.
                    for column, value in data.values.items():
                        df.loc[run_mask, column] = value
                else:
                    if matching_rows > 1:
                        logger.warning(
                            f"CSV contains more than 1 ({matching_rows})" +
                            f" rows with current run ID ({data.run_id})." +
                            "Appending instead of updating.")
                    df = pd.concat([df, new_row])

        df.to_csv(self.save_file_path, index=False)
Ejemplo n.º 6
0
 def slurm_memory_GB(self):
     """
     Return the memory obtained from the SLURM `scontrol` output.

     When no `scontrol` output could be read, a warning is logged and the
     machine's total RAM is returned instead. A string result from
     `self._parse_scontrol` is converted with
     `self._parse_scontrol_memory_GB`; any other result is returned as is.
     """
     scontrol_output = self._read_slurm_scontrol()
     if scontrol_output is not None:
         parsed = self._parse_scontrol(scontrol_output)
         if not isinstance(parsed, str):
             return parsed
         return self._parse_scontrol_memory_GB(parsed)

     logger.warning("Error running `scontrol show job $SLURM_JOBID` " +
                    "to retrieve SLURM-available RAM." +
                    "Using the machine's total RAM.")
     return psutil.virtual_memory().total / B_TO_GB
Ejemplo n.º 7
0
 def out(self, data: EmissionsData):
     """
     POST the emissions data as JSON to `self.endpoint_url`.

     The payload is the dataclass converted to a dict, plus a "user" entry
     holding the current user name. Any status code other than 201 is
     logged as a warning; any exception is logged and not re-raised.

     :param data: Emissions data to send
     """
     try:
         payload = dataclasses.asdict(data)
         payload["user"] = getpass.getuser()
         resp = requests.post(self.endpoint_url, json=payload, timeout=10)
         if resp.status_code != 201:
             # The message needs a placeholder for the extra argument,
             # otherwise logging reports a formatting error instead of
             # emitting the warning.
             logger.warning(
                 "HTTP Output returned an unexpected status code: %s",
                 resp,
             )
     except Exception as e:
         logger.error(e, exc_info=True)
0
    def start(self) -> None:
        """
        Starts tracking the experiment.

        Records the start time, calls `start()` on every hardware component
        and then starts the scheduler. When tracking has already been
        started, a warning is logged and nothing else happens.

        :return: None
        """
        already_started = self._start_time is not None
        if already_started:
            logger.warning("Already started tracking")
            return

        # One timestamp serves as both the start time and the reference for
        # the first measurement.
        now = time.time()
        self._last_measured_time = now
        self._start_time = now

        # Read initial energy for hardware
        for component in self._hardware:
            component.start()

        self._scheduler.start()
Ejemplo n.º 9
0
    def _parse_scontrol(self, scontrol_str):
        """
        Extract the `mem=` value from the output of `scontrol show job`.

        :param scontrol_str: Text in which to look for `mem=<digits><LETTER>`
        :return: The matched value without its "mem=" prefix (str) when
                 exactly one match is found; otherwise a warning is logged
                 and the machine's total RAM is returned as a number
        """
        found = re.findall(r"mem=\d+[A-Z]", scontrol_str)

        # Exactly one match is the only case that can be trusted.
        if len(found) == 1:
            return found[0].replace("mem=", "")

        if not found:
            logger.warning(
                "Could not find mem= after running `scontrol show job $SLURM_JOBID` "
                +
                "to count SLURM-available RAM. Using the machine's total RAM.")
        else:
            logger.warning(
                "Unexpected output after running `scontrol show job $SLURM_JOBID` "
                +
                "to count SLURM-available RAM. Using the machine's total RAM.")
        return psutil.virtual_memory().total / B_TO_GB
Ejemplo n.º 10
0
 def add_emission(self, carbon_emission: dict):
     """
     Send one emission record to the API at `self.url + "/emission"`.

     If no `run_id` is available yet, the run creation is retried once; the
     emission is only abandoned when that retry still yields no `run_id`.
     Emissions with a duration smaller than 1 are not sent.

     :param carbon_emission: Mapping providing the keys "duration",
         "emissions", "emissions_rate", "cpu_power", "gpu_power",
         "ram_power", "cpu_energy", "gpu_energy", "ram_energy" and
         "energy_consumed"
     :return: True when the API answered with status code 201, False
         otherwise (nothing sent, unexpected status code, or exception)
     """
     assert self.experiment_id is not None
     self._previous_call = time.time()
     if self.run_id is None:
         # TODO : raise an Exception ?
         logger.debug(
             "ApiClient.add_emission need a run_id : the initial call may "
             + "have failed. Retrying..."
         )
         self._create_run(self.experiment_id)
         if self.run_id is None:
             logger.error(
                 "ApiClient.add_emission still no run_id, aborting for this time !"
             )
             # Give up only when the retry failed; after a successful retry
             # the emission is sent below instead of being dropped.
             return False
     if carbon_emission["duration"] < 1:
         logger.warning(
             "ApiClient : emissions not sent because of a duration smaller than 1."
         )
         return False
     emission = EmissionCreate(
         timestamp=get_datetime_with_timezone(),
         run_id=self.run_id,
         duration=int(carbon_emission["duration"]),
         emissions_sum=carbon_emission["emissions"],
         emissions_rate=carbon_emission["emissions_rate"],
         cpu_power=carbon_emission["cpu_power"],
         gpu_power=carbon_emission["gpu_power"],
         ram_power=carbon_emission["ram_power"],
         cpu_energy=carbon_emission["cpu_energy"],
         gpu_energy=carbon_emission["gpu_energy"],
         ram_energy=carbon_emission["ram_energy"],
         energy_consumed=carbon_emission["energy_consumed"],
     )
     try:
         payload = dataclasses.asdict(emission)
         url = self.url + "/emission"
         r = requests.post(url=url, json=payload, timeout=2)
         if r.status_code != 201:
             self._log_error(url, payload, r)
             return False
         logger.debug(f"ApiClient - Successful upload emission {payload} to {url}")
     except Exception as e:
         logger.error(e, exc_info=True)
         return False
     return True
Ejemplo n.º 11
0
    def total_power(self) -> Power:
        """
        Compute the RAM power from the tracked amount of memory.

        The machine's memory is used when the tracking mode is "machine",
        the process memory otherwise; the amount (GB) is multiplied by
        `self.power_per_GB` to obtain Watts.

        Returns:
            Power: the RAM power, or a power of 0 W when the measurement
            raised an exception (a warning is logged in that case)
        """
        try:
            if self._tracking_mode == "machine":
                tracked_memory_GB = self.machine_memory_GB
            else:
                tracked_memory_GB = self.process_memory_GB
            return Power.from_watts(tracked_memory_GB * self.power_per_GB)
        except Exception as e:
            logger.warning(f"Could not measure RAM Power ({str(e)})")
            return Power.from_watts(0)
Ejemplo n.º 12
0
    def from_utils(
        cls,
        output_dir: str,
        mode: str,
        model: Optional[str] = None,
        tdp: Optional[int] = None,
    ) -> "CPU":
        """
        Build an instance, filling in the model and TDP when not provided.

        :param output_dir: Forwarded to the constructor
        :param mode: Forwarded to the constructor
        :param model: CPU model; detected with `detect_cpu_model()` when None
        :param tdp: TDP to use; `POWER_CONSTANT` is used when None, and the
                    instance is then flagged with `_is_generic_tdp = True`
        :return: The new instance
        """
        if model is None:
            model = detect_cpu_model()
            if model is None:
                logger.warning("Could not read CPU model.")

        uses_generic_tdp = tdp is None
        if uses_generic_tdp:
            tdp = POWER_CONSTANT

        instance = cls(output_dir=output_dir, mode=mode, model=model, tdp=tdp)
        if uses_generic_tdp:
            # Only flagged when the TDP was not supplied by the caller.
            instance._is_generic_tdp = True
        return instance
Ejemplo n.º 13
0
def count_cpus() -> int:
    """
    Return the number of CPUs available.

    Without a `SLURM_JOB_ID` environment variable this is the machine's CPU
    count. Otherwise the `NumCPUs=` value is read from the output of
    `scontrol show job $SLURM_JOBID`; if the command fails or its output
    does not contain exactly one `NumCPUs=` entry, a warning is logged and
    the machine's CPU count is returned.
    """
    if os.environ.get("SLURM_JOB_ID") is None:
        return psutil.cpu_count()

    try:
        raw_output = subprocess.check_output(
            ["scontrol show job $SLURM_JOBID"], shell=True
        )
        scontrol = raw_output.decode()
    except subprocess.CalledProcessError:
        logger.warning(
            "Error running `scontrol show job $SLURM_JOBID` "
            + "to count SLURM-available cpus. Using the machine's cpu count."
        )
        return psutil.cpu_count()

    matches = re.findall(r"NumCPUs=\d+", scontrol)

    # Exactly one match is the only unambiguous answer.
    if len(matches) == 1:
        return int(matches[0].replace("NumCPUs=", ""))

    if not matches:
        logger.warning(
            "Could not find NumCPUs= after running `scontrol show job $SLURM_JOBID` "
            + "to count SLURM-available cpus. Using the machine's cpu count."
        )
    else:
        logger.warning(
            "Unexpected output after running `scontrol show job $SLURM_JOBID` "
            + "to count SLURM-available cpus. Using the machine's cpu count."
        )
    return psutil.cpu_count()
Ejemplo n.º 14
0
def suppress(*exceptions):
    """
    Generator that yields once and swallows the given exception types.

    If one of `exceptions` is raised at the yield point, it is logged as a
    warning (with traceback information) and not propagated; any other
    exception propagates unchanged.

    :param exceptions: Exception classes to swallow
    """
    try:
        yield
    except exceptions:
        # Log the single class itself rather than a one-element tuple.
        caught_types = exceptions[0] if len(exceptions) == 1 else exceptions
        logger.warning("graceful shutdown. Exceptions:")
        logger.warning(caught_types, exc_info=True)
        logger.warning("stopping.")
Ejemplo n.º 15
0
    def _measure_power_and_energy(self) -> None:
        """
        Periodic measurement step, meant to be run every
        `self._measure_power_secs` seconds.

        Every hardware component is measured; the global and per-type
        (CPU / GPU / RAM) energy totals and latest powers are updated and
        logged. When an API output is set and the API call interval is not
        -1, delta emissions are sent once the number of measurements since
        the last call reaches that interval.

        :return: None
        """
        last_duration = time.time() - self._last_measured_time

        # More than three intervals without a run is reported as a warning.
        if last_duration > self._measure_power_secs * 3:
            logger.warning(
                "Background scheduler didn't run for a long period" +
                " (%ds), results might be inaccurate",
                last_duration,
            )

        for device in self._hardware:
            measure_started = time.time()
            # Compute last_duration again for more accuracy
            last_duration = time.time() - self._last_measured_time
            power, energy = device.measure_power_and_energy(
                last_duration=last_duration)
            self._total_energy += energy

            if isinstance(device, CPU):
                self._total_cpu_energy += energy
                self._cpu_power = power
                logger.info(
                    f"Energy consumed for all CPUs : {self._total_cpu_energy.kWh:.6f} kWh"
                    + f". All CPUs Power : {self._cpu_power.W} W")
            elif isinstance(device, GPU):
                self._total_gpu_energy += energy
                self._gpu_power = power
                logger.info(
                    f"Energy consumed for all GPUs : {self._total_gpu_energy.kWh:.6f} kWh"
                    + f". All GPUs Power : {self._gpu_power.W} W")
            elif isinstance(device, RAM):
                self._total_ram_energy += energy
                self._ram_power = power
                logger.info(
                    f"Energy consumed for RAM : {self._total_ram_energy.kWh:.6f} kWh"
                    + f". RAM Power : {self._ram_power.W} W")
            else:
                logger.error(
                    f"Unknown hardware type: {device} ({type(device)})")

            # Time spent measuring and bookkeeping for this component.
            measure_elapsed = time.time() - measure_started
            logger.debug(
                f"{device.__class__.__name__} : {device.total_power().W:,.2f} "
                +
                f"W during {last_duration:,.2f} s [measurement time: {measure_elapsed:,.4f}]"
            )

        logger.info(
            f"{self._total_energy.kWh:.6f} kWh of electricity used since the begining."
        )
        self._last_measured_time = time.time()
        self._measure_occurrence += 1

        api_call_due = (self._cc_api__out is not None
                        and self._api_call_interval != -1
                        and self._measure_occurrence >= self._api_call_interval)
        if api_call_due:
            emissions = self._prepare_emissions_data(delta=True)
            logger.info(
                f"{emissions.emissions_rate:.6f} g.CO2eq/s mean an estimation of "
                +
                f"{emissions.emissions_rate*3600*24*365/1000:,} kg.CO2eq/year"
            )
            self._cc_api__out.out(emissions)
            # Restart the count towards the next API call.
            self._measure_occurrence = 0

        logger.debug(
            f"last_duration={last_duration}\n------------------------")
0
    def __init__(
        self,
        project_name: Optional[str] = _sentinel,
        measure_power_secs: Optional[int] = _sentinel,
        api_call_interval: Optional[int] = _sentinel,
        api_endpoint: Optional[str] = _sentinel,
        api_key: Optional[str] = _sentinel,
        output_dir: Optional[str] = _sentinel,
        output_file: Optional[str] = _sentinel,
        save_to_file: Optional[bool] = _sentinel,
        save_to_api: Optional[bool] = _sentinel,
        save_to_logger: Optional[bool] = _sentinel,
        logging_logger: Optional[LoggerOutput] = _sentinel,
        gpu_ids: Optional[List] = _sentinel,
        emissions_endpoint: Optional[str] = _sentinel,
        experiment_id: Optional[str] = _sentinel,
        co2_signal_api_token: Optional[str] = _sentinel,
        tracking_mode: Optional[str] = _sentinel,
        log_level: Optional[Union[int, str]] = _sentinel,
        on_csv_write: Optional[str] = _sentinel,
        logger_preamble: Optional[str] = _sentinel,
    ):
        """
        Set up the tracker: resolve the configuration, detect the hardware to
        track (RAM, GPU, CPU), create the measurement scheduler and register
        the outputs to which emissions are persisted.

        :param project_name: Project name for current experiment run, default name
                             as "codecarbon"
        :param measure_power_secs: Interval (in seconds) to measure hardware power
                                   usage, defaults to 15
        :param api_call_interval: Occurrence to wait before calling API,
                                  defaults to 8 :
                            -1 : only call api on flush() and at the end.
                            1 : at every measure
                            2 : every 2 measure, etc...
        :param api_endpoint: Optional URL of Code Carbon API endpoint for sending
                             emissions data
        :param api_key: API key for Code Carbon API, mandatory to use it !
        :param output_dir: Directory path to which the experiment details are logged,
                           defaults to current directory
        :param output_file: Name of output CSV file, defaults to `emissions.csv`
        :param save_to_file: Indicates if the emission artifacts should be logged to a
                             file, defaults to True
        :param save_to_api: Indicates if the emission artifacts should be sent to the
                            CodeCarbon API, defaults to False
        :param save_to_logger: Indicates if the emission artifacts should be written
                            to a dedicated logger, defaults to False
        :param logging_logger: LoggerOutput object encapsulating a logging.logger
                            or a Google Cloud logger
        :param gpu_ids: User-specified known gpu ids to track, defaults to None
        :param emissions_endpoint: Optional URL of http endpoint for sending emissions
                                   data
        :param experiment_id: Id of the experiment
        :param co2_signal_api_token: API token for co2signal.com (requires sign-up for
                                     free beta)
        :param tracking_mode: One of "process" or "machine" in order to measure the
                              power consumption due to the entire machine or to try
                              and isolate the tracked process.
                              Defaults to "machine"
        :param log_level: Global codecarbon log level. Accepts one of:
                            {"debug", "info", "warning", "error", "critical"}.
                          Defaults to "info".
        :param on_csv_write: "append" or "update". Whether to always append a new line
                             to the csv when writing or to update the existing `run_id`
                             row (useful when calling `tracker.flush()` manually).
                             Defaults to "append".
        :param logger_preamble: String to systematically include in the logger's
                                messages. Defaults to "".
        """

        # logger.info("base tracker init")
        self._external_conf = get_hierarchical_config()

        self._set_from_conf(api_call_interval, "api_call_interval", 8, int)
        self._set_from_conf(api_endpoint, "api_endpoint",
                            "https://api.codecarbon.io")
        self._set_from_conf(co2_signal_api_token, "co2_signal_api_token")
        self._set_from_conf(emissions_endpoint, "emissions_endpoint")
        self._set_from_conf(gpu_ids, "gpu_ids")
        self._set_from_conf(log_level, "log_level", "info")
        self._set_from_conf(measure_power_secs, "measure_power_secs", 15, int)
        self._set_from_conf(output_dir, "output_dir", ".")
        self._set_from_conf(output_file, "output_file", "emissions.csv")
        self._set_from_conf(project_name, "project_name", "codecarbon")
        self._set_from_conf(save_to_api, "save_to_api", False, bool)
        self._set_from_conf(save_to_file, "save_to_file", True, bool)
        self._set_from_conf(save_to_logger, "save_to_logger", False, bool)
        self._set_from_conf(logging_logger, "logging_logger")
        self._set_from_conf(tracking_mode, "tracking_mode", "machine")
        self._set_from_conf(on_csv_write, "on_csv_write", "append")
        self._set_from_conf(logger_preamble, "logger_preamble", "")

        assert self._tracking_mode in ["machine", "process"]
        set_logger_level(self._log_level)
        set_logger_format(self._logger_preamble)

        self._start_time: Optional[float] = None
        self._last_measured_time: float = time.time()
        self._total_energy: Energy = Energy.from_energy(kWh=0)
        self._total_cpu_energy: Energy = Energy.from_energy(kWh=0)
        self._total_gpu_energy: Energy = Energy.from_energy(kWh=0)
        self._total_ram_energy: Energy = Energy.from_energy(kWh=0)
        self._cpu_power: Power = Power.from_watts(watts=0)
        self._gpu_power: Power = Power.from_watts(watts=0)
        self._ram_power: Power = Power.from_watts(watts=0)
        self._cc_api__out = None
        self._measure_occurrence: int = 0
        self._cloud = None
        self._previous_emissions = None
        self._conf["os"] = platform.platform()
        self._conf["python_version"] = platform.python_version()
        self._conf["cpu_count"] = count_cpus()
        self._geo = None

        # GPU ids given as a string are parsed into a list.
        if isinstance(self._gpu_ids, str):
            self._gpu_ids: List[int] = parse_gpu_ids(self._gpu_ids)
            self._conf["gpu_ids"] = self._gpu_ids
            self._conf["gpu_count"] = len(self._gpu_ids)

        logger.info("[setup] RAM Tracking...")
        ram = RAM(tracking_mode=self._tracking_mode)
        self._conf["ram_total_size"] = ram.machine_memory_GB
        self._hardware: List[Union[RAM, CPU, GPU]] = [ram]

        # Hardware detection
        logger.info("[setup] GPU Tracking...")
        if gpu.is_gpu_details_available():
            logger.info("Tracking Nvidia GPU via pynvml")
            self._hardware.append(GPU.from_utils(self._gpu_ids))
            gpu_names = [n["name"] for n in gpu.get_gpu_static_info()]
            gpu_names_dict = Counter(gpu_names)
            self._conf["gpu_model"] = "".join(
                [f"{i} x {name}" for name, i in gpu_names_dict.items()])
            self._conf["gpu_count"] = len(gpu.get_gpu_static_info())
        else:
            logger.info("No GPU found.")

        logger.info("[setup] CPU Tracking...")
        if cpu.is_powergadget_available():
            logger.info("Tracking Intel CPU via Power Gadget")
            hardware = CPU.from_utils(self._output_dir, "intel_power_gadget")
            self._hardware.append(hardware)
            self._conf["cpu_model"] = hardware.get_model()
        elif cpu.is_rapl_available():
            logger.info("Tracking Intel CPU via RAPL interface")
            hardware = CPU.from_utils(self._output_dir, "intel_rapl")
            self._hardware.append(hardware)
            self._conf["cpu_model"] = hardware.get_model()
        else:
            logger.warning(
                "No CPU tracking mode found. Falling back on CPU constant mode."
            )
            tdp = cpu.TDP()
            power = tdp.tdp
            model = tdp.model
            logger.info(f"CPU Model on constant consumption mode: {model}")
            self._conf["cpu_model"] = model
            # Test the looked-up power, not the `tdp` object itself: the
            # object is always truthy, which made the fallback warning
            # below unreachable.
            if power:
                hardware = CPU.from_utils(self._output_dir, "constant", model,
                                          power)
            else:
                logger.warning("Failed to match CPU TDP constant. " +
                               "Falling back on a global constant.")
                # The model is still forwarded so the CPU keeps the name
                # reported above; only the TDP is left to the default.
                hardware = CPU.from_utils(self._output_dir, "constant", model)
            self._hardware.append(hardware)

        self._conf["hardware"] = list(
            map(lambda x: x.description(), self._hardware))

        logger.info(">>> Tracker's metadata:")
        logger.info(f"  Platform system: {self._conf.get('os')}")
        logger.info(f"  Python version: {self._conf.get('python_version')}")
        logger.info(
            f"  Available RAM : {self._conf.get('ram_total_size'):.3f} GB")
        logger.info(f"  CPU count: {self._conf.get('cpu_count')}")
        logger.info(f"  CPU model: {self._conf.get('cpu_model')}")
        logger.info(f"  GPU count: {self._conf.get('gpu_count')}")
        logger.info(f"  GPU model: {self._conf.get('gpu_model')}")

        # Run `self._measure_power_and_energy` every `measure_power_secs`
        # seconds through the scheduler
        self._scheduler = PeriodicScheduler(
            function=self._measure_power_and_energy,
            interval=self._measure_power_secs,
        )

        self._data_source = DataSource()

        cloud: CloudMetadata = self._get_cloud_metadata()

        # Geo metadata is only fetched when running on private infra.
        if cloud.is_on_private_infra:
            self._geo = self._get_geo_metadata()
            self._conf["longitude"] = self._geo.longitude
            self._conf["latitude"] = self._geo.latitude
        self._conf["region"] = cloud.region
        self._conf["provider"] = cloud.provider

        self._emissions: Emissions = Emissions(self._data_source,
                                               self._co2_signal_api_token)
        self.persistence_objs: List[BaseOutput] = list()

        if self._save_to_file:
            self.persistence_objs.append(
                FileOutput(
                    os.path.join(self._output_dir, self._output_file),
                    self._on_csv_write,
                ))

        if self._save_to_logger:
            self.persistence_objs.append(self._logging_logger)

        if self._emissions_endpoint:
            # Use the resolved value: the raw argument may still be the
            # sentinel when the endpoint comes from the configuration.
            self.persistence_objs.append(
                HTTPOutput(self._emissions_endpoint))

        if self._save_to_api:
            experiment_id = self._set_from_conf(
                experiment_id, "experiment_id",
                "5b0fa12a-3dd7-45bb-9766-cc326314d9f1")
            # NOTE(review): `api_key` is forwarded exactly as received and
            # is not resolved through `_set_from_conf` — confirm that this
            # is intended.
            self._cc_api__out = CodeCarbonAPIOutput(
                endpoint_url=self._api_endpoint,
                experiment_id=experiment_id,
                api_key=api_key,
                conf=self._conf,
            )
            self.run_id = self._cc_api__out.run_id
            self.persistence_objs.append(self._cc_api__out)

        else:
            self.run_id = uuid.uuid4()