def _fetch_rapl_files(self):
    """Scan the RAPL sysfs directory and register every readable energy file."""
    # Entries of interest look like `intel-rapl:$i`
    candidates = [entry for entry in os.listdir(self._lin_rapl_dir) if ":" in entry]
    package_count = 0
    for entry in candidates:
        name_path = os.path.join(self._lin_rapl_dir, entry, "name")
        with open(name_path) as name_file:
            domain_name = name_file.read().strip()
        # Mimic the metric name that Intel Power Gadget would report
        if "package" in domain_name:
            domain_name = f"Processor Energy Delta_{package_count}(kWh)"
            package_count += 1
        energy_path = os.path.join(self._lin_rapl_dir, entry, "energy_uj")
        try:
            # Probe the counter once so later reads are known to work
            with open(energy_path, "r") as energy_file:
                float(energy_file.read())
        except PermissionError as e:
            logger.error(
                "Unable to read Intel RAPL files for CPU power, we will use a constant for your CPU power."
                + " Please view https://github.com/mlco2/codecarbon/issues/244"
                + f" for workarounds : {e}")
        else:
            self._rapl_files.append(RAPLFile(name=domain_name, path=energy_path))
            logger.debug(
                f"We will read Intel RAPL files at {energy_path}")
    return
def _main(self) -> Tuple[str, int]:
    """
    Resolve the CPU model and its TDP for constant-power mode.

    :return: model name (str), power in Watt (int, or None when unknown)
    """
    model = detect_cpu_model()
    if not model:
        # cpuinfo could not identify the processor at all
        logger.warning(
            "We were unable to detect your CPU using the `cpuinfo` package."
            + " Resorting to a default power consumption of 85W.")
        return "Unknown", None

    tdp = self._get_cpu_power_from_registry(model)
    if not tdp:
        # The model was detected but is absent from our TDP registry
        logger.warning(
            f"We saw that you have a {model} but we don't know it."
            + " Please contact us.")
        return model, None

    logger.debug(
        f"CPU : We detect a {model} with a TDP of {tdp} W")
    return model, tdp
def get_static_cpu_details(self) -> Dict:
    """
    Return the CPU details cached by the last computation,
    without triggering a new measurement.
    """
    details = self.cpu_details
    logger.debug(f"get_static_cpu_details {details}")
    return details
def get_cpu_details(self, duration: Time, **kwargs) -> Dict:
    """
    Fetch the CPU energy deltas by reading values from the RAPL files.

    :param duration: time elapsed since the previous measurement, used by
        each RAPLFile to derive an energy delta (and average power).
    :return: mapping of metric name to value — energy deltas in kWh, plus a
        matching "Power" entry (in W) for metrics whose name contains "Energy".
    """
    cpu_details = dict()
    try:
        # Update every RAPL counter first so all deltas cover the same window.
        # (Plain loop: the original `list(map(...))` built a throwaway list
        # purely for the side effect of calling delta().)
        for rapl_file in self._rapl_files:
            rapl_file.delta(duration)
        for rapl_file in self._rapl_files:
            logger.debug(rapl_file)
            cpu_details[rapl_file.name] = rapl_file.energy_delta.kWh
            # We fake the name used by Power Gadget when using RAPL
            if "Energy" in rapl_file.name:
                cpu_details[rapl_file.name.replace(
                    "Energy", "Power")] = rapl_file.power.W
    except Exception as e:
        # Best-effort: log and return whatever was gathered so far.
        logger.info(
            f"Unable to read Intel RAPL files at {self._rapl_files}\n \
            Exception occurred {e}",
            exc_info=True,
        )
    self.cpu_details = cpu_details
    logger.debug(f"get_cpu_details {self.cpu_details}")
    return cpu_details
def _get_power_from_cpus(self) -> Power:
    """
    Compute the current CPU power draw according to the active mode.

    :return: Power built from the summed Watt values
    """
    if self._mode == "constant":
        # Constant mode: a fixed fraction of the processor's TDP.
        return Power.from_watts(self._tdp * CONSUMPTION_PERCENTAGE_CONSTANT)

    if self._mode == "intel_rapl":
        # Don't call get_cpu_details to avoid computing energy twice and losing data.
        details: Dict = self._intel_interface.get_static_cpu_details()
    else:
        details: Dict = self._intel_interface.get_cpu_details()

    total_watts = 0
    for metric, value in details.items():
        # e.g. "^Processor Power_\d+\(Watt\)$" for Intel Power Gadget
        if re.match(r"^Processor Power", metric):
            total_watts += value
            logger.debug(
                f"_get_power_from_cpus - MATCH {metric} : {value}")
        else:
            logger.debug(
                f"_get_power_from_cpus - DONT MATCH {metric} : {value}")
    return Power.from_watts(total_watts)
def get_country_emissions(self, energy: Energy, geo: GeoMetadata) -> float:
    """
    Compute emissions for a country on private infra, given a quantity of
    energy consumed, by using data for the mix of energy sources of that
    country.

    :param energy: Energy consumed by the process (kWh)
    :param geo: Country and region metadata
    :return: CO2 emissions in kg
    """
    energy_mix = self._data_source.get_global_energy_mix_data()

    if geo.country_iso_code not in energy_mix:
        logger.warning(
            f"We do not have data for {geo.country_iso_code}, using world average."
        )
        # Fall back to the world-average carbon intensity.
        # Use the injected data source for consistency with the
        # energy-mix lookup above (the original instantiated a fresh
        # DataSource() here).
        carbon_intensity_per_source = (
            self._data_source.get_carbon_intensity_per_source_data())
        return (EmissionsPerKWh.from_g_per_kWh(
            carbon_intensity_per_source.get("world_average")).kgs_per_kWh *
                energy.kWh)  # kgs

    country_energy_mix: Dict = energy_mix[geo.country_iso_code]
    emissions_per_kWh = self._global_energy_mix_to_emissions_rate(
        country_energy_mix)
    logger.debug(
        f"We apply an energy mix of {emissions_per_kWh.kgs_per_kWh*1000:.0f}"
        + f" g.CO2eq/kWh for {geo.country_name}")
    return emissions_per_kWh.kgs_per_kWh * energy.kWh  # kgs
def is_rapl_available():
    """Return True when an IntelRAPL interface can be instantiated."""
    try:
        IntelRAPL()
    except Exception as e:
        logger.debug(
            "Not using the RAPL interface, an exception occurred while instantiating "
            + f"IntelRAPL : {e}",
        )
        return False
    return True
def is_powergadget_available():
    """Return True when an IntelPowerGadget interface can be instantiated."""
    try:
        IntelPowerGadget()
    except Exception as e:
        logger.debug(
            "Not using PowerGadget, an exception occurred while instantiating"
            + f" IntelPowerGadget : {e}",
        )
        return False
    return True
def get_gpu_details():
    """Query instantaneous metrics for every visible GPU.

    Returns a list with one dict per device, e.g.::

        [
            {
                "name": "Tesla V100-SXM2-16GB",
                "uuid": "GPU-4e817856-1fb8-192a-7ab7-0e0e4476c184",
                "free_memory": 16945381376,
                "total_memory": 16945512448,
                "used_memory": 131072,
                "temperature": 28,
                "power_usage": 42159,
                "power_limit": 300000,
                "gpu_utilization": 0,
                "compute_mode": 0,
                "compute_processes": [],
                "graphics_processes": [],
            }
        ]
    """

    def _device_metrics(handle):
        # Gather every instantaneous metric for one device handle.
        memory = get_memory_info(handle)
        return {
            "name": get_gpu_name(handle),
            "uuid": get_uuid(handle),
            "free_memory": memory.free,
            "total_memory": memory.total,
            "used_memory": memory.used,
            "temperature": get_temperature(handle),
            "power_usage": get_power_usage(handle),
            "power_limit": get_power_limit(handle),
            "gpu_utilization": get_gpu_utilization(handle),
            "compute_mode": get_compute_mode(handle),
            "compute_processes": get_compute_processes(handle),
            "graphics_processes": get_graphics_processes(handle),
        }

    try:
        pynvml.nvmlInit()
        count = pynvml.nvmlDeviceGetCount()
        return [
            _device_metrics(pynvml.nvmlDeviceGetHandleByIndex(idx))
            for idx in range(count)
        ]
    except pynvml.NVMLError:
        logger.debug("Failed to retrieve gpu information", exc_info=True)
        return []
def add_emission(self, carbon_emission: dict):
    """Upload one emission record to the API; return True on success."""
    assert self.experiment_id is not None
    self._previous_call = time.time()

    if self.run_id is None:
        # TODO : raise an Exception ?
        logger.debug(
            "ApiClient.add_emission need a run_id : the initial call may "
            + "have failed. Retrying..."
        )
        self._create_run(self.experiment_id)
    if self.run_id is None:
        # The retry above did not help either: give up for this round.
        logger.error(
            "ApiClient.add_emission still no run_id, aborting for this time !"
        )
        return False

    if carbon_emission["duration"] < 1:
        logger.warning(
            "ApiClient : emissions not sent because of a duration smaller than 1."
        )
        return False

    emission = EmissionCreate(
        timestamp=get_datetime_with_timezone(),
        run_id=self.run_id,
        duration=int(carbon_emission["duration"]),
        emissions_sum=carbon_emission["emissions"],
        emissions_rate=carbon_emission["emissions_rate"],
        cpu_power=carbon_emission["cpu_power"],
        gpu_power=carbon_emission["gpu_power"],
        ram_power=carbon_emission["ram_power"],
        cpu_energy=carbon_emission["cpu_energy"],
        gpu_energy=carbon_emission["gpu_energy"],
        ram_energy=carbon_emission["ram_energy"],
        energy_consumed=carbon_emission["energy_consumed"],
    )
    try:
        payload = dataclasses.asdict(emission)
        url = self.url + "/emission"
        response = requests.post(url=url, json=payload, timeout=2)
        if response.status_code != 201:
            self._log_error(url, payload, response)
            return False
        logger.debug(f"ApiClient - Successful upload emission {payload} to {url}")
    except Exception as e:
        logger.error(e, exc_info=True)
        return False
    return True
def get_env_cloud_details(timeout=1):
    # type: (int) -> Optional[Any]
    """Probe each known cloud metadata endpoint; return the first match.

    >>> get_env_cloud_details()
    {'provider': 'AWS',
     'metadata': {'accountId': '26550917306',
      'architecture': 'x86_64',
      'availabilityZone': 'us-east-1b',
      'billingProducts': None,
      'devpayProductCodes': None,
      'marketplaceProductCodes': None,
      'imageId': 'ami-025ed45832b817a35',
      'instanceId': 'i-7c3e81fed58d8f7f7',
      'instanceType': 'g4dn.2xlarge',
      'kernelId': None,
      'pendingTime': '2020-01-23T20:44:53Z',
      'privateIp': '172.156.72.143',
      'ramdiskId': None,
      'region': 'us-east-1',
      'version': '2017-09-30'}}
    """
    for provider, params in CLOUD_METADATA_MAPPING.items():
        try:
            response = requests.get(params["url"],
                                    headers=params["headers"],
                                    timeout=timeout)
            response.raise_for_status()
            payload = response.json()

            postprocess = params.get("postprocess_function")
            if postprocess is not None:
                payload = postprocess(payload)
            return {"provider": provider, "metadata": payload}
        except Exception as e:
            # Not on this provider (or the endpoint is unreachable): try the next.
            logger.debug("Not running on %s, couldn't retrieve metadata: %r",
                         provider, e)
    return None
def get_gpu_static_info():
    """Collect static (non-changing) information for every visible GPU.

    Returns a list with one dict per device, e.g.::

        [
            {
                "name": "Tesla V100-SXM2-16GB",
                "uuid": "GPU-4e817856-1fb8-192a-7ab7-0e0e4476c184",
                "total_memory": 16945512448,
                "power_limit": 300000,
                "gpu_index": 0,
            }
        ]
    """

    def _static_details(index, handle):
        # Static attributes of a single device handle.
        memory = get_memory_info(handle)
        return {
            "name": get_gpu_name(handle),
            "uuid": get_uuid(handle),
            "total_memory": memory.total,
            "power_limit": get_power_limit(handle),
            "gpu_index": index,
        }

    try:
        pynvml.nvmlInit()
        count = pynvml.nvmlDeviceGetCount()
        return [
            _static_details(idx, pynvml.nvmlDeviceGetHandleByIndex(idx))
            for idx in range(count)
        ]
    except pynvml.NVMLError:
        logger.debug("Failed to retrieve gpu static info", exc_info=True)
        return []
def _measure_power_and_energy(self) -> None:
    """
    A function that is periodically run by the `BackgroundScheduler`
    every `self._measure_power_secs` seconds.
    :return: None
    """
    last_duration = time.time() - self._last_measured_time

    # Warn when the scheduler stalled for more than 3 measurement periods;
    # a long gap makes the per-interval power estimate unreliable.
    warning_duration = self._measure_power_secs * 3
    if last_duration > warning_duration:
        warn_msg = ("Background scheduler didn't run for a long period" +
                    " (%ds), results might be inaccurate")
        logger.warning(warn_msg, last_duration)

    for hardware in self._hardware:
        h_time = time.time()
        # Compute last_duration again for more accuracy
        last_duration = time.time() - self._last_measured_time
        power, energy = hardware.measure_power_and_energy(
            last_duration=last_duration)
        # Accumulate into the grand total, then into the per-device totals below.
        self._total_energy += energy
        if isinstance(hardware, CPU):
            self._total_cpu_energy += energy
            self._cpu_power = power
            logger.info(
                f"Energy consumed for all CPUs : {self._total_cpu_energy.kWh:.6f} kWh"
                + f". All CPUs Power : {self._cpu_power.W} W")
        elif isinstance(hardware, GPU):
            self._total_gpu_energy += energy
            self._gpu_power = power
            logger.info(
                f"Energy consumed for all GPUs : {self._total_gpu_energy.kWh:.6f} kWh"
                + f". All GPUs Power : {self._gpu_power.W} W")
        elif isinstance(hardware, RAM):
            self._total_ram_energy += energy
            self._ram_power = power
            logger.info(
                f"Energy consumed for RAM : {self._total_ram_energy.kWh:.6f} kWh"
                + f". RAM Power : {self._ram_power.W} W")
        else:
            logger.error(
                f"Unknown hardware type: {hardware} ({type(hardware)})")
        # h_time now holds how long this device's measurement itself took.
        h_time = time.time() - h_time
        logger.debug(
            f"{hardware.__class__.__name__} : {hardware.total_power().W:,.2f} "
            + f"W during {last_duration:,.2f} s [measurement time: {h_time:,.4f}]"
        )
    # NOTE(review): "begining" typo is in the emitted log string — not changed here.
    logger.info(
        f"{self._total_energy.kWh:.6f} kWh of electricity used since the begining."
    )

    self._last_measured_time = time.time()
    self._measure_occurrence += 1
    # Push to the CodeCarbon API every `_api_call_interval` measurements
    # (-1 disables the periodic upload).
    if self._cc_api__out is not None and self._api_call_interval != -1:
        if self._measure_occurrence >= self._api_call_interval:
            emissions = self._prepare_emissions_data(delta=True)
            logger.info(
                f"{emissions.emissions_rate:.6f} g.CO2eq/s mean an estimation of "
                + f"{emissions.emissions_rate*3600*24*365/1000:,} kg.CO2eq/year"
            )
            self._cc_api__out.out(emissions)
            self._measure_occurrence = 0
    logger.debug(
        f"last_duration={last_duration}\n------------------------")
def _prepare_emissions_data(self, delta=False) -> EmissionsData:
    """
    Assemble an EmissionsData snapshot from the accumulated measurements.

    :param delta: True to return only the delta consumption since last call
    :return: EmissionsData — cumulative totals, or (when ``delta`` and a
        previous snapshot exists) the delta against the previous call
    """
    cloud: CloudMetadata = self._get_cloud_metadata()
    duration: Time = Time.from_seconds(time.time() - self._start_time)

    if cloud.is_on_private_infra:
        # Private infrastructure: emissions derived from the local geography.
        emissions = self._emissions.get_private_infra_emissions(
            self._total_energy, self._geo)  # float: kg co2_eq
        country_name = self._geo.country_name
        country_iso_code = self._geo.country_iso_code
        region = self._geo.region
        on_cloud = "N"
        cloud_provider = ""
        cloud_region = ""
    else:
        # Cloud: emissions and location taken from the provider metadata.
        emissions = self._emissions.get_cloud_emissions(
            self._total_energy, cloud)
        country_name = self._emissions.get_cloud_country_name(cloud)
        country_iso_code = self._emissions.get_cloud_country_iso_code(
            cloud)
        region = self._emissions.get_cloud_geo_region(cloud)
        on_cloud = "Y"
        cloud_provider = cloud.provider
        cloud_region = cloud.region
    total_emissions = EmissionsData(
        timestamp=datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        project_name=self._project_name,
        run_id=str(self.run_id),
        duration=duration.seconds,
        emissions=emissions,
        emissions_rate=emissions * 1000 / duration.seconds,  # g/s
        cpu_power=self._cpu_power.W,
        gpu_power=self._gpu_power.W,
        ram_power=self._ram_power.W,
        cpu_energy=self._total_cpu_energy.kWh,
        gpu_energy=self._total_gpu_energy.kWh,
        ram_energy=self._total_ram_energy.kWh,
        energy_consumed=self._total_energy.kWh,
        country_name=country_name,
        country_iso_code=country_iso_code,
        region=region,
        on_cloud=on_cloud,
        cloud_provider=cloud_provider,
        cloud_region=cloud_region,
        os=self._conf.get("os"),
        python_version=self._conf.get("python_version"),
        gpu_count=self._conf.get("gpu_count"),
        gpu_model=self._conf.get("gpu_model"),
        cpu_count=self._conf.get("cpu_count"),
        cpu_model=self._conf.get("cpu_model"),
        longitude=self._conf.get("longitude"),
        latitude=self._conf.get("latitude"),
        ram_total_size=self._conf.get("ram_total_size"),
        tracking_mode=self._conf.get("tracking_mode"),
    )
    if delta:
        if self._previous_emissions is None:
            # First call in delta mode: nothing to diff against yet,
            # so return the cumulative totals and remember them.
            self._previous_emissions = total_emissions
        else:
            # Create a copy
            delta_emissions = dataclasses.replace(total_emissions)
            # Compute emissions rate from delta
            delta_emissions.compute_delta_emission(
                self._previous_emissions)
            # TODO : find a way to store _previous_emissions only when
            # TODO : the API call succeeded
            self._previous_emissions = total_emissions
            total_emissions = delta_emissions
    logger.debug(total_emissions)
    return total_emissions
# NOTE(review): the three lines below are the tail of a decorator call
# (presumably `@track_emissions(...)`) whose opening lies above this chunk —
# kept verbatim; confirm against the full file.
api_call_interval=4,
    api_key="12aaaaaa-0b23-1234-1234-abcdef123456",
    save_to_api=True,
)
def train_model():
    """
    This function will do nothing during (occurrence * delay) seconds.
    The Code Carbon API will be called every
    (measure_power_secs * api_call_interval) seconds.
    """
    occurrence = 60 * 24 * 365 * 100  # Run for 100 years !
    delay = 60  # Seconds
    for i in range(occurrence):
        print(
            f"{occurrence * delay - i * delay} seconds before ending script..."
        )
        time.sleep(delay)


if __name__ == "__main__":
    logger.setLevel(logging.DEBUG)
    # create file handler which logs even debug messages
    fh = logging.FileHandler("codecarbon.log")
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)-12s: %(levelname)-8s %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.debug("GO!")
    # NOTE(review): train_model() has no return statement, so `model` is None.
    model = train_model()
    logger.debug("THE END!")