def test_empty_gpu_card_indices_on_cuda_env_variable_minus_one(self):
        # Scenario: CUDA_VISIBLE_DEVICES is "-1" while the GPU monitor reports
        # four cards; the resulting resource info must list no card indices.

        # given
        mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
        mocked_gpu_monitor.get_card_count.return_value = 4
        # and
        environ = {u'CUDA_VISIBLE_DEVICES': u'-1'}
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=mocked_gpu_monitor,
            os_environ=environ,
        )

        # when
        created_info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([], created_info.gpu_card_indices)
    def test_gpu_card_indices_without_cuda_env_variable(self):
        # Scenario: the environment mapping is empty (no CUDA_VISIBLE_DEVICES)
        # and the GPU monitor reports two cards; indices 0 and 1 are expected.

        # given
        mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
        mocked_gpu_monitor.get_card_count.return_value = 2
        # and
        empty_environ = {}
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=mocked_gpu_monitor,
            os_environ=empty_environ,
        )

        # when
        created_info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([0, 1], created_info.gpu_card_indices)
    def test_cgroup_resource_info(self):
        # Scenario: resource info is built in CGROUP mode from the real
        # monitors and the process environment; only sanity bounds are checked
        # because the actual values depend on the host running the test.

        # given
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=os.environ,
        )

        # when
        cgroup_info = factory.create(GaugeMode.CGROUP)

        # then
        # CPU and memory must be strictly positive.
        self.assert_float_greater_than(cgroup_info.cpu_core_count, 0)
        self.assert_int_greater_than(cgroup_info.memory_amount_bytes, 0)
        # GPU figures may legitimately be zero.
        self.assert_int_greater_or_equal(cgroup_info.gpu_card_count, 0)
        self.assert_int_greater_or_equal(cgroup_info.gpu_memory_amount_bytes, 0)
    def test_should_ignore_invalid_cuda_env_variable_syntax(self):
        # Scenario: CUDA_VISIBLE_DEVICES contains a non-numeric entry ("abc")
        # while the GPU monitor reports four cards; the expected outcome is
        # that all four card indices are reported.

        # given
        mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
        mocked_gpu_monitor.get_card_count.return_value = 4
        # and
        environ = {u'CUDA_VISIBLE_DEVICES': u'1,3,abc'}
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=mocked_gpu_monitor,
            os_environ=environ,
        )

        # when
        created_info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([0, 1, 2, 3], created_info.gpu_card_indices)
    def test_should_ignore_gpu_indices_after_index_out_of_range(self):
        # Scenario: CUDA_VISIBLE_DEVICES is "1,3,5,2" while the GPU monitor
        # reports four cards, so index 5 is out of range; only the indices
        # listed before it (1 and 3) are expected in the result.

        # given
        mocked_gpu_monitor = MagicMock(spec_set=GPUMonitor)
        mocked_gpu_monitor.get_card_count.return_value = 4
        # and
        environ = {u'CUDA_VISIBLE_DEVICES': u'1,3,5,2'}
        factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=mocked_gpu_monitor,
            os_environ=environ,
        )

        # when
        created_info = factory.create(GaugeMode.SYSTEM)

        # then
        self.assertEqual([1, 3], created_info.gpu_card_indices)
Ejemplo n.º 6
0
    def create(self, gauge_mode, experiment, reference_timestamp):
        """Assemble a ``MetricService`` reporting hardware metrics.

        Steps, in order: build the system resource info for ``gauge_mode``,
        create the metrics container, register each metric through the
        backend's ``create_hardware_metric`` (the returned value is stored as
        the metric's ``internal_id``), and create a metric reporter from
        ``reference_timestamp``.

        :param gauge_mode: passed to the resource info factory and the
            gauge factory.
        :param experiment: passed to the backend for each metric and to the
            returned service.
        :param reference_timestamp: passed to ``MetricReporterFactory``.
        :return: a ``MetricService`` wired with the backend, reporter,
            experiment and metrics container.
        """
        resource_info_factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=self.__os_environ,
        )
        resource_info = resource_info_factory.create(gauge_mode=gauge_mode)

        container = MetricsFactory(
            gauge_factory=GaugeFactory(gauge_mode=gauge_mode),
            system_resource_info=resource_info,
        ).create_metrics_container()

        # Register every metric with the backend before the reporter is built.
        for hardware_metric in container.metrics():
            registered_id = self.__backend.create_hardware_metric(
                experiment, hardware_metric)
            hardware_metric.internal_id = registered_id

        reporter_factory = MetricReporterFactory(reference_timestamp)
        reporter = reporter_factory.create(metrics=container.metrics())

        return MetricService(
            backend=self.__backend,
            metric_reporter=reporter,
            experiment=experiment,
            metrics_container=container,
        )
Ejemplo n.º 7
0
    def start(self, run: "Run"):
        """Set up hardware metric attributes on ``run`` and start reporting.

        The gauge mode is CGROUP when ``in_docker()`` is truthy and SYSTEM
        otherwise. For every gauge of every metric, an empty ``FloatSeries``
        is assigned under the gauge's attribute path unless
        ``run.get_attribute(path)`` already returns something truthy. Finally
        a ``ReportingThread`` is created and started, and the instance is
        flagged as started.

        :param run: the run object that receives the metric attributes and is
            handed to the reporting thread.
        """
        if in_docker():
            gauge_mode = GaugeMode.CGROUP
        else:
            gauge_mode = GaugeMode.SYSTEM

        resource_info_factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=os.environ,
        )
        resource_info = resource_info_factory.create(gauge_mode=gauge_mode)

        container = MetricsFactory(
            gauge_factory=GaugeFactory(gauge_mode=gauge_mode),
            system_resource_info=resource_info,
        ).create_metrics_container()

        reporter_factory = MetricReporterFactory(time.time())
        reporter = reporter_factory.create(metrics=container.metrics())

        # First pass: record how many gauges each resource type has.
        # NOTE(review): kept separate from the pass below because
        # get_attribute_name may consult these counts - confirm before merging.
        for metric in container.metrics():
            self._gauges_in_resource[metric.resource_type] = len(metric.gauges)

        # Second pass: make sure each gauge has a series attribute on the run.
        for metric in container.metrics():
            for gauge in metric.gauges:
                attribute_path = self.get_attribute_name(
                    metric.resource_type, gauge.name()
                )
                if run.get_attribute(attribute_path):
                    # Attribute already present; leave it untouched.
                    continue
                run[attribute_path] = FloatSeries(
                    [],
                    min=metric.min_value,
                    max=metric.max_value,
                    unit=metric.unit,
                )

        self._thread = self.ReportingThread(self, self._period, run, reporter)
        self._thread.start()
        self._started = True