Esempio n. 1
0
    def create(self, gauge_mode, experiment, reference_timestamp):
        """Assemble a ``MetricService`` for the given experiment.

        The method builds system resource info for ``gauge_mode``, creates a
        metrics container from it, registers every metric through the
        backend's ``create_hardware_metric`` (storing the returned value as
        the metric's ``internal_id``), and creates a metric reporter from
        ``reference_timestamp``.

        Args:
            gauge_mode: Forwarded to ``SystemResourceInfoFactory.create`` and
                to ``GaugeFactory``.
            experiment: Forwarded to the backend when registering metrics and
                to the resulting ``MetricService``.
            reference_timestamp: Forwarded to ``MetricReporterFactory``.

        Returns:
            A ``MetricService`` wired with the backend, the reporter, the
            experiment and the metrics container.
        """
        resource_info_factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=self.__os_environ,
        )
        resource_info = resource_info_factory.create(gauge_mode=gauge_mode)

        container = MetricsFactory(
            gauge_factory=GaugeFactory(gauge_mode=gauge_mode),
            system_resource_info=resource_info,
        ).create_metrics_container()

        # Register each metric with the backend and remember what it returns.
        for hardware_metric in container.metrics():
            hardware_metric.internal_id = self.__backend.create_hardware_metric(
                experiment, hardware_metric)

        reporter_factory = MetricReporterFactory(reference_timestamp)
        reporter = reporter_factory.create(metrics=container.metrics())

        return MetricService(
            backend=self.__backend,
            metric_reporter=reporter,
            experiment=experiment,
            metrics_container=container,
        )
Esempio n. 2
0
    def start(self, run: "Run"):
        """Create missing metric series on ``run`` and start the reporting thread.

        The gauge mode is ``GaugeMode.CGROUP`` when ``in_docker()`` is truthy
        and ``GaugeMode.SYSTEM`` otherwise. For every gauge of every metric an
        attribute path is computed with ``get_attribute_name``; when
        ``run.get_attribute(path)`` is falsy, an empty ``FloatSeries`` carrying
        the metric's min, max and unit is assigned at that path. Finally a
        ``ReportingThread`` is created, stored in ``self._thread`` and started,
        and ``self._started`` is set to ``True``.

        Args:
            run: Object that receives the series; it is also handed to the
                reporting thread.
        """
        if in_docker():
            gauge_mode = GaugeMode.CGROUP
        else:
            gauge_mode = GaugeMode.SYSTEM

        resource_info_factory = SystemResourceInfoFactory(
            system_monitor=SystemMonitor(),
            gpu_monitor=GPUMonitor(),
            os_environ=os.environ,
        )
        resource_info = resource_info_factory.create(gauge_mode=gauge_mode)

        container = MetricsFactory(
            gauge_factory=GaugeFactory(gauge_mode=gauge_mode),
            system_resource_info=resource_info,
        ).create_metrics_container()

        reporter_factory = MetricReporterFactory(time.time())
        reporter = reporter_factory.create(metrics=container.metrics())

        # First pass: record the gauge count per resource type.
        # NOTE(review): kept as a separate pass ahead of the naming loop;
        # get_attribute_name may rely on these counts - confirm.
        for metric in container.metrics():
            self._gauges_in_resource[metric.resource_type] = len(metric.gauges)

        # Second pass: create a series for every gauge that has none yet.
        for metric in container.metrics():
            for gauge in metric.gauges:
                attribute_path = self.get_attribute_name(
                    metric.resource_type, gauge.name()
                )
                if run.get_attribute(attribute_path):
                    continue
                run[attribute_path] = FloatSeries(
                    [],
                    min=metric.min_value,
                    max=metric.max_value,
                    unit=metric.unit,
                )

        self._thread = self.ReportingThread(self, self._period, run, reporter)
        self._thread.start()
        self._started = True
Esempio n. 3
0
    def test_format_fractional_cpu_core_count(self):
        """Check the CPU usage metric built for a 0.5 CPU core count."""
        # given
        resource_info = SystemResourceInfo(
            cpu_core_count=0.5,
            memory_amount_bytes=2 * BYTES_IN_ONE_GB,
            gpu_card_indices=[],
            gpu_memory_amount_bytes=0,
        )
        # and
        factory = MetricsFactory(self.gauge_factory, resource_info)

        # when
        container = factory.create_metrics_container()

        # then
        expected_cpu_metric = Metric(
            name=u"CPU - usage",
            description=u"average of all cores",
            resource_type=MetricResourceType.CPU,
            unit=u"%",
            min_value=0.0,
            max_value=100.0,
            gauges=[SystemCpuUsageGauge()],
        )
        self.assertEqual(expected_cpu_metric, container.cpu_usage_metric)
Esempio n. 4
0
    def test_create_metrics_without_gpu(self):
        """Check that no GPU metrics are created when there are no GPU cards.

        With an empty ``gpu_card_indices`` the container is expected to expose
        CPU and memory metrics, ``None`` for both GPU metrics, and
        ``metrics()`` equal to ``[cpu_usage_metric, memory_metric]``.
        """
        # given
        resource_info = SystemResourceInfo(
            cpu_core_count=4,
            memory_amount_bytes=16 * BYTES_IN_ONE_GB,
            gpu_card_indices=[],
            gpu_memory_amount_bytes=0,
        )
        # and
        factory = MetricsFactory(self.gauge_factory, resource_info)

        # when
        container = factory.create_metrics_container()

        # then
        self.assertIsNotNone(container.cpu_usage_metric)
        self.assertIsNotNone(container.memory_metric)
        self.assertIsNone(container.gpu_usage_metric)
        self.assertIsNone(container.gpu_memory_metric)

        # and
        expected_metrics = [
            container.cpu_usage_metric,
            container.memory_metric,
        ]
        self.assertEqual(expected_metrics, container.metrics())
    def test_create_metrics_with_gpu(self):
        """Check all four metrics built for a system with two GPU cards.

        The resource info describes 4 CPU cores, 16 GB of memory and GPU cards
        0 and 1 with 8 GB of GPU memory. The CPU, RAM, GPU usage and GPU
        memory metrics are each compared against a fully spelled-out
        expected ``Metric``.
        """
        # given
        resource_info = SystemResourceInfo(
            cpu_core_count=4,
            memory_amount_bytes=16 * BYTES_IN_ONE_GB,
            gpu_card_indices=[0, 1],
            gpu_memory_amount_bytes=8 * BYTES_IN_ONE_GB,
        )
        # and
        factory = MetricsFactory(self.gauge_factory, resource_info)

        # when
        container = factory.create_metrics_container()

        # then
        expected_cpu = Metric(
            name=u"CPU - usage",
            description=u"average of all cores",
            resource_type=MetricResourceType.CPU,
            unit=u"%",
            min_value=0.0,
            max_value=100.0,
            gauges=[SystemCpuUsageGauge()],
        )
        self.assertEqual(expected_cpu, container.cpu_usage_metric)

        # and
        expected_ram = Metric(
            name=u"RAM",
            description=u"",
            resource_type=MetricResourceType.RAM,
            unit=u"GB",
            min_value=0.0,
            max_value=16.0,
            gauges=[SystemMemoryUsageGauge()],
        )
        self.assertEqual(expected_ram, container.memory_metric)

        # and
        expected_gpu_usage = Metric(
            name=u"GPU - usage",
            description=u"2 cards",
            resource_type=MetricResourceType.GPU,
            unit=u"%",
            min_value=0.0,
            max_value=100.0,
            gauges=[
                GpuUsageGauge(card_index=0),
                GpuUsageGauge(card_index=1),
            ],
        )
        self.assertEqual(expected_gpu_usage, container.gpu_usage_metric)

        # and
        expected_gpu_memory = Metric(
            name=u"GPU - memory",
            description=u"2 cards",
            resource_type=MetricResourceType.GPU_RAM,
            unit=u"GB",
            min_value=0.0,
            max_value=8.0,
            gauges=[
                GpuMemoryGauge(card_index=0),
                GpuMemoryGauge(card_index=1),
            ],
        )
        self.assertEqual(expected_gpu_memory, container.gpu_memory_metric)
    def setUp(self):
        """Build the gauges fixture, metrics container and metric reporter.

        The container is created for 4 CPU cores, 64 GB of memory and GPU
        cards 0 and 2 with 32 GB of GPU memory. The reporter is created with
        the current time as its reference timestamp.
        """
        self.maxDiff = 65536

        self.fixture = GaugesFixture()
        resource_info = SystemResourceInfo(
            cpu_core_count=4,
            memory_amount_bytes=64 * BYTES_IN_ONE_GB,
            gpu_card_indices=[0, 2],
            gpu_memory_amount_bytes=32 * BYTES_IN_ONE_GB,
        )
        factory = MetricsFactory(
            gauge_factory=self.fixture.gauge_factory,
            system_resource_info=resource_info,
        )
        self.metrics_container = factory.create_metrics_container()

        self.reference_timestamp = time.time()
        reporter_factory = MetricReporterFactory(
            reference_timestamp=self.reference_timestamp)
        self.metric_reporter = reporter_factory.create(
            self.metrics_container.metrics())
Esempio n. 7
0
class TestMetricReporterIntegration(unittest.TestCase):
    """Run a metric reporter against the gauges provided by ``GaugesFixture``."""

    def setUp(self):
        """Build the gauges fixture, metrics container and metric reporter.

        The container is created for 4 CPU cores, 64 GB of memory and GPU
        cards 0 and 2 with 32 GB of GPU memory. The reporter is created with
        the current time as its reference timestamp.
        """
        self.maxDiff = 65536

        self.fixture = GaugesFixture()
        resource_info = SystemResourceInfo(
            cpu_core_count=4,
            memory_amount_bytes=64 * BYTES_IN_ONE_GB,
            gpu_card_indices=[0, 2],
            gpu_memory_amount_bytes=32 * BYTES_IN_ONE_GB,
        )
        factory = MetricsFactory(
            gauge_factory=self.fixture.gauge_factory,
            system_resource_info=resource_info,
        )
        self.metrics_container = factory.create_metrics_container()

        self.reference_timestamp = time.time()
        reporter_factory = MetricReporterFactory(
            reference_timestamp=self.reference_timestamp)
        self.metric_reporter = reporter_factory.create(
            self.metrics_container.metrics())

    def test_report_metrics(self):
        """Check the reports produced 10 units after the reference timestamp.

        One report is expected per metric (CPU, memory, GPU usage, GPU
        memory), each holding one value per gauge with the measurement
        timestamp, the running time relative to the reference timestamp, the
        gauge name and the fixture's gauge value.
        """
        # given
        measurement_timestamp = self.reference_timestamp + 10

        # when
        actual_reports = self.metric_reporter.report(measurement_timestamp)

        # then
        expected_time = measurement_timestamp - self.reference_timestamp
        fixture = self.fixture
        container = self.metrics_container

        def value_of(gauge_name, value):
            # Every expected value shares the same timestamp and running time.
            return MetricValue(
                timestamp=measurement_timestamp,
                running_time=expected_time,
                gauge_name=gauge_name,
                value=value,
            )

        cpu_report = MetricReport(
            metric=container.cpu_usage_metric,
            values=[value_of(u"cpu", fixture.cpu_gauge_value)],
        )
        memory_report = MetricReport(
            metric=container.memory_metric,
            values=[value_of(u"ram", fixture.memory_gauge_value)],
        )
        gpu_usage_report = MetricReport(
            metric=container.gpu_usage_metric,
            values=[
                value_of(u"0", fixture.gpu0_usage_gauge_value),
                value_of(u"2", fixture.gpu1_usage_gauge_value),
            ],
        )
        gpu_memory_report = MetricReport(
            metric=container.gpu_memory_metric,
            values=[
                value_of(u"0", fixture.gpu0_memory_gauge_value),
                value_of(u"2", fixture.gpu1_memory_gauge_value),
            ],
        )

        expected_reports = [
            cpu_report,
            memory_report,
            gpu_usage_report,
            gpu_memory_report,
        ]
        self.assertListEqual(expected_reports, actual_reports)