Example 1
    def _update_and_create_report(
        self,
        batch: Dict,
        batch_idx: int,
        step_output: Dict,
        pl_module: LightningModule,
        combined_report: Report = None,
        update_meter: Meter = None,
    ):
        report = Report(batch, step_output)

        if update_meter:
            update_meter.update_from_report(report)

        should_accumulate = not (
            batch_idx % self.trainer_config.accumulate_grad_batches == 0)

        final_report = report
        if should_accumulate and combined_report is not None:
            combined_report.accumulate_tensor_fields_and_loss(
                report, pl_module.metrics.required_params)
            combined_report.batch_size += report.batch_size
            final_report = combined_report

        return final_report
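
The gradient-accumulation condition above can be illustrated in isolation. A minimal sketch, assuming a hypothetical accumulate_grad_batches value of 4:

    # Sketch of the condition used in _update_and_create_report above,
    # assuming a hypothetical accumulate_grad_batches value of 4.
    accumulate_grad_batches = 4
    for batch_idx in range(8):
        should_accumulate = not (batch_idx % accumulate_grad_batches == 0)
        # batch_idx 0 and 4 start a fresh report; 1-3 and 5-7 accumulate into
        # the running combined_report.
        print(batch_idx, should_accumulate)
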
Example 2
    def test_meter_update_from_report(self):
        meter = Meter()
        prepared_batch = SampleList(
            {"targets": torch.tensor([1, 2, 3, 4]), "dataset_type": "val"}
        )
        for idx in range(5):
            model_output = {
                "scores": torch.tensor([0, 1, 2, 3]),
                "losses": {"loss": float(idx)},
            }
            report = Report(prepared_batch, model_output)
            meter.update_from_report(report)

        self.assertEqual(meter.loss.global_avg, 2.0)
        self.assertEqual(meter.loss.avg, 2.0)
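
The assertions hold because the test feeds five per-batch losses of 0.0 through 4.0; a quick check of the arithmetic:

    # Losses fed to the meter are 0.0, 1.0, 2.0, 3.0 and 4.0, so both the
    # running and the global average reported by the meter equal 2.0.
    losses = [float(idx) for idx in range(5)]
    assert sum(losses) / len(losses) == 2.0
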
Example 3
    def evaluation_loop(
            self,
            dataset_type: str,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Report, Meter]:
        meter = Meter()
        reporter = self.dataset_loader.get_test_reporter(dataset_type)
        use_cpu = self.config.evaluation.get("use_cpu", False)
        loaded_batches = 0
        skipped_batches = 0

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_master()
            while reporter.next_dataset(flush_report=False):
                dataloader = reporter.get_dataloader()
                combined_report = None

                if self._can_use_tqdm(dataloader):
                    dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
                for batch in dataloader:
                    # Do not timeout quickly on first batch, as workers might start at
                    # very different times.
                    timeout = 600 if loaded_batches else 3600 * 24
                    with CompleteInTimeOrDie(timeout):
                        loaded_batches += 1
                        prepared_batch = reporter.prepare_batch(batch)
                        prepared_batch = to_device(prepared_batch, self.device)
                        if not validate_batch_sizes(
                                prepared_batch.get_batch_size()):
                            logger.info(
                                "Skip batch due to uneven batch sizes.")
                            skipped_batches += 1
                            continue
                        model_output = self.model(prepared_batch)
                        report = Report(prepared_batch, model_output)
                        report = report.detach()

                        meter.update_from_report(report)

                        moved_report = report
                        # Move to CPU for metrics calculation later if needed
                        # Explicitly use `non_blocking=False` as this can cause
                        # race conditions in next accumulate
                        if use_cpu:
                            moved_report = report.copy().to("cpu",
                                                            non_blocking=False)

                        # accumulate necessary params for metric calculation
                        if combined_report is None:
                            # make a copy of report since `reporter.add_to_report` will
                            # change some of the report keys later
                            combined_report = moved_report.copy()
                        else:
                            combined_report.accumulate_tensor_fields_and_loss(
                                moved_report, self.metrics.required_params)
                            combined_report.batch_size += moved_report.batch_size

                        # Each node generates a separate copy of predict JSON from the
                        # report, which will be used to evaluate dataset-level metrics
                        # (such as mAP in object detection or CIDEr in image captioning)
                        # Since `reporter.add_to_report` changes report keys
                        # (e.g. scores), do this after
                        # `combined_report.accumulate_tensor_fields_and_loss`
                        if "__prediction_report__" in self.metrics.required_params:
                            # Still need to use original report here on GPU/TPU since
                            # it will be gathered
                            reporter.add_to_report(
                                report,
                                self.model,
                                execute_on_master_only=False)

                        if single_batch is True:
                            break

                logger.info(f"Finished evaluation. Loaded {loaded_batches}")
                logger.info(f" -- skipped {skipped_batches} batches.")

                reporter.postprocess_dataset_report()
                assert (combined_report is not None
                        ), "Please check if your validation set is empty!"
                # the attached prediction_report is used for dataset-level metrics
                combined_report.prediction_report = reporter.report

                combined_report.metrics = self.metrics(combined_report,
                                                       combined_report)

                # Since update_meter will reduce the metrics over GPUs, we need to
                # move them back to GPU but we will only move metrics and losses
                # which are needed by update_meter to avoid OOM
                # Furthermore, use `non_blocking=False` to avoid any issues
                # in device to host or host to device transfer
                if use_cpu:
                    combined_report = combined_report.to(
                        self.device,
                        fields=["metrics", "losses"],
                        non_blocking=False)

                meter.update_from_report(combined_report,
                                         should_update_loss=False)

            # enable train mode again
            self.model.train()

        return combined_report, meter
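
The per-batch timeout choice passed to `CompleteInTimeOrDie` above can be read off in isolation; the helper below is only an illustrative extraction (not part of the original module), with values mirroring the snippet:

    def batch_timeout_seconds(loaded_batches: int) -> int:
        # First batch: 24 hours, since workers may start at very different
        # times; every later batch: 10 minutes.
        return 600 if loaded_batches else 3600 * 24

    assert batch_timeout_seconds(0) == 24 * 3600
    assert batch_timeout_seconds(1) == 600
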
Example 4
    def evaluation_loop(
            self,
            dataset_type: str,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Report, Meter]:
        meter = Meter()
        reporter = self.dataset_loader.get_test_reporter(dataset_type)

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_master()

            while reporter.next_dataset(flush_report=False):
                dataloader = reporter.get_dataloader()
                combined_report = None

                if self._can_use_tqdm(dataloader):
                    dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)

                for batch in dataloader:
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)
                    model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)

                    meter.update_from_report(report)

                    # accumulate necessary params for metric calculation
                    if combined_report is None:
                        # make a copy of report since `reporter.add_to_report` will
                        # change some of the report keys later
                        combined_report = Report(report)
                    else:
                        combined_report.accumulate_tensor_fields_and_loss(
                            report, self.metrics.required_params)
                        combined_report.batch_size += report.batch_size

                    # Each node generates a separate copy of predict JSON from the
                    # report, which will be used to evaluate dataset-level metrics
                    # (such as mAP in object detection or CIDEr in image captioning)
                    # Since `reporter.add_to_report` changes report keys (e.g. scores),
                    # do this after `combined_report.accumulate_tensor_fields_and_loss`
                    if "__prediction_report__" in self.metrics.required_params:
                        reporter.add_to_report(report,
                                               self.model,
                                               execute_on_master_only=False)

                    if single_batch is True:
                        break

                reporter.postprocess_dataset_report()
                assert (combined_report is not None
                        ), "Please check if your validation set is empty!"
                # the attached prediction_report is used for dataset-level metrics
                combined_report.prediction_report = reporter.report

                combined_report.metrics = self.metrics(combined_report,
                                                       combined_report)
                meter.update_from_report(combined_report,
                                         should_update_loss=False)

            # enable train mode again
            self.model.train()

        return combined_report, meter
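
A hedged stand-in for the `combined_report` accumulation pattern above: tensor fields required by the metrics are concatenated batch-wise and `batch_size` is summed (loss handling is omitted). This approximates, but is not, the real `Report.accumulate_tensor_fields_and_loss`:

    import torch

    def accumulate(combined: dict, new: dict, required: set) -> dict:
        # Concatenate every required tensor field along the batch dimension
        # and add up the batch sizes, mirroring the loop above.
        for key in required:
            combined[key] = torch.cat([combined[key], new[key]], dim=0)
        combined["batch_size"] += new["batch_size"]
        return combined

    first = {"scores": torch.zeros(4, 2), "batch_size": 4}
    second = {"scores": torch.ones(4, 2), "batch_size": 4}
    merged = accumulate(first, second, {"scores"})
    assert merged["scores"].shape == (8, 2) and merged["batch_size"] == 8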