    def __call__(self, sample_list, *args, **kwargs):
        # Move the input to the same device as the model before the forward pass
        model_device = next(self.parameters()).device
        sample_list = to_device(sample_list, model_device)

        model_output = super().__call__(sample_list, *args, **kwargs)

        # Don't do anything fancy to output if it is pretrained
        if self.is_pretrained:
            return model_output

        # Make sure that the output from the model is a Mapping
        assert isinstance(
            model_output, collections.abc.Mapping
        ), "A dict must be returned from the forward of the model."

        if "losses" in model_output:
            if not self._logged_warning["losses_present"]:
                warnings.warn("'losses' already present in model output. "
                              "No calculation will be done in base model.")
                self._logged_warning["losses_present"] = True

            assert isinstance(
                model_output["losses"],
                collections.abc.Mapping), "'losses' must be a dict."
        elif hasattr(self, "losses"):
            model_output["losses"] = self.losses(sample_list, model_output)
        else:
            model_output["losses"] = {}

        return model_output
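
Note: `to_device` is expected to accept both `SampleList`-style objects and plain containers, and to fall back to CPU when CUDA is requested but unavailable (see the test in Example #6 below; the identity check there relies on `SampleList.to` returning the same object when it is already on the right device). A minimal sketch, assuming the sample list exposes a `.to(device)` method and that containers without one pass through unchanged:

    import warnings

    import torch

    def to_device(batch, device="cuda"):
        # Accept both strings and torch.device objects.
        if isinstance(device, str):
            device = torch.device(device)

        # Containers that cannot move themselves are returned as-is.
        if not hasattr(batch, "to"):
            return batch

        # Fall back to CPU when CUDA is requested but unavailable.
        if device.type == "cuda" and not torch.cuda.is_available():
            warnings.warn("Selected device is cuda, but it is not "
                          "available. Falling back to cpu.")
            device = torch.device("cpu")

        return batch.to(device)
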
Example #2
    def prediction_loop(self, dataset_type: str) -> None:
        reporter = self.dataset_loader.get_test_reporter(dataset_type)
        skipped_batches = 0
        loaded_batches = 0
        with torch.no_grad():
            self.model.eval()
            logger.info(f"Starting {dataset_type} inference predictions")

            while reporter.next_dataset():
                dataloader = reporter.get_dataloader()
                if self._can_use_tqdm(dataloader):
                    dataloader = tqdm.tqdm(dataloader)
                for batch in dataloader:
                    with CompleteInTimeOrDie(600):
                        prepared_batch = reporter.prepare_batch(batch)
                        prepared_batch = to_device(prepared_batch, self.device)
                        loaded_batches += 1
                        if not validate_batch_sizes(prepared_batch.get_batch_size()):
                            logger.info("Skip batch due to unequal batch sizes.")
                            skipped_batches += 1
                            continue
                        with torch.cuda.amp.autocast(enabled=self.training_config.fp16):
                            model_output = self.model(prepared_batch)
                        report = Report(prepared_batch, model_output)
                        reporter.add_to_report(report, self.model)

                reporter.postprocess_dataset_report()

            logger.info(f"Finished predicting. Loaded {loaded_batches}")
            logger.info(f" -- skipped {skipped_batches} batches.")
            self.model.train()
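
`CompleteInTimeOrDie` acts as a per-batch watchdog so that a hung dataloader or a deadlocked collective cannot stall the job silently. A minimal sketch of such a context manager (an assumption about its behavior, not MMF's actual implementation):

    import os
    import threading

    class CompleteInTimeOrDie:
        """Kill the process if the wrapped block does not finish in time."""

        def __init__(self, timeout_seconds):
            self.timeout_seconds = timeout_seconds
            self.timer = None

        def _die(self):
            # os._exit skips cleanup handlers on purpose: a hung worker
            # may never respond to a normal, graceful shutdown.
            os._exit(1)

        def __enter__(self):
            self.timer = threading.Timer(self.timeout_seconds, self._die)
            self.timer.daemon = True
            self.timer.start()
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.timer.cancel()
            return False
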
Example #3
    def evaluation_loop(
            self,
            dataset_type: str,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Dict[str, Any], Meter]:
        meter = Meter()
        reporter = self.dataset_loader.get_test_reporter(dataset_type)

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_master()

            while reporter.next_dataset(flush_report=False):
                dataloader = reporter.get_dataloader()

                combined_report = None
                for batch in tqdm.tqdm(dataloader, disable=disable_tqdm):
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)
                    model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)

                    self.update_meter(report, meter)

                    # accumulate necessary params for metric calculation
                    if combined_report is None:
                        # make a copy of report since `reporter.add_to_report` will
                        # change some of the report keys later
                        combined_report = Report(report)
                    else:
                        combined_report.accumulate_tensor_fields_and_loss(
                            report, self.metrics.required_params)
                        combined_report.batch_size += report.batch_size

                    # Each node generates a separate copy of predict JSON from the report,
                    # which will be used to evaluate dataset-level metrics
                    # (such as mAP in object detection or CIDEr in image captioning)
                    # Since `reporter.add_to_report` changes report keys (e.g. scores),
                    # do this after `combined_report.accumulate_tensor_fields_and_loss`
                    if "__prediction_report__" in self.metrics.required_params:
                        reporter.add_to_report(report,
                                               self.model,
                                               execute_on_master_only=False)

                    if single_batch:
                        break

                reporter.postprocess_dataset_report()
                # prediction_report is used for dataset-level metrics
                combined_report.prediction_report = reporter.report

                combined_report.metrics = self.metrics(combined_report,
                                                       combined_report)
                self.update_meter(combined_report, meter, eval_mode=True)

            # enable train mode again
            self.model.train()

        return combined_report, meter
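
The accumulation step above concatenates per-batch tensors so that metrics can be computed over the whole split at once. A minimal sketch of the tensor-field part of `accumulate_tensor_fields_and_loss` (hypothetical, and named differently here since the real method also merges the losses):

    import torch

    def accumulate_tensor_fields(combined, report, fields):
        # Concatenate each required field along the batch dimension.
        for field in fields:
            if field in report and torch.is_tensor(report[field]):
                combined[field] = torch.cat(
                    (combined[field], report[field]), dim=0)
        return combined
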
Example #4
    def _forward(self, batch: Tensor) -> Dict[str, Any]:
        prepared_batch = self.dataset_loader.prepare_batch(batch)
        # Move the sample list to the device if it is not there already.
        prepared_batch = to_device(prepared_batch, torch.device("cuda"))
        self.profile("Batch prepare time")
        # Arguments should be a dict at this point
        model_output = self.model(prepared_batch)
        report = Report(prepared_batch, model_output)
        self.profile("Forward time")

        return report
Example #5
    def _forward(self, batch: Dict[str, Tensor]) -> Dict[str, Any]:
        # Move the sample list to the device if it is not there already.
        prepared_batch = to_device(batch, self.device)
        self.profile("Batch prepare time")
        # Arguments should be a dict at this point

        with torch.cuda.amp.autocast(enabled=self.training_config.fp16):
            model_output = self.model(prepared_batch)
            report = Report(prepared_batch, model_output)

        self.profile("Forward time")
        return report
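
Example #4 hard-codes `torch.device("cuda")` and runs in full precision; Example #5 uses the trainer's configured device and wraps the forward in autocast so fp16 can be toggled from the training config. A minimal standalone autocast sketch (requires a CUDA device; the model and shapes are placeholders):

    import torch

    model = torch.nn.Linear(16, 4).cuda()
    batch = torch.randn(8, 16, device="cuda")

    # Inside the context, eligible ops run in float16; the output may be
    # float16, so cast back before accumulating into float32 buffers.
    with torch.cuda.amp.autocast(enabled=True):
        output = model(batch)
    output = output.float()
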
Example #6
    def test_to_device(self):
        sample_list = test_utils.build_random_sample_list()

        modified = to_device(sample_list, "cpu")
        self.assertEqual(modified.get_device(), torch.device("cpu"))

        modified = to_device(sample_list, torch.device("cpu"))
        self.assertEqual(modified.get_device(), torch.device("cpu"))

        modified = to_device(sample_list, "cuda")

        if torch.cuda.is_available():
            self.assertEqual(modified.get_device(), torch.device("cuda:0"))
        else:
            self.assertEqual(modified.get_device(), torch.device("cpu"))

        double_modified = to_device(modified, modified.get_device())
        self.assertTrue(double_modified is modified)

        custom_batch = [{"a": 1}]
        self.assertEqual(to_device(custom_batch), custom_batch)
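
This test pins down the contract that matters for the loops above: string and `torch.device` arguments behave identically, requesting "cuda" on a CPU-only machine degrades gracefully to CPU, moving a sample list to the device it already occupies returns the same object rather than a copy, and plain containers with no tensors pass through untouched.
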
Example #7
    def prediction_loop(self, dataset_type: str) -> None:
        reporter = self.dataset_loader.get_test_reporter(dataset_type)
        with torch.no_grad():
            self.model.eval()
            logger.info(f"Starting {dataset_type} inference predictions")

            while reporter.next_dataset():
                dataloader = reporter.get_dataloader()

                for batch in tqdm.tqdm(dataloader):
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch,
                                               torch.device("cuda"))
                    model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)
                    reporter.add_to_report(report, self.model)

            logger.info("Finished predicting")
            self.model.train()
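
This variant hard-codes `torch.device("cuda")`, always wraps the dataloader in tqdm, and never calls `reporter.postprocess_dataset_report()`. The next example guards each of these choices and wraps the forward pass in autocast so fp16 inference can be toggled from config.
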
Example #8
    def prediction_loop(self, dataset_type: str) -> None:
        reporter = self.dataset_loader.get_test_reporter(dataset_type)
        with torch.no_grad():
            self.model.eval()
            logger.info(f"Starting {dataset_type} inference predictions")

            while reporter.next_dataset():
                dataloader = reporter.get_dataloader()
                if self._can_use_tqdm(dataloader):
                    dataloader = tqdm.tqdm(dataloader)

                for batch in dataloader:
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)
                    with torch.cuda.amp.autocast(
                            enabled=self.training_config.fp16):
                        model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)
                    reporter.add_to_report(report, self.model)

                reporter.postprocess_dataset_report()

            logger.info("Finished predicting")
            self.model.train()
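
`_can_use_tqdm` guards against dataloaders that cannot report a length (for example, ones backed by an `IterableDataset`), since tqdm needs a length to render a meaningful progress bar. A minimal sketch (an assumption about the helper, not its actual implementation):

    def _can_use_tqdm(self, dataloader):
        # len() raises TypeError when the underlying dataset does not
        # define __len__, which is common for iterable-style datasets.
        try:
            len(dataloader)
            return True
        except TypeError:
            return False
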
Example #9
    def evaluation_loop(
            self,
            dataset_type: str,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Dict[str, Any], Meter]:
        meter = Meter()
        reporter = self.dataset_loader.get_test_reporter(dataset_type)
        use_cpu = self.config.evaluation.get("use_cpu", False)
        loaded_batches = 0
        skipped_batches = 0

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_master()
            while reporter.next_dataset(flush_report=False):
                dataloader = reporter.get_dataloader()
                combined_report = None

                if self._can_use_tqdm(dataloader):
                    dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
                for batch in dataloader:
                    # Do not time out quickly on the first batch, as workers
                    # might start at very different times.
                    timeout = 600 if loaded_batches else 3600 * 24
                    with CompleteInTimeOrDie(timeout):
                        loaded_batches += 1
                        prepared_batch = reporter.prepare_batch(batch)
                        prepared_batch = to_device(prepared_batch, self.device)
                        if not validate_batch_sizes(
                                prepared_batch.get_batch_size()):
                            logger.info("Skipping batch due to unequal "
                                        "batch sizes across workers.")
                            skipped_batches += 1
                            continue
                        model_output = self.model(prepared_batch)
                        report = Report(prepared_batch, model_output)
                        report = report.detach()

                        meter.update_from_report(report)

                        moved_report = report
                        # Move to CPU for metrics calculation later if needed.
                        # Explicitly use `non_blocking=False`, since a
                        # non-blocking copy can race with the next accumulate.
                        if use_cpu:
                            moved_report = report.copy().to("cpu",
                                                            non_blocking=False)

                        # accumulate necessary params for metric calculation
                        if combined_report is None:
                            # make a copy of report since `reporter.add_to_report` will
                            # change some of the report keys later
                            combined_report = moved_report.copy()
                        else:
                            combined_report.accumulate_tensor_fields_and_loss(
                                moved_report, self.metrics.required_params)
                            combined_report.batch_size += moved_report.batch_size

                        # Each node generates a separate copy of predict JSON from the
                        # report, which will be used to evaluate dataset-level metrics
                        # (such as mAP in object detection or CIDEr in image captioning)
                        # Since `reporter.add_to_report` changes report keys
                        # (e.g. scores), do this after
                        # `combined_report.accumulate_tensor_fields_and_loss`
                        if "__prediction_report__" in self.metrics.required_params:
                            # Still need to use original report here on GPU/TPU since
                            # it will be gathered
                            reporter.add_to_report(
                                report,
                                self.model,
                                execute_on_master_only=False)

                        if single_batch:
                            break

                logger.info(f"Finished training. Loaded {loaded_batches}")
                logger.info(f" -- skipped {skipped_batches} batches.")

                reporter.postprocess_dataset_report()
                assert combined_report is not None, (
                    "Please check if your validation set is empty!")
                # prediction_report is used for dataset-level metrics
                combined_report.prediction_report = reporter.report

                combined_report.metrics = self.metrics(combined_report,
                                                       combined_report)

                # Since update_meter will reduce the metrics over GPUs, move
                # them back to the GPU, but only the metrics and losses that
                # update_meter needs, to avoid OOM. Use a blocking copy
                # (`non_blocking=False`) to avoid issues in device-to-host
                # or host-to-device transfers.
                if use_cpu:
                    combined_report = combined_report.to(
                        self.device,
                        fields=["metrics", "losses"],
                        non_blocking=False)

                meter.update_from_report(combined_report,
                                         should_update_loss=False)

            # enable train mode again
            self.model.train()

        return combined_report, meter
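
`validate_batch_sizes` exists because, in distributed evaluation, a rank that receives a smaller final batch can deadlock collective ops or skew gathered metrics. A minimal sketch, assuming a NCCL-style process group where tensors must live on the GPU (not MMF's actual implementation):

    import torch
    import torch.distributed as dist

    def validate_batch_sizes(my_batch_size):
        # Single-process runs have nothing to compare against.
        if not (dist.is_available() and dist.is_initialized()):
            return True

        device = torch.device("cuda", torch.cuda.current_device())
        mine = torch.tensor([my_batch_size], device=device)
        sizes = [torch.zeros_like(mine)
                 for _ in range(dist.get_world_size())]

        # Gather every rank's batch size and require them to match.
        dist.all_gather(sizes, mine)
        return all(size.item() == my_batch_size for size in sizes)
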