    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            self._predictions = comm.gather(self._predictions, dst=0)
            self._predictions = list(itertools.chain(*self._predictions))

            if not comm.is_main_process():
                return {}

        if len(self._predictions) == 0:
            self._logger.warning(
                "[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(self._predictions, f)

        self._results = OrderedDict()
        if "instances" in self._predictions[0]:
            self._eval_predictions()
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
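
A note on the gather pattern above: detectron2's comm.gather returns the list of per-rank payloads only on the destination rank and an empty list on every other rank, which is why only the main process continues to evaluate. A minimal sketch of that contract (variable names illustrative):

import itertools
from detectron2.utils import comm

local_predictions = []  # filled per worker during inference (illustrative)
gathered = comm.gather(local_predictions, dst=0)  # list of per-rank lists on rank 0, [] elsewhere
if comm.is_main_process():
    all_predictions = list(itertools.chain(*gathered))
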
Example 2
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions
        # predictions: list of dict [{'image_id', 'instances' (list of dict [{'image_id', 'category_id', 'bbox', 'score'}])}]

        if len(predictions) == 0:
            self._logger.warning(
                "[SMDEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        if "proposals" in predictions[0]:
            self._eval_box_proposals(predictions)
        if "instances" in predictions[0]:
            self._eval_predictions(set(self._tasks), predictions)
            self._eval_predictions_others(self._coco_api, predictions)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
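
The comment in the snippet above describes the structure of each gathered entry; a hypothetical example (field names taken from that comment, values made up) looks like:

example_prediction = {
    "image_id": 42,
    "instances": [
        {"image_id": 42, "category_id": 1, "bbox": [10.0, 20.0, 50.0, 80.0], "score": 0.91},
    ],
}
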
Example 3
    def after_step(self):
        """Run after every iteration, see parent for details"""
        self.num_steps += 1
        if self.num_steps % self._period == 0:
            data = next(self._loader)

            if torch.cuda.is_available():
                torch.cuda.synchronize()

            with torch.no_grad():
                loss_dict = self.trainer.model(data)

                losses = sum(loss_dict.values())
                assert torch.isfinite(losses).all(), loss_dict

                loss_dict_reduced = {
                    "val_" + k: v.item()
                    for k, v in comm.reduce_dict(loss_dict).items()
                }
                losses_reduced = sum(loss
                                     for loss in loss_dict_reduced.values())
                if comm.is_main_process():
                    self.trainer.storage.put_scalars(
                        total_val_loss=losses_reduced, **loss_dict_reduced)
                comm.synchronize()
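
A hedged usage sketch for a hook like the one above: detectron2 trainers expose register_hooks, so a validation-loss hook can be attached before training starts. ValLossHook, its constructor arguments, and build_valid_loader are hypothetical names standing in for the class that owns this after_step; cfg is assumed to be a detectron2 CfgNode.

from detectron2.engine import DefaultTrainer

trainer = DefaultTrainer(cfg)
trainer.register_hooks([ValLossHook(period=100, loader=build_valid_loader(cfg))])
trainer.train()
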
Example 4
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            self._predictions = comm.gather(self._predictions, dst=0)
            self._predictions = list(itertools.chain(*self._predictions))

            if not comm.is_main_process():
                return {}

        if len(self._predictions) == 0:
            self._logger.warning(
                "[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(self._predictions, f)

        self._results = OrderedDict()
        if "proposals" in self._predictions[0]:
            self._eval_box_proposals()
        if "instances" in self._predictions[0]:
            miou = {'miou': self._eval_predictions(set(self._tasks))}
            return miou
Example 5
def run(args):
    from template_lib.d2.utils import set_ddp_seed
    set_ddp_seed(outdir=f"{global_cfg.tl_outdir}/d2")

    total_batch_size = global_cfg.build_dataloader.batch_size
    num_workers = comm.get_world_size()
    batch_size = total_batch_size // num_workers

    data_loader = build_dataloader(global_cfg.build_dataloader,
                                   kwargs_priority=True,
                                   batch_size=batch_size,
                                   distributed=args.distributed)

    FID_IS_torch = build_GAN_metric(global_cfg.GAN_metric)
    if global_cfg.tl_debug:
        num_images = 50
    else:
        num_images = float('inf')
    FID_IS_torch.calculate_fid_stat_of_dataloader(
        data_loader=data_loader,
        num_images=num_images,
        save_fid_stat=global_cfg.save_fid_stat)

    comm.synchronize()

    pass
Example 6
def setup_after_launch(cfg, output_dir, runner):
    """
    Set things up after entering DDP, including
        - creating working directory
        - setting up logger
        - logging environment
        - initializing runner
    """
    create_dir_on_global_main_process(output_dir)
    comm.synchronize()
    setup_loggers(output_dir)
    cfg.freeze()
    if cfg.OUTPUT_DIR != output_dir:
        with temp_defrost(cfg):
            logger.warning(
                "Override cfg.OUTPUT_DIR ({}) to be the same as output_dir {}".
                format(cfg.OUTPUT_DIR, output_dir))
            cfg.OUTPUT_DIR = output_dir
    logger.info("Initializing runner ...")
    runner = initialize_runner(runner, cfg)

    log_info(cfg, runner)
    dump_cfg(cfg, os.path.join(output_dir, "config.yaml"))

    auto_scale_world_size(cfg, new_world_size=comm.get_world_size())
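
A minimal sketch of the create_dir_on_global_main_process + comm.synchronize pattern used above, assuming comm behaves like detectron2.utils.comm (the real helper may differ):

import os
from detectron2.utils import comm

def create_dir_on_main_process_sketch(path):
    # Only the global main process touches the filesystem ...
    if comm.get_rank() == 0:
        os.makedirs(path, exist_ok=True)
    # ... and every worker waits here until the directory exists.
    comm.synchronize()
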
Example 7
    def _do_eval(self):
        results = self._func()

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(
                results)

            print('Before flatten: ', results)
            flattened_results = flatten_results_dict(results)
            print('After flatten: ', flattened_results)

            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v))
            self.trainer.storage.put_scalars(**flattened_results,
                                             smoothing_hint=False)

        # Evaluation may take different time among workers.
        # A barrier makes them start the next iteration together.
        comm.synchronize()
Example 8
    def evaluate(self):
        """
        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

        * Mean intersection-over-union averaged across classes (mIoU)
        * Frequency Weighted IoU (fwIoU)
        * Mean pixel accuracy averaged across classes (mACC)
        * Pixel Accuracy (pACC)
        """
        if self._distributed:
            synchronize()
            conf_matrix_list = all_gather(self._conf_matrix)
            self._predictions = all_gather(self._predictions)
            self._predictions = list(itertools.chain(*self._predictions))
            if not is_main_process():
                return

            self._conf_matrix = np.zeros_like(self._conf_matrix)
            for conf_matrix in conf_matrix_list:
                self._conf_matrix += conf_matrix

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(self._predictions))

        acc = np.full(self._num_classes, np.nan, dtype=float)
        iou = np.full(self._num_classes, np.nan, dtype=float)
        tp = self._conf_matrix.diagonal()[:-1].astype(float)
        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
        class_weights = pos_gt / np.sum(pos_gt)
        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
        acc_valid = pos_gt > 0
        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
        iou_valid = (pos_gt + pos_pred) > 0
        union = pos_gt + pos_pred - tp
        iou[acc_valid] = tp[acc_valid] / union[acc_valid]
        macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
        miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
        fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
        pacc = np.sum(tp) / np.sum(pos_gt)

        res = {}
        res["mIoU"] = 100 * miou
        res["fwIoU"] = 100 * fiou
        for i, name in enumerate(self._class_names):
            res["IoU-{}".format(name)] = 100 * iou[i]
        res["mACC"] = 100 * macc
        res["pACC"] = 100 * pacc
        for i, name in enumerate(self._class_names):
            res["ACC-{}".format(name)] = 100 * acc[i]

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(res, f)
        results = OrderedDict({"sem_seg": res})
        self._logger.info(results)
        return results
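
A tiny worked example of the confusion-matrix arithmetic behind the metrics listed in the docstring above (illustrative numbers, without the extra ignore row/column):

import numpy as np

conf = np.array([[8., 2.],   # axis conventions as above: axis 0 sums give ground-truth counts
                 [1., 9.]])
tp = conf.diagonal()                 # true positives per class: [8, 9]
pos_gt = conf.sum(axis=0)            # ground-truth pixels per class: [9, 11]
pos_pred = conf.sum(axis=1)          # predicted pixels per class: [10, 10]
iou = tp / (pos_gt + pos_pred - tp)  # per-class IoU: [8/11, 9/12]
miou = iou.mean()                    # mean IoU, about 0.739
pacc = tp.sum() / pos_gt.sum()       # pixel accuracy: 17/20 = 0.85
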
Example 9
    def evaluate(self, img_ids=None):
        """
        Args:
            img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset
        """
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        if len(predictions) == 0:
            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        if "proposals" in predictions[0]:
            self._eval_box_proposals(predictions)
        if "instances" in predictions[0]:
            self._eval_predictions(predictions, img_ids=img_ids)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
Example 10
    def _do_eval(self):
        results = self._func()
        logger = logging.getLogger(__name__)

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(
                results)

            flattened_results = flatten_results_dict(results)
            valid = dict()
            for k, v in flattened_results.items():
                try:
                    valid[k] = float(v)
                # currently only supports skipping (list, Tensor, numpy.ndarray)
                # TODO: Maybe other types of Exceptions need to be taken into consideration
                except (ValueError, TypeError):
                    logger.info("Skip put {}: {} to tensorboard".format(
                        k, type(v)))

            self.trainer.storage.put_scalars(**valid, smoothing_hint=False)

        # Evaluation may take different time among workers.
        # A barrier makes them start the next iteration together.
        comm.synchronize()
Example 11
    def _do_eval(self):
        results = self._func()

        if results:
            assert isinstance(
                results, dict
            ), "Eval function must return a dict. Got {} instead.".format(
                results)

            flattened_results = flatten_results_dict(results)
            for k, v in flattened_results.items():
                try:
                    v = float(v)
                except Exception as e:
                    raise ValueError(
                        "[EvalHook] eval_function should return a nested dict of float. "
                        "Got '{}: {}' instead.".format(k, v)) from e
            self.trainer.storage.put_scalars(**flattened_results,
                                             smoothing_hint=False)

        if comm.is_main_process() and results:
            # save evaluation results in json
            is_final = self.trainer.iter + 1 >= self.trainer.max_iter
            os.makedirs(os.path.join(self.cfg.OUTPUT_DIR, 'inference'),
                        exist_ok=True)
            output_file = 'res_final.json' if is_final else \
                'iter_{:07d}.json'.format(self.trainer.iter)
            with PathManager.open(
                    os.path.join(self.cfg.OUTPUT_DIR, 'inference',
                                 output_file), 'w') as fp:
                json.dump(results, fp)

        # Evaluation may take different time among workers.
        # A barrier makes them start the next iteration together.
        comm.synchronize()
Example 12
    def run_step(self):
        """
        Implement the standard training logic described above.
        """
        assert self.model.training, "[SimpleTrainer] model was changed to eval mode!"
        start = time.perf_counter()
        """
        If you want to do something with the data, you can wrap the dataloader.
        """
        classifier_data = next(self._data_loader_iter)
        base_data = None
        meta_data = None
        data_time = time.perf_counter() - start
        loss_dict = self.model(base_data,
                               weak_batched_inputs=classifier_data,
                               meta_data=meta_data)
        losses = sum(loss_dict.values())
        self._detect_anomaly(losses, loss_dict)
        self.optimizer.zero_grad()
        losses.backward()
        self.optimizer.step()
        comm.synchronize()
        metrics_dict = loss_dict
        metrics_dict["data_time"] = data_time
        self._write_metrics(metrics_dict)
Example 13
    def evaluate(self):
        # Runs once all predictions have been collected
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        if len(predictions) == 0:
            self._logger.warning(
                "[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir,
                                     "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        if "proposals" in predictions[0]:
            self._eval_box_proposals(predictions)
        if "instances" in predictions[0]:
            self._eval_predictions(set(self._tasks), predictions)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)
def raw_to_detectron(data_path: Path, remove_cache: bool, cfg: CfgNode):
    data_splits = ['val']
    data_splits += ['train'] if not cfg.DEBUG else []
    for name in data_splits:
        coco_path = Path('.') / 'tmp' / ('coco_' + name + '.json')

        if (remove_cache or not coco_path.exists()) and comm.is_main_process():
            input_files = [a for a in (data_path / name / 'inputs').iterdir()]
            mask_ext = next(
                (data_path / name / 'masks').iterdir()).name.split('.')[1]
            mask_files = [
                a.parent.parent / 'masks' /
                (a.name.split('.')[0] + '.' + mask_ext) for a in input_files
            ]
            shutil.rmtree(coco_path, ignore_errors=True)
            coco_path.parent.mkdir(parents=True, exist_ok=True)
            frame_objects = array_apply(process_frame,
                                        zip(input_files, mask_files,
                                            repeat(cfg.MIN_AREA)),
                                        parallel=not DEBUG,
                                        total=len(input_files),
                                        chunksize=1000)
            write_serialized(frame_objects, coco_path)

        DatasetCatalog.register(name, lambda d=coco_path: get_data_dicts(d))
        MetadataCatalog.get(name).set(thing_classes=['object'])
    comm.synchronize()
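
Why the lambda d=coco_path default argument above matters: without it, every registered dataset would close over the last loop value of coco_path. A generic Python illustration of the pitfall:

names = ["train", "val"]
funcs_late = [lambda: n for n in names]       # every closure sees the final value of n
funcs_bound = [lambda n=n: n for n in names]  # default argument pins the current value
print([f() for f in funcs_late])    # ['val', 'val']
print([f() for f in funcs_bound])   # ['train', 'val']
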
Example 15
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        image_contains_mixed_unknowns = [
            prediction['image_contains_mixed_unknowns']
            for prediction in predictions
        ]
        scores = [prediction['scores'] for prediction in predictions]
        correct = [prediction['correct'] for prediction in predictions]
        pred_classes = [
            prediction['pred_classes'] for prediction in predictions
        ]

        category_counts = {}
        for category in self._coco_api.cats:
            internal_category = self.internal_dataset_mapping[category]
            if internal_category not in category_counts:
                category_counts[internal_category] = 0
            category_counts[internal_category] += len(
                self._coco_api.getAnnIds(catIds=[category]))

        return dict(predictions=dict(
            image_contains_mixed_unknowns=image_contains_mixed_unknowns,
            scores=scores,
            correct=correct,
            pred_classes=pred_classes),
                    category_counts=category_counts)
    def evaluate(self):
        """
        Returns:
            In detectron2.tools.train_net.py, following format expected:
            dict:
                * key: the name of the task (e.g., bbox)
                * value: a dict of {metric name: score}, e.g.: {"AP50": 80}
        """
        if self._distributed:
            comm.synchronize()
            prediction_counts = comm.gather(self.prediction_counts, dst=0)
            prediction_counts = list(itertools.chain(*prediction_counts))
            confidence_scores = comm.gather(self.confidence_scores, dst=0)
            confidence_scores = list(itertools.chain(*confidence_scores))

            if not comm.is_main_process():
                return {}
        else:
            prediction_counts = self.prediction_counts
            confidence_scores = self.confidence_scores

        mpi = np.mean(prediction_counts)
        mcp = np.mean(confidence_scores)
        output_metrics = OrderedDict({
            "false_positives": {
                "predictions_per_image": mpi,
                "confidence_per_prediction": mcp,
            }
        })
        logger.info(f"mean predictions per image: {mpi}")
        logger.info(f"mean confidence per prediction: {mcp}")
        return output_metrics
    def evaluate(self):
        if self._distributed:
            synchronize()
            endpoint_errors = all_gather(self._endpoint_errors)
            endpoint_errors = [per_image for per_gpu in endpoint_errors for per_image in per_gpu]
            self._predictions = all_gather(self._predictions)
            if not is_main_process():
                return

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "flow_predictions.json")
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(self._predictions))

        ave_epe = sum(endpoint_errors) / len(endpoint_errors)
        res = {"ave_epe": ave_epe}

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "flow_evaluation.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(res, f)

        results = OrderedDict({"flow": res})
        small_table = create_small_table(res)
        self._logger.info("Evaluation results for flow: \n" + small_table)
        dump_info_one_task = {
            "task": "flow",
            "tables": [small_table],
        }
        _dump_to_markdown([dump_info_one_task])
        return results
    def get_avg_losses(self, ):

        if self._distributed:
            synchronize()
            self._losses = all_gather(self._losses)

            if not is_main_process():
                return

            all_losses = {}
            for p in self._losses:
                all_losses.update(p)
        else:
            all_losses = self._losses

        image_unique_ids = list(all_losses.keys())

        loss_keys = list(all_losses[image_unique_ids[0]].keys())

        losses_global_avg = {}
        for key in loss_keys:
            losses_global_avg[key] = []

        for img_spec_id in image_unique_ids:
            loss_sig = all_losses[img_spec_id]

            for key in loss_keys:
                losses_global_avg[key].append(loss_sig[key])

        for key in loss_keys:
            losses_global_avg[key] = np.array(losses_global_avg[key]).mean()

        global_loss = OrderedDict(losses_global_avg)
        return global_loss
Example 19
    def after_step(self):
        next_iter = self.trainer.iter + 1
        is_final = next_iter == self.trainer.max_iter
        if is_final or (self._period > 0 and next_iter % self._period == 0):
            results = self._func()

            if results:
                assert isinstance(
                    results, dict
                ), "Eval function must return a dict. Got {} instead.".format(
                    results)

                flattened_results = flatten_results_dict(results)
                for k, v in flattened_results.items():
                    try:
                        v = float(v)
                    except Exception:
                        raise ValueError(
                            "[EvalHook] eval_function should return a nested dict of float. "
                            "Got '{}: {}' instead.".format(k, v))
                self.trainer.storage.put_scalars(**flattened_results,
                                                 smoothing_hint=False)

            # Evaluation may take different time among workers.
            # A barrier makes them start the next iteration together.
            comm.synchronize()
    def train_func(self, data, iteration, pbar):
        images, labels = self.preprocess_image(data)
        images = images.tensor

        bs = len(images)

        batched_arcs = get_ddp_attr(self.controller, 'get_sampled_arc')(bs=bs)

        self.gan_model(images=images,
                       labels=labels,
                       z=self.z_train,
                       iteration=iteration,
                       batched_arcs=batched_arcs)

        if iteration % self.train_controller_every_iter == 0:
            get_ddp_attr(self.controller, 'train_controller')(
                G=self.G,
                z=self.z_train,
                y=self.y_train,
                controller=self.controller,
                controller_optim=self.controller_optim,
                iteration=iteration,
                pbar=pbar)

        # Just for monitoring the training processing
        sampled_arc = get_ddp_attr(self.controller, 'get_sampled_arc')()
        sampled_arc = self.get_tensor_of_main_processing(sampled_arc)

        classes_arcs = sampled_arc[[
            0,
        ], ].repeat(self.n_classes, 1)
        self.evaluate_model(classes_arcs=classes_arcs, iteration=iteration)
        comm.synchronize()
Example 21
    def benchmark_distributed(self, num_iter, warmup=10):
        """
        Benchmark the dataloader in each distributed worker, and log results of
        all workers. This helps understand the final performance as well as
        the variances among workers.

        It also prints startup time (first iter) of the dataloader.
        """
        gpu = comm.get_world_size()
        dataset = MapDataset(self.dataset, self.mapper)
        n = self.num_workers
        loader = build_batch_data_loader(dataset,
                                         self.sampler,
                                         self.total_batch_size,
                                         num_workers=n)

        timer = Timer()
        loader = iter(loader)
        next(loader)
        startup_time = timer.seconds()
        logger.info(
            "Dataloader startup time: {:.2f} seconds".format(startup_time))

        comm.synchronize()

        avg, all_times = self._benchmark(loader, num_iter * max(n, 1),
                                         warmup * max(n, 1))
        del loader
        self._log_time(
            f"DataLoader ({gpu} GPUs x {n} workers, total bs={self.total_batch_size})",
            avg,
            all_times,
            True,
        )
    def train_func(self, data, iteration, pbar):

        classes_arcs = self.arcs.repeat(self.n_classes, 1)
        self.evaluate_model(classes_arcs=classes_arcs, iteration=iteration)

        comm.synchronize()
        exit(-1)
Example 23
def _distributed_worker(local_rank, main_func, world_size,
                        num_gpus_per_machine, machine_rank, dist_url, args):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(backend="NCCL",
                                init_method=dist_url,
                                world_size=world_size,
                                rank=global_rank)
    except Exception as e:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise e
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(
            range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
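
A hedged sketch of how a worker like _distributed_worker above is typically spawned, simplified along the lines of detectron2.engine.launch (names other than _distributed_worker are illustrative):

import torch.multiprocessing as mp

def launch_sketch(main_func, num_gpus_per_machine, num_machines=1,
                  machine_rank=0, dist_url="tcp://127.0.0.1:29500", args=()):
    world_size = num_machines * num_gpus_per_machine
    # mp.spawn passes the local rank as the first positional argument of the target.
    mp.spawn(
        _distributed_worker,
        nprocs=num_gpus_per_machine,
        args=(main_func, world_size, num_gpus_per_machine,
              machine_rank, dist_url, args),
        daemon=False,
    )
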
Example 24
    def _do_loss_eval(self):
        # Copying inference_on_dataset from evaluator.py
        total = len(self._data_loader)
        num_warmup = min(5, total - 1)
            
        start_time = time.perf_counter()
        total_compute_time = 0
        losses = []
        for idx, inputs in enumerate(self._data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
            # Time the forward pass that produces the validation loss.
            start_compute_time = time.perf_counter()
            loss_batch = self._get_loss(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            losses.append(loss_batch)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                )
        mean_loss = np.mean(losses)
        self.trainer.storage.put_scalar('validation_loss', mean_loss)
        comm.synchronize()

        return losses
Example 25
    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            self._predictions = comm.gather(self._predictions, dst=0)
            self._predictions = list(itertools.chain(*self._predictions))

            self.submit_results = comm.gather(self.submit_results, dst=0)
            self.submit_results = list(itertools.chain(*self.submit_results))

            if not comm.is_main_process():
                return {}

        if len(self._predictions) == 0:
            self._logger.warning(
                "[COCOEvaluator] Did not receive valid predictions.")
            return {}

        self._logger.info("Preparing results for COCO format ...")
        self._coco_results = list(
            itertools.chain(*[x["instances"] for x in self._predictions]))
        if self._output_dir:
            res_file = os.path.join(self._output_dir,
                                    "crowdhuman_evaluate_results.json")
            self._logger.info("Saving results to {}".format(res_file))
            with PathManager.open(res_file, "w") as f:
                f.write(json.dumps(self._coco_results))
                f.flush()

            self._logger.info("Saving results to {}".format(res_file))
            submit_file = os.path.join(self._output_dir, "submission.txt")
            with PathManager.open(submit_file, "w") as f:
                for result in self.submit_results:
                    f.write(json.dumps(result))
                    f.write("\n")
                f.flush()

        self._logger.info("Evaluating predictions ...")

        metrics = ["ALL"]
        results = {}
        ret_results = OrderedDict()
        for gt_json in [self._metadata.gt_file]:
            name = gt_json.split("/")[-1].split(".")[0]
            for id_setup in range(len(metrics)):
                cocoGt = COCO(gt_json)
                cocoDt = cocoGt.loadRes(res_file)
                imgIds = sorted(cocoGt.getImgIds())
                cocoEval = CrowdHumanEval(cocoGt, cocoDt, "bbox")
                cocoEval.params.imgIds = imgIds
                cocoEval.evaluate(id_setup)
                cocoEval.accumulate()
                performance_dict = cocoEval.summarize(id_setup)
                for key in performance_dict.keys():
                    results[name + " " + key] = performance_dict[key]
        self._logger.info(
            "Evaluation results for Pedestrian Detection on CrowdHuman: \n" +
            create_small_table(results))
        ret_results["PedestrianDetection"] = copy.deepcopy(results)
        return ret_results
Example 26
    def evaluate_loss(self, cfg, model):
        """Compute and log the validation loss to Comet

        Args:
            cfg (CfgNode): Detectron Config Object
            model (torch.nn.Module): Detectron Model

        Returns:
            dict: Empty Dict to satisfy Detectron Eval Hook API requirements
        """
        eval_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0],
                                                  DatasetMapper(cfg, True))

        # Copying inference_on_dataset from evaluator.py
        total = len(eval_loader)
        num_warmup = min(5, total - 1)

        start_time = time.perf_counter()
        total_compute_time = 0
        losses = []

        if comm.is_main_process():
            storage = get_event_storage()

            for idx, inputs in enumerate(eval_loader):
                if idx == num_warmup:
                    start_time = time.perf_counter()
                    total_compute_time = 0
                # Time the forward pass that produces the validation loss.
                start_compute_time = time.perf_counter()
                loss_batch = self._get_loss(model, inputs)
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                total_compute_time += time.perf_counter() - start_compute_time
                losses.append(loss_batch)

                iters_after_start = idx + 1 - num_warmup * int(
                    idx >= num_warmup)
                seconds_per_img = total_compute_time / iters_after_start
                if idx >= num_warmup * 2 or seconds_per_img > 5:
                    total_seconds_per_img = (time.perf_counter() -
                                             start_time) / iters_after_start
                    eta = datetime.timedelta(
                        seconds=int(total_seconds_per_img * (total - idx - 1)))
                    log_every_n_seconds(
                        logging.INFO,
                        "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}"
                        .format(idx + 1, total, seconds_per_img, str(eta)),
                        n=5,
                    )
            mean_loss = np.mean(losses)

            # Log to Comet
            self.experiment.log_metric("eval_loss", mean_loss)

            storage.put_scalar("eval_loss", mean_loss)
            comm.synchronize()

        # Returns an empty dict to satisfy the Detectron Eval Hook requirement
        return {}
Example 27
    def _do_loss_eval(self) -> float:
        """
        Evaluate the loss function on the validation set.

        Returns:
            mean_loss (float):  Value of the loss.
        """
        # Copying inference_on_dataset from evaluator.py
        num_samples: int = len(self._data_loader)
        self._logger.info("Starting validation on %d samples",
                          num_samples)
        num_warmup: int = min(5, num_samples - 1)

        start_time: float = time.perf_counter()
        total_compute_time: float = 0
        losses: List[float] = []
        for idx, inputs in enumerate(self._data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            # Inference for these inputs
            start_compute_time: float = time.perf_counter()
            loss_batch: float = self._get_loss(inputs)
            losses.append(loss_batch)
            if torch.cuda.is_available():
                torch.cuda.synchronize()

            total_compute_time += time.perf_counter() - start_compute_time
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)

            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                # Compute average time spent on each image.
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start

                # Compute ETA
                eta = datetime.timedelta(
                    seconds=int(total_seconds_per_img * (num_samples - idx - 1)))

                log_every_n_seconds(lvl=logging.INFO,
                                    msg=f"Loss on Validation done {idx + 1}/{num_samples}."\
                                        f" {seconds_per_img:.4f} s / img. ETA={eta}",
                                    n=100,
                                    name=__name__)

        # Average the losses.
        mean_loss = np.mean(losses)

        # Print the loss value.
        self._logger.info("Validation loss : {mean_loss}")

        # Store the loss value for it to be logged and displayed in TensorBoard.
        self.trainer.storage.put_scalar('validation_loss',
                                        mean_loss)
        comm.synchronize()

        return mean_loss
Example 28
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else []

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.iter = iteration

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter - 1
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (
                (iteration + 1) % 20 == 0 or iteration == max_iter - 1
            ):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
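
The loop above relies on comm.reduce_dict to average the per-worker loss dict onto the main process before logging. A conceptual sketch of that behavior, assumed to mirror detectron2.utils.comm.reduce_dict:

import torch
import torch.distributed as dist

def reduce_dict_sketch(input_dict, average=True):
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = sorted(input_dict.keys())  # identical key order on every rank
        values = torch.stack([input_dict[k] for k in names])
        dist.reduce(values, dst=0)         # element-wise sum onto rank 0
        if dist.get_rank() == 0 and average:
            values /= world_size
        return dict(zip(names, values))
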
Example 29
    def evaluate(self):
        if self._distributed:
            synchronize()
            self._predictions = all_gather(self._predictions)
            self._predictions = list(itertools.chain(*self._predictions))
            if not is_main_process():
                return

        return copy.deepcopy(self._eval_predictions())
Example 30
def distributed_worker(local_rank, main_func, nprocs, dist_url, args):
    dist.init_process_group(backend="gloo",
                            init_method=dist_url,
                            world_size=nprocs,
                            rank=local_rank)
    comm.synchronize()
    assert comm._LOCAL_PROCESS_GROUP is None
    pg = dist.new_group(list(range(nprocs)))
    comm._LOCAL_PROCESS_GROUP = pg
    main_func(*args)