def _distributed_worker(local_rank, main_func, world_size, num_gpus_per_machine,
                        machine_rank, dist_url, args):
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    try:
        dist.init_process_group(backend="NCCL", init_method=dist_url,
                                world_size=world_size, rank=global_rank)
    except Exception:
        logger = logging.getLogger(__name__)
        logger.error("Process group URL: {}".format(dist_url))
        raise  # re-raise with the original traceback intact
    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        # every rank must call new_group() for every machine's group, even if it is not a member
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
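# A minimal sketch (illustrative only, not part of the worker) of the rank
# bookkeeping above, assuming 2 machines with 4 GPUs each: global ranks are
# assigned machine-major, so machine 1 / local GPU 2 maps to global rank
# 1 * 4 + 2 = 6, and its local process group covers ranks [4, 5, 6, 7].
def _example_rank_layout(num_machines=2, num_gpus_per_machine=4):
    for machine_rank in range(num_machines):
        for local_rank in range(num_gpus_per_machine):
            global_rank = machine_rank * num_gpus_per_machine + local_rank
            print(f"machine {machine_rank}, gpu {local_rank} -> global rank {global_rank}")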
def evaluate_files(self):
    """
    Evaluate previously dumped prediction files, without running inference again.
    """
    if self._distributed:
        comm.synchronize()
        if not comm.is_main_process():
            return

    del self._predictions
    if self._output_dir:
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        self._predictions = torch.load(file_path)
        logger.info("Read predictions from {}".format(file_path))
    else:
        logger.warning(
            "Stored predictions are None; run inference_on_dataset first to produce them."
        )
        raise NotImplementedError

    self._results = OrderedDict()
    if "proposals" in self._predictions[0]:
        self._eval_box_proposals()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))
    if self._dump:
        _dump_to_markdown(self._dump_infos)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
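# The torch.load() above is the inverse of the dump performed by the evaluate()
# variants below; a hedged round-trip sketch with throwaway paths (the helper
# name is illustrative, not part of the evaluator):
def _example_prediction_round_trip():
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, "instances_predictions.pth")
        torch.save([{"instances": []}], path)
        assert "instances" in torch.load(path)[0]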
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        self._targets = comm.gather(self._targets, dst=0)
        self._targets = list(itertools.chain(*self._targets))
        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning("[ClassificationEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    assert len(self._predictions) == len(self._targets)
    if self._predictions[0] is not None:
        self._eval_classification_accuracy()
    if self._dump:
        _dump_to_markdown(self._dump_infos)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
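# A minimal sketch of the gather-then-flatten pattern used above: every rank
# holds a list of per-image results, comm.gather(..., dst=0) hands the main
# process a list of those lists, and itertools.chain flattens it back into one
# list. The helper name is hypothetical.
def _flatten_gathered(gathered_lists):
    # e.g. [[p0, p1], [p2], [p3, p4]] -> [p0, p1, p2, p3, p4]
    return list(itertools.chain(*gathered_lists))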
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        ensure_dir(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with megfile.smart_open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))
    if self._dump:
        _dump_to_markdown(self._dump_infos)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
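# A hedged note on the dump above: megfile.smart_open dispatches on the path
# scheme, so the same call can write instances_predictions.pth to local disk or
# to object storage (assuming megfile's remote backends are configured). A
# minimal sketch with an illustrative helper name:
def _save_predictions(predictions, file_path):
    with megfile.smart_open(file_path, "wb") as f:  # handles "s3://..." and local paths
        torch.save(predictions, f)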
def evaluate(self):
    comm.synchronize()
    self._predictions = comm.gather(self._predictions)
    self._predictions = list(itertools.chain(*self._predictions))
    if not comm.is_main_process():
        return

    # gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
    gt_json = self._metadata.panoptic_json
    gt_folder = self._metadata.panoptic_root

    with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
        logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
        for p in self._predictions:
            with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        with open(gt_json, "r") as f:
            json_data = json.load(f)
        json_data["annotations"] = self._predictions
        with megfile.smart_open(self._predictions_json, "w") as f:
            f.write(json.dumps(json_data))

        from panopticapi.evaluation import pq_compute

        with contextlib.redirect_stdout(io.StringIO()):
            pq_res = pq_compute(
                gt_json,
                self._predictions_json,
                gt_folder=gt_folder,
                pred_folder=pred_dir,
            )

    res = {}
    res["PQ"] = 100 * pq_res["All"]["pq"]
    res["SQ"] = 100 * pq_res["All"]["sq"]
    res["RQ"] = 100 * pq_res["All"]["rq"]
    res["PQ_th"] = 100 * pq_res["Things"]["pq"]
    res["SQ_th"] = 100 * pq_res["Things"]["sq"]
    res["RQ_th"] = 100 * pq_res["Things"]["rq"]
    res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
    res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
    res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]

    results = OrderedDict({"panoptic_seg": res})
    table = _print_panoptic_results(pq_res)
    if self._dump:
        dump_info_one_task = {
            "task": "panoptic_seg",
            "tables": [table],
        }
        _dump_to_markdown([dump_info_one_task])
    return results
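# A minimal sketch of the metric repackaging above: pq_compute returns fractions
# in [0, 1] keyed by "All" / "Things" / "Stuff", which the evaluator rescales to
# percentages with "_th" / "_st" suffixes. The helper name is hypothetical.
def _scale_pq(pq_res):
    out = {}
    for group, suffix in [("All", ""), ("Things", "_th"), ("Stuff", "_st")]:
        for metric in ("pq", "sq", "rq"):
            out[metric.upper() + suffix] = 100 * pq_res[group][metric]
    return out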
def _do_eval(self):
    results = self._func()

    if results:
        assert isinstance(results, dict), \
            "Eval function must return a dict. Got {} instead.".format(results)

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)
                )
        self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)

    # Evaluation may take different amounts of time among workers.
    # A barrier makes them start the next iteration together.
    comm.synchronize()
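# A hedged sketch of the flattening the hook relies on (assumption: this mirrors
# what flatten_results_dict does): nested dict keys are joined with "/" so they
# can be logged as scalar names, e.g. {"bbox": {"AP": 39.4}} -> {"bbox/AP": 39.4}.
def _flatten_results_dict_sketch(results, prefix=""):
    flat = {}
    for k, v in results.items():
        key = prefix + "/" + k if prefix else k
        if isinstance(v, dict):
            flat.update(_flatten_results_dict_sketch(v, key))
        else:
            flat[key] = v
    return flat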
def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0,
           dist_url=None, args=()):
    """
    Args:
        main_func: a function that will be called by `main_func(*args)`
        num_gpus_per_machine (int): the number of GPUs per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine (one per machine)
        dist_url (str): url to connect to for distributed training, including
            protocol, e.g. "tcp://127.0.0.1:8686". Can be set to "auto" to
            automatically select a free port on localhost.
        args (tuple): arguments passed to main_func
    """
    # set the NCCL related environment variables
    comm.configure_nccl()
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes
        if dist_url == "auto":
            port = _find_free_port()
            dist_url = f"tcp://127.0.0.1:{port}"

        mp.spawn(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args),
            daemon=False,
        )
        comm.synchronize()
    else:
        main_func(*args)
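# A hypothetical single-machine usage sketch for launch(), letting "auto" pick a
# free localhost port; _demo_main is a placeholder, not part of the original code.
def _demo_main():
    print("running on rank {}".format(comm.get_rank()))

if __name__ == "__main__":
    launch(
        _demo_main,
        num_gpus_per_machine=torch.cuda.device_count(),
        num_machines=1,
        machine_rank=0,
        dist_url="auto",
        args=(),
    )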
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "proposals" in self._predictions[0]:
        self._eval_box_proposals()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))
    if self._dump:
        extra_infos = {
            "title": os.path.basename(os.getcwd()),
            "seed": self.cfg.SEED,
        }
        _dump_to_markdown(extra_infos, self._dump_infos)
    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def evaluate(self):
    """
    Returns:
        dict: has a key "segm", whose value is a dict of "AP" and "AP50".
    """
    comm.synchronize()
    if comm.get_rank() > 0:
        return

    os.environ["CITYSCAPES_DATASET"] = os.path.abspath(
        os.path.join(self._metadata.gt_dir, "..", "..")
    )
    # Load the Cityscapes eval script *after* setting the required env var,
    # since the script reads CITYSCAPES_DATASET into global variables at load time.
    import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval

    self._logger.info("Evaluating results under {} ...".format(self._temp_dir))

    # set some global states in cityscapes evaluation API, before evaluating
    cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
    cityscapes_eval.args.predictionWalk = None
    cityscapes_eval.args.JSONOutput = False
    cityscapes_eval.args.colorized = False
    cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")

    # These lines are adopted from
    # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py  # noqa
    groundTruthImgList = glob.glob(cityscapes_eval.args.groundTruthSearch)
    assert len(groundTruthImgList), \
        "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
            cityscapes_eval.args.groundTruthSearch
        )
    predictionImgList = []
    for gt in groundTruthImgList:
        predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
    results = cityscapes_eval.evaluateImgLists(
        predictionImgList, groundTruthImgList, cityscapes_eval.args
    )["averages"]

    ret = OrderedDict()
    ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
    self._working_dir.cleanup()

    small_table = create_small_table(ret["segm"])
    self._logger.info("Evaluation results for segm: \n" + small_table)

    results_per_category = []
    for cat, ap in results["classes"].items():
        ap = [ap_i * 100 for ap_i in ap.values()]
        results_per_category.append([cat, *ap])
    table = tabulate(
        results_per_category,
        headers=["category", "AP", "AP50"],
        tablefmt="pipe",
        floatfmt=".3f",
        numalign="left",
    )
    self._logger.info("Per-category segm AP: \n" + table)

    if self._dump:
        dump_info_one_task = {
            "task": "segm",
            "tables": [small_table, table],
        }
        _dump_to_markdown([dump_info_one_task])
    return ret
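# A toy illustration of the per-category table built above: cityscapesscripts
# reports results["classes"] as {category: {"ap": ..., "ap50%": ...}} in
# fractions, and each row becomes [name, AP, AP50] in percent. The numbers are
# made up for the example.
def _example_per_category_table():
    classes = {"person": {"ap": 0.345, "ap50%": 0.612}}
    rows = [[cat, *[v * 100 for v in ap.values()]] for cat, ap in classes.items()]
    return tabulate(rows, headers=["category", "AP", "AP50"], tablefmt="pipe",
                    floatfmt=".3f", numalign="left")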
def preprocess_image(self, batched_inputs, training):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    bs = len(images)
    images = [self.normalizer(x) for x in images]
    images = ImageList.from_tensors(images, size_divisibility=0, pad_ref_long=True)

    # sync image size for all gpus
    comm.synchronize()
    if training and self.iter % self.change_iter == 0:
        if self.iter < self.max_iter - 20000:
            meg = torch.LongTensor(1).to(self.device)
            comm.synchronize()
            if comm.is_main_process():
                size = np.random.choice(self.multi_size)
                meg.fill_(size)
            if comm.get_world_size() > 1:
                comm.synchronize()
                dist.broadcast(meg, 0)
            self.size = meg.item()
            comm.synchronize()
        else:
            self.size = 608

    if training:
        # resize image inputs to the (synchronized) multi-scale training size
        modes = ['bilinear', 'nearest', 'bicubic', 'area']
        mode = modes[random.randrange(4)]
        if mode in ('bilinear', 'bicubic'):
            images.tensor = F.interpolate(images.tensor, size=[self.size, self.size],
                                          mode=mode, align_corners=False)
        else:
            images.tensor = F.interpolate(images.tensor, size=[self.size, self.size], mode=mode)

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [x["targets"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        # training batches are expected to carry instances; gt_instances must not be None here
        targets = [
            torch.cat([instance.gt_classes.float().unsqueeze(-1),
                       instance.gt_boxes.tensor], dim=-1)
            for instance in gt_instances
        ]
        labels = torch.zeros((bs, 100, 5))
        for i, target in enumerate(targets):
            labels[i][:target.shape[0]] = target
        labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
    else:
        labels = None

    self.iter += 1
    return images, labels
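# A minimal sketch of the size-broadcast trick above, stripped of evaluator
# state: the main process samples a value and broadcasts it so every rank
# resizes to the same resolution. The helper name is illustrative.
def _broadcast_choice(choices, device):
    meg = torch.LongTensor(1).to(device)
    if comm.is_main_process():
        meg.fill_(int(np.random.choice(choices)))
    if comm.get_world_size() > 1:
        dist.broadcast(meg, 0)
    return meg.item()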
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DefaultCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
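# A hedged sketch of the loss reduction used above (assumption: comm.reduce_dict
# behaves roughly like this, written here with all_reduce for symmetry): each
# loss tensor is averaged across workers for logging, while backward() still
# runs on the local, unreduced losses.
def _reduce_dict_sketch(loss_dict):
    world_size = comm.get_world_size()
    if world_size < 2:
        return loss_dict
    reduced = {}
    for k, v in loss_dict.items():
        t = v.detach().clone()
        dist.all_reduce(t)
        reduced[k] = t / world_size
    return reduced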
def evaluate(self):
    """
    Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

    * Mean intersection-over-union averaged across classes (mIoU)
    * Frequency Weighted IoU (fwIoU)
    * Mean pixel accuracy averaged across classes (mACC)
    * Pixel Accuracy (pACC)
    """
    if self._distributed:
        comm.synchronize()
        conf_matrix_list = comm.all_gather(self._conf_matrix)
        self._predictions = comm.all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return

        self._conf_matrix = np.zeros_like(self._conf_matrix)
        for conf_matrix in conf_matrix_list:
            self._conf_matrix += conf_matrix

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
        with PathManager.open(file_path, "w") as f:
            f.write(json.dumps(self._predictions))

    # `np.float` was removed from recent NumPy releases; use the builtin float dtype
    acc = np.zeros(self._num_classes, dtype=float)
    iou = np.zeros(self._num_classes, dtype=float)
    tp = self._conf_matrix.diagonal()[:-1].astype(float)
    pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
    class_weights = pos_gt / np.sum(pos_gt)
    pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
    acc_valid = pos_gt > 0
    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
    iou_valid = (pos_gt + pos_pred) > 0
    union = pos_gt + pos_pred - tp
    iou[iou_valid] = tp[iou_valid] / union[iou_valid]
    macc = np.sum(acc) / np.sum(acc_valid)
    miou = np.sum(iou) / np.sum(iou_valid)
    fiou = np.sum(iou * class_weights)
    pacc = np.sum(tp) / np.sum(pos_gt)

    res = {}
    res["mIoU"] = 100 * miou
    res["fwIoU"] = 100 * fiou
    res["mACC"] = 100 * macc
    res["pACC"] = 100 * pacc

    if self._output_dir:
        file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(res, f)

    results = OrderedDict({"sem_seg": res})
    small_table = create_small_table(res)
    self._logger.info("Evaluation results for sem_seg: \n" + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": "sem_seg",
            "tables": [small_table],
        }
        _dump_to_markdown([dump_info_one_task])
    return results
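# A tiny worked example of the confusion-matrix arithmetic above. The trailing
# row/column that the [:-1, :-1] slices drop is assumed to hold ignored pixels;
# for a perfect 2-class prediction every per-class IoU is 1.0, so mIoU == 100.
def _example_miou():
    conf = np.array([[10, 0, 0],
                     [0, 5, 0],
                     [0, 0, 0]], dtype=float)
    tp = conf.diagonal()[:-1]
    pos_gt = conf[:-1, :-1].sum(axis=0)
    pos_pred = conf[:-1, :-1].sum(axis=1)
    iou = tp / (pos_gt + pos_pred - tp)
    return 100 * iou.mean()  # -> 100.0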