Example #1
def train_model(dataset):
    # Export the dataset to COCO format
    export_file, image_dir = export_dataset(dataset)

    # Register it as a COCO dataset in the Detectron2 framework
    try:
        register_coco_instances('my_dataset', {}, export_file, image_dir)
    except Exception:  # raised when the dataset name is already registered
        print('Dataset was already registered')
    dataset_dicts = load_coco_json(export_file, image_dir)
    MetadataCatalog.get('my_dataset').set(
        thing_classes=[c['name'] for c in dataset.categories])
    segments_metadata = MetadataCatalog.get('my_dataset')
    print(segments_metadata)

    # Configure the training run
    cfg = get_cfg()
    cfg.merge_from_file(
        model_zoo.get_config_file(
            'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml'))
    cfg.DATASETS.TRAIN = ('my_dataset', )
    cfg.DATASETS.TEST = ()
    cfg.INPUT.MASK_FORMAT = 'bitmask'
    cfg.DATALOADER.NUM_WORKERS = 2
    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
        'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml'
    )  # Let training initialize from model zoo
    cfg.SOLVER.IMS_PER_BATCH = 4  # number of images per batch
    cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
    cfg.SOLVER.MAX_ITER = 6000  # increase for a larger dataset, decrease if the model converges earlier
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512  # RoI proposals sampled per image (default: 512)
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = len(
        dataset.categories)  # number of categories
    #     cfg.MODEL.DEVICE = 'cuda'
    print('Max iter is ', cfg.SOLVER.MAX_ITER)
    # Start the training
    os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()

    # Return the model
    cfg.MODEL.WEIGHTS = os.path.join(cfg.OUTPUT_DIR, 'model_final.pth')
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # set the testing threshold for this model
    cfg.DATASETS.TEST = ('my_dataset', )
    cfg.TEST.DETECTIONS_PER_IMAGE = 1000

    built_model = build_model(cfg)  # returns a torch.nn.Module
    DetectionCheckpointer(built_model).load(
        cfg.MODEL.WEIGHTS)  # load the trained weights
    checkpointer = DetectionCheckpointer(
        built_model, save_dir="/content/gdrive/My Drive/Colab Notebooks")
    checkpointer.save("model_final")  # writes model_final.pth to the save_dir above

    predictor = DefaultPredictor(cfg)
    model = Model(predictor)

    return model
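# A minimal inference sketch for the DefaultPredictor created inside train_model().
# The image path and output path are illustrative; DefaultPredictor expects a BGR
# numpy array, and `metadata` is the MetadataCatalog entry set above.
import cv2
from detectron2.utils.visualizer import Visualizer

def visualize_prediction(predictor, metadata, image_path='example.jpg'):
    im = cv2.imread(image_path)
    outputs = predictor(im)  # dict with an 'instances' field
    v = Visualizer(im[:, :, ::-1], metadata=metadata)  # Visualizer works on RGB
    vis = v.draw_instance_predictions(outputs['instances'].to('cpu'))
    cv2.imwrite('prediction.jpg', vis.get_image()[:, :, ::-1])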
Example #2
def save(self, dst):
    try:
        from detectron2.checkpoint import DetectionCheckpointer  # noqa # pylint: disable=unused-import
        from detectron2.config import get_cfg
    except ImportError:
        raise MissingDependencyException(
            "Detectron package is required to use DetectronArtifact")
    os.makedirs(dst, exist_ok=True)
    checkpointer = DetectionCheckpointer(self._model, save_dir=dst)
    checkpointer.save(self._file_name)
    cfg = get_cfg()
    cfg.merge_from_file(self._input_model_yaml)
    with open(os.path.join(dst, f"{self._file_name}.yaml"),
              'w',
              encoding='utf-8') as output_file:
        output_file.write(cfg.dump())
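# A hedged sketch of the matching load path for the artifact saved above, assuming
# the layout save() produces (<file_name>.yaml next to <file_name>.pth); the names
# `dst` and `file_name` are illustrative.
def load_detectron_artifact(dst, file_name):
    from detectron2.checkpoint import DetectionCheckpointer
    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    cfg = get_cfg()
    cfg.merge_from_file(os.path.join(dst, f"{file_name}.yaml"))
    model = build_model(cfg)  # empty model built from the dumped config
    DetectionCheckpointer(model).load(os.path.join(dst, f"{file_name}.pth"))
    return model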
def run_train():
    torch.multiprocessing.freeze_support()

    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file('COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml'))
    # cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.75  # Threshold
    cfg.MODEL.WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_101_FPN_3x/137851257/model_final_f6e8b1.pkl"
    cfg.MODEL.DEVICE = "cpu"  # cpu or cuda

    register_datasets()
    cfg.DATASETS.TRAIN = ('grini_nc_merged_bbox_only_train',)
    cfg.DATASETS.TEST = ('grini_nc_merged_bbox_only_val',)

    # cfg.MODEL.WEIGHTS = get_checkpoint_url('COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml')

    # TODO: find out how to rescale images and annotations first
    # Parameters fixed
    cfg.SOLVER.IMS_PER_BATCH = 4
    cfg.SOLVER.BASE_LR = 0.001
    cfg.SOLVER.WARMUP_ITERS = 1000
    cfg.SOLVER.MAX_ITER = 1500  # adjust up if val mAP is still rising, adjust down if overfit
    cfg.SOLVER.STEPS = (1000, 1500)
    cfg.SOLVER.GAMMA = 0.05
    cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 12
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3

    cfg.TEST.EVAL_PERIOD = 500

    # makedirs(cfg.OUTPUT_DIR, exist_ok=True)
    setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="my_model")

    # DetectionCheckpointer(cfg).load(file_path_or_url)  # load a file, usually from cfg.MODEL.WEIGHTS

    checkpointer = DetectionCheckpointer(build_model(cfg), save_dir=cfg.OUTPUT_DIR)
    checkpointer.save("model_faster_rcnn_unscaled")  # writes model_faster_rcnn_unscaled.pth to cfg.OUTPUT_DIR
    trainer = CocoTrainer(cfg)
    trainer.resume_or_load(resume=False)
    trainer.train()
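# `CocoTrainer` above is not defined in this snippet. A minimal sketch, assuming it
# only adds COCO-style evaluation of cfg.DATASETS.TEST to DefaultTrainer.
from detectron2.engine import DefaultTrainer
from detectron2.evaluation import COCOEvaluator

class CocoTrainer(DefaultTrainer):
    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "coco_eval")
        return COCOEvaluator(dataset_name, output_dir=output_folder)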
def start_train(al_cfg, cfg, model, resume=False):
    early_stopping = EarlyStopping(patience=al_cfg.EARLY_STOP.PATIENCE,
                                   delta=al_cfg.EARLY_STOP.DELTA,
                                   verbose=True)
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            if (cfg.TEST.EVAL_PERIOD > 0
                    and iteration % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter):
                results = do_test(cfg, model)
                bbox_results = results['bbox']
                AP = bbox_results['AP']
                comm.synchronize()
                print('AP:', AP, '\tValue:', 1 - (AP / 100))
                early_stopping(1 - (AP / 100))
                storage.put_scalars(**bbox_results)
                if early_stopping.counter < 1:
                    checkpointer.save('model_final')

            if iteration - start_iter > 5 and (iteration % 20 == 0
                                               or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

            if early_stopping.early_stop:
                print("EARLY STOPPING INITIATED AT ITERATION:", iteration)
                # checkpointer.save('model_final')
                break
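# `EarlyStopping` is not shown above. A minimal sketch, assuming it tracks a value
# to minimize (start_train passes 1 - AP/100) and exposes the `counter` and
# `early_stop` attributes the way start_train() uses them.
class EarlyStopping:
    def __init__(self, patience=5, delta=0.0, verbose=False):
        self.patience = patience
        self.delta = delta
        self.verbose = verbose
        self.best = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, value):
        if self.best is None or value < self.best - self.delta:
            self.best = value  # improvement: remember it and reset the counter
            self.counter = 0
        else:
            self.counter += 1  # no improvement
            if self.verbose:
                print(f"EarlyStopping counter: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True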
def do_train(cfg, model, cat_heatmap_file, resume=False):
    model.train()

    # select optimizer and learning rate scheduler based on the config
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # create the checkpointer
    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    # create output writers. Separate TensorBoard writers are created
    # for train and validation sets. This allows easy overlaying of graphs
    # in TensorBoard.
    train_tb_dir = os.path.join(cfg.OUTPUT_DIR, 'train')
    val_tb_dir = os.path.join(cfg.OUTPUT_DIR, 'val')
    train_writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(train_tb_dir),
        ]
        if comm.is_main_process()
        else []
    )
    val_writers = [TensorboardXWriter(val_tb_dir)]


    train_dataset_name = cfg.DATASETS.TRAIN[0]
    train_data_loader = build_detection_train_loader(cfg)
    train_eval_data_loader = build_detection_test_loader(cfg, train_dataset_name)
    val_dataset_name = cfg.DATASETS.TEST[0]
    val_eval_data_loader = build_detection_test_loader(cfg, val_dataset_name, DatasetMapper(cfg,True))
    logger.info("Starting training from iteration {}".format(start_iter))
    train_storage = EventStorage(start_iter)
    val_storage = EventStorage(start_iter)

    # Create the training and validation evaluator objects.
    train_evaluator = get_evaluator(
        cfg, train_dataset_name, os.path.join(cfg.OUTPUT_DIR, "train_inference", train_dataset_name),
        cat_heatmap_file
    )
    val_evaluator = get_evaluator(
        cfg, val_dataset_name, os.path.join(cfg.OUTPUT_DIR, "val_inference", val_dataset_name),
        cat_heatmap_file
    )

    # initialize the best AP50 value
    best_AP50 = 0
    start_time = time.time()
    for train_data, iteration in zip(train_data_loader, range(start_iter, max_iter)):
         # stop if the file stop_running exists in the running directory
         if os.path.isfile('stop_running'):
             os.remove('stop_running')
             break

         iteration = iteration + 1

         # run a step with the training data
         with train_storage as storage:
            model.train()
            storage.step()

            loss_dict = model(train_data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()


            # periodically evaluate the training set and write the results
            if (cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter):

                train_eval_results = inference_on_dataset(model, train_eval_data_loader,
                                                          train_evaluator)
                flat_results = flatten_results(train_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in train_writers:
                    writer.write()
            periodic_checkpointer.step(iteration)

         # run a step with the validation set
         with val_storage as storage:
            storage.step()

            # every 20 iterations evaluate the dataset to collect the loss
            if iteration % 20 == 0 or iteration == max_iter:
                with torch.no_grad():
                    for inputs, _ in zip(val_eval_data_loader, range(1)):
                        loss_dict = model(inputs)
                        losses = sum(loss for loss in loss_dict.values())
                        assert torch.isfinite(losses).all(), loss_dict

                        loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
                        losses_reduced = sum(loss for loss in loss_dict_reduced.values())

                if comm.is_main_process():
                    storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            # periodically evaluate the validation set and write the results
            # check the results against the best results seen and save the parameters for
            # the best result
            if (cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                or iteration == max_iter):
                val_eval_results = inference_on_dataset(model, val_eval_data_loader,
                                                        val_evaluator)
                logger.info('val_eval_results {}'.format(val_eval_results))
                results = val_eval_results.get('segm', None)
                if results is None:
                    results = val_eval_results.get('bbox', None)
                if results is not None and results.get('AP50',-1) > best_AP50:
                    best_AP50 = results['AP50']
                    logger.info('saving best results ({}), iter {}'.format(best_AP50, iteration))
                    checkpointer.save("best_AP50")

                flat_results = flatten_results(val_eval_results)
                storage.put_scalars(**flat_results)
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0):
                for writer in val_writers:
                    writer.write()
                elapsed = time.time() - start_time
                time_per_iter = elapsed / (iteration - start_iter)
                time_left = time_per_iter * (max_iter - iteration)
                logger.info("ETA: {}".format(str(datetime.timedelta(seconds=time_left))))
Example #6
class TrainingModule(LightningModule):
    def __init__(self, cfg):
        super().__init__()
        if not logger.isEnabledFor(logging.INFO):  # setup_logger is not called for d2
            setup_logger()
        self.cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size())
        self.storage: EventStorage = None
        self.model = build_model(self.cfg)

        self.start_iter = 0
        self.max_iter = cfg.SOLVER.MAX_ITER

    def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
        checkpoint["iteration"] = self.storage.iter

    def on_load_checkpoint(self, checkpointed_state: Dict[str, Any]) -> None:
        self.start_iter = checkpointed_state["iteration"]
        self.storage.iter = self.start_iter

    def setup(self, stage: str):
        if self.cfg.MODEL.WEIGHTS:
            self.checkpointer = DetectionCheckpointer(
                # Assume you want to save checkpoints together with logs/statistics
                self.model,
                self.cfg.OUTPUT_DIR,
            )
            logger.info(f"Load model weights from checkpoint: {self.cfg.MODEL.WEIGHTS}.")
            # Only load weights, use lightning checkpointing if you want to resume
            self.checkpointer.load(self.cfg.MODEL.WEIGHTS)

        self.iteration_timer = hooks.IterationTimer()
        self.iteration_timer.before_train()
        self.data_start = time.perf_counter()
        self.writers = None

    def training_step(self, batch, batch_idx):
        data_time = time.perf_counter() - self.data_start
        # Need to manually enter/exit since trainer may launch processes
        # This ideally belongs in setup, but setup seems to run before processes are spawned
        if self.storage is None:
            self.storage = EventStorage(0)
            self.storage.__enter__()
            self.iteration_timer.trainer = weakref.proxy(self)
            self.iteration_timer.before_step()
            self.writers = (
                default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
                if comm.is_main_process()
                else {}
            )

        loss_dict = self.model(batch)
        SimpleTrainer.write_metrics(loss_dict, data_time)

        opt = self.optimizers()
        self.storage.put_scalar(
            "lr", opt.param_groups[self._best_param_group_id]["lr"], smoothing_hint=False
        )
        self.iteration_timer.after_step()
        self.storage.step()
        # A little odd to put before step here, but it's the best way to get a proper timing
        self.iteration_timer.before_step()

        if self.storage.iter % 20 == 0:
            for writer in self.writers:
                writer.write()
        return sum(loss_dict.values())

    def training_step_end(self, training_step_outputs):
        self.data_start = time.perf_counter()
        return training_step_outputs

    def training_epoch_end(self, training_step_outputs):
        self.iteration_timer.after_train()
        if comm.is_main_process():
            self.checkpointer.save("model_final")
        for writer in self.writers:
            writer.write()
            writer.close()
        self.storage.__exit__(None, None, None)

    def _process_dataset_evaluation_results(self) -> OrderedDict:
        results = OrderedDict()
        for idx, dataset_name in enumerate(self.cfg.DATASETS.TEST):
            results[dataset_name] = self._evaluators[idx].evaluate()
            if comm.is_main_process():
                print_csv_format(results[dataset_name])

        if len(results) == 1:
            results = list(results.values())[0]
        return results

    def _reset_dataset_evaluators(self):
        self._evaluators = []
        for dataset_name in self.cfg.DATASETS.TEST:
            evaluator = build_evaluator(self.cfg, dataset_name)
            evaluator.reset()
            self._evaluators.append(evaluator)

    def on_validation_epoch_start(self):
        self._reset_dataset_evaluators()

    def validation_epoch_end(self, _outputs):
        results = self._process_dataset_evaluation_results()

        flattened_results = flatten_results_dict(results)
        for k, v in flattened_results.items():
            try:
                v = float(v)
            except Exception as e:
                raise ValueError(
                    "[EvalHook] eval_function should return a nested dict of float. "
                    "Got '{}: {}' instead.".format(k, v)
                ) from e
        self.storage.put_scalars(**flattened_results, smoothing_hint=False)

    def validation_step(self, batch, batch_idx: int, dataloader_idx: int = 0) -> None:
        if not isinstance(batch, List):
            batch = [batch]
        outputs = self.model(batch)
        self._evaluators[dataloader_idx].process(batch, outputs)

    def configure_optimizers(self):
        optimizer = build_optimizer(self.cfg, self.model)
        self._best_param_group_id = hooks.LRScheduler.get_best_param_group_id(optimizer)
        scheduler = build_lr_scheduler(self.cfg, optimizer)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
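# A hedged driver sketch for the LightningModule above, assuming a recent
# pytorch_lightning release; the Trainer flags and the plain detectron2 train
# loader are illustrative (the upstream example wraps its loaders in a DataModule).
import pytorch_lightning as pl
from detectron2.data import build_detection_train_loader

def train_with_lightning(cfg):
    module = TrainingModule(cfg)
    train_loader = build_detection_train_loader(cfg)
    trainer = pl.Trainer(
        max_steps=cfg.SOLVER.MAX_ITER,  # iteration-based, matching detectron2 schedules
        accelerator="auto",
        devices=1,
        num_sanity_val_steps=0,
    )
    trainer.fit(module, train_dataloaders=train_loader)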
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpointer = DetectionCheckpointer(
    #     model, cfg.OUTPUT_DIR,
    #     optimizer=optimizer,
    #     scheduler=scheduler
    # )
    # do not load the optimizer and scheduler state from the checkpoint
    checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)

    #model.load_state_dict(optimizer)

    max_iter = cfg.SOLVER.MAX_ITER

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    train_data_loader = build_detection_train_loader(
        cfg, mapper=PathwayDatasetMapper(cfg, True))

    # epoch_data_loader = build_detection_test_loader(cfg=cfg, dataset_name= cfg.DATASETS.TRAIN[0],
    #                                           mapper=PathwayDatasetMapper(cfg, True))

    val_data_loader = build_detection_validation_loader(
        cfg=cfg,
        dataset_name=cfg.DATASETS.TEST[0],
        mapper=PathwayDatasetMapper(cfg, False))

    if cfg.DATALOADER.ASPECT_RATIO_GROUPING:
        epoch_num = (train_data_loader.dataset.sampler._size //
                     cfg.SOLVER.IMS_PER_BATCH) + 1
    else:
        epoch_num = train_data_loader.dataset.sampler._size // cfg.SOLVER.IMS_PER_BATCH

    # periodic_checkpointer = PeriodicCheckpointer(
    #     checkpointer,
    #     #cfg.SOLVER.CHECKPOINT_PERIOD,
    #     epoch_num,
    #     max_iter=max_iter
    # )

    logger.info("Starting training from iteration {}".format(start_iter))
    loss_weights = {'loss_cls': 1, 'loss_box_reg': 1}
    with EventStorage(start_iter) as storage:
        loss_per_epoch = 0.0
        best_loss = 99999.0
        best_val_loss = 99999.0
        better_train = False
        better_val = False
        for data, iteration in zip(train_data_loader,
                                   range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item() * loss_weights[k]
                for k, v in comm.reduce_dict(loss_dict).items()
            }

            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            # prevent gradient explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            #if comm.is_main_process():
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()

            # if (
            #     # cfg.TEST.EVAL_PERIOD > 0
            #     # and
            #         iteration % epoch_num == 0
            #         #iteration % cfg.TEST.EVAL_PERIOD == 0
            #     and iteration != max_iter
            # ):
            #     do_test(cfg, model)
            #     # Compared to "train_net.py", the test results are not dumped to EventStorage
            #     comm.synchronize()

            loss_per_epoch += losses_reduced
            if iteration % epoch_num == 0 or iteration == max_iter:
                #one complete epoch
                epoch_loss = loss_per_epoch / epoch_num
                #do validation
                #epoch_loss, epoch_cls_loss, epoch_box_reg_loss = do_validation(epoch_data_loader, model, loss_weights)
                #val_loss, val_cls_loss, val_box_reg_loss = do_validation(val_data_loader, model, loss_weights)
                checkpointer.save("model_{:07d}".format(iteration),
                                  **{"iteration": iteration})
                # calculate epoch_loss and push to history cache
                #if comm.is_main_process():
                storage.put_scalar("epoch_loss",
                                   epoch_loss,
                                   smoothing_hint=False)
                # storage.put_scalar("epoch_cls_loss", epoch_cls_loss, smoothing_hint=False)
                # storage.put_scalar("epoch_box_reg_loss", epoch_box_reg_loss, smoothing_hint=False)
                # storage.put_scalar("val_loss", val_loss, smoothing_hint=False)
                # storage.put_scalar("val_cls_loss", val_cls_loss, smoothing_hint=False)
                # storage.put_scalar("val_box_reg_loss", val_box_reg_loss, smoothing_hint=False)

                for writer in writers:
                    writer.write()

                # only save improved checkpoints on epoch_loss
                # if best_loss > epoch_loss:
                #     best_loss = epoch_loss
                #     better_train = True
                # if best_val_loss > val_loss:
                #     best_val_loss = val_loss
                #     better_val = True
                #if better_val:
                #checkpointer.save("model_{:07d}".format(iteration),  **{"iteration": iteration})
                #comm.synchronize()
                #reset loss_per_epoch
                loss_per_epoch = 0.0
                # better_train = False
                # better_val = False
            del loss_dict, losses, losses_reduced, loss_dict_reduced
            torch.cuda.empty_cache()
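# `build_detection_validation_loader` and `PathwayDatasetMapper` above are
# project-specific and not shown. A minimal sketch of the loader, assuming it is a
# thin wrapper around detectron2's test loader with a custom mapper.
from detectron2.data import build_detection_test_loader

def build_detection_validation_loader(cfg, dataset_name, mapper=None):
    # the test loader already iterates the dataset once, unshuffled, one image per
    # batch, which is what the per-epoch validation above needs
    return build_detection_test_loader(cfg, dataset_name, mapper=mapper)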
Example #8
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )
    min_size = cfg.INPUT.MIN_SIZE_TRAIN
    max_size = cfg.INPUT.MAX_SIZE_TRAIN
    sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
    data_loader = build_detection_train_loader(
        cfg,
        mapper=DatasetMapper(
            cfg,
            is_train=True,
            augmentations=[
                T.ResizeShortestEdge(min_size, max_size, sample_style),
                T.RandomApply(T.RandomFlip(prob=1, vertical=False), prob=0.5),
                T.RandomApply(T.RandomRotation(angle=[180], sample_style='choice'), prob=0.1),
                T.RandomApply(T.RandomRotation(angle=[-10, 10], sample_style='range'), prob=0.9),
                T.RandomApply(T.RandomBrightness(0.5, 1.5), prob=0.5),
                T.RandomApply(T.RandomContrast(0.5, 1.5), prob=0.5),
            ]))
    best_model_weight = copy.deepcopy(model.state_dict())
    best_val_loss = None
    data_val_loader = build_detection_test_loader(cfg,
                                                  cfg.DATASETS.TEST[0],
                                                  mapper = DatasetMapper(cfg, True))
    logger.info("Starting training from iteration {}".format(start_iter))

    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration += 1
            start = time.time()
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                logger.setLevel(logging.CRITICAL)
                print('validating')
                val_total_loss = do_val_monitor(cfg, model, data_val_loader)
                logger.setLevel(logging.DEBUG)
                logger.info(f"validation loss of iteration {iteration}th: {val_total_loss}")
                storage.put_scalar(name = 'val_total_loss', value = val_total_loss)
                
                if best_val_loss is None or val_total_loss < best_val_loss:
                  best_val_loss = val_total_loss
                  best_model_weight = copy.deepcopy(model.state_dict())

                comm.synchronize()
            
            # TODO: add a checkpointer that saves the best model based on the validation loss
            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            
    model.load_state_dict(best_model_weight)
    experiment_name = os.getenv('MLFLOW_EXPERIMENT_NAME')
    checkpointer.save(f'model_{experiment_name}')
    return model
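# `do_val_monitor` above is not defined in this snippet. A hedged sketch: keep the
# model in training mode so it returns a loss dict, disable gradients, and average
# the total loss over the validation loader (`cfg` is kept to match the call above).
def do_val_monitor(cfg, model, data_loader):
    total_loss, num_batches = 0.0, 0
    with torch.no_grad():
        for inputs in data_loader:
            loss_dict = model(inputs)  # training-mode forward returns losses
            total_loss += sum(v.item() for v in loss_dict.values())
            num_batches += 1
    return total_loss / max(num_batches, 1)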
# %%
# Storing the checkpoint
# -----------------------
# Now, we can use the pre-trained backbone from the Detectron2 model. The code
# below shows how to save it as a Detectron2 checkpoint called `my_model.pth`.

# get the first module from the backbone (i.e. the detectron2 ResNet)
# backbone:
#     L ResNet50
#     L SelectStage
#     L AdaptiveAvgPool2d
detmodel.backbone.bottom_up = simclr_backbone[0]

checkpointer = DetectionCheckpointer(detmodel, save_dir='./')
checkpointer.save('my_model')

# %%
# Finetuning with Detectron2
# ---------------------------
#
# The checkpoint from above can now be used by any Detectron2 script. For example,
# you can use the `train_net.py` script in the Detectron2 `tools` directory:
#
# .. code-block:: none
#
#   python train_net.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
#       MODEL.WEIGHTS path/to/my_model.pth
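# %%
# The same finetuning can also be started programmatically. A minimal sketch,
# assuming the checkpoint saved above sits at ./my_model.pth and that a training
# dataset has been registered (the name 'my_coco_train' is a placeholder).
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(
    'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml'))
cfg.MODEL.WEIGHTS = './my_model.pth'  # the checkpoint written above
cfg.DATASETS.TRAIN = ('my_coco_train',)
cfg.DATASETS.TEST = ()

trainer = DefaultTrainer(cfg)
trainer.resume_or_load(resume=False)
trainer.train()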
def do_train(cfg, model, resume=False, patience=20):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    scheduler2 = ReduceLROnPlateau(optimizer, mode="max")

    # warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=200)

    checkpointer = DetectionCheckpointer(model,
                                         cfg.OUTPUT_DIR,
                                         optimizer=optimizer,
                                         scheduler=scheduler)
    start_iter = (checkpointer.resume_or_load(
        cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(checkpointer,
                                                 cfg.SOLVER.CHECKPOINT_PERIOD,
                                                 max_iter=max_iter)

    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement in a small training loop
    data_loader = build_detection_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    best_ap50 = 0
    best_iteration = 0
    patience_counter = 0
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            storage.step()

            loss_dict = model(data)
            losses = sum(loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {
                k: v.item()
                for k, v in comm.reduce_dict(loss_dict).items()
            }
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced,
                                    **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr",
                               optimizer.param_groups[0]["lr"],
                               smoothing_hint=False)
            scheduler.step()
            # warmup_scheduler.dampen(iteration)

            if (cfg.TEST.EVAL_PERIOD > 0
                    and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0
                    and iteration != max_iter - 1):
                test_results = do_test(cfg, model)

                # scheduler2.step(test_results["bbox"]["AP50"])
                # early stopping.

                # save checkpoint to disk

                checkpointer.save(f"model_{iteration}")

                # TODO: restore from best model
                if test_results["bbox"]["AP50"] > best_ap50:
                    best_ap50 = test_results["bbox"]["AP50"]
                    best_iteration = iteration
                    # reset patience counter
                    patience_counter = 0
                    logger.info(f"Patience counter reset.")
                else:
                    patience_counter += 1
                    logger.info(
                        f"Patience counter increased to {patience_counter}; "
                        f"training stops once it exceeds {patience}"
                    )
                    if patience_counter > patience:
                        for writer in writers:
                            writer.write()
                        # restore to best checkpoint

                        checkpointer.load(
                            f"{cfg.OUTPUT_DIR}/model_{best_iteration}.pth")

                        break
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0
                                               or iteration == max_iter - 1):
                for writer in writers:
                    writer.write()
            # periodic_checkpointer.step(iteration)
        checkpointer.save(f"model_final")
def do_train(cfg, model, resume=False):

    #start the training
    model.train()

    #configuration of the model based on the cfg
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    # checkpointer configuration
    checkpointer = DetectionCheckpointer(model,cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler)

    # the starting iteration depends on whether we resume from a checkpoint
    if not resume:
        start_iter = 1
    else:
        start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1)

    #Number of iterations
    max_iter = cfg.SOLVER.MAX_ITER

    # periodic checkpointers
    periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)
    checkpointer_best = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler)
    periodic_checkpointer_best = PeriodicCheckpointer(checkpointer_best, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter)

    # writers
    writers = ([
        CommonMetricPrinter(max_iter),
        JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
        TensorboardXWriter(cfg.OUTPUT_DIR),
    ] if comm.is_main_process() else [])

    # create the data loader for the training set defined in the cfg
    data_loader = build_detection_train_loader(cfg)

    # log where training starts from
    logger.info("Starting training from iteration {}".format(start_iter))

    # recover the best validation loss from a previous session when resuming
    if resume:
        print('Obtaining best val loss from previous session')
        best_loss = np.loadtxt(os.path.join(cfg.OUTPUT_DIR, "best_validation_loss.txt"))
        print('Previous best total val loss is %s' % best_loss)
    else:
        best_loss = float('inf')

    # the patience list stores the validation losses seen during training
    patience_list = []
    patience_list.append(best_loss)

    dataset_size = cfg.NUMBER_IMAGES_TRAINING
    print("training set size is %s" % dataset_size)
    iteration_batch_ratio = int(round(dataset_size / cfg.SOLVER.IMS_PER_BATCH))
    print("%s minibatches are considered an entire epoch" % iteration_batch_ratio)

    with EventStorage(start_iter) as storage:
        if resume:
            iteration = start_iter
        else:
            start_iter = 1
            iteration = 1

        minibatch = 0

        for data, miniepoch in zip(data_loader, range(start_iter * iteration_batch_ratio, max_iter * iteration_batch_ratio)):

            minibatch = minibatch + 1
            if minibatch == iteration_batch_ratio:
                minibatch = 0
                iteration = iteration + 1


            storage.step()

            loss_dict = model(data)
            #print (loss_dict)
            #print ('SPACE')

            losses = sum(loss for loss in loss_dict.values())
            #print (losses)
            #print ('SPACE')

            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            #print ('SPACE')

            #get the total loss
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())

            if minibatch == 0:
                print ("Minibatch %s / %s" %(minibatch, iteration_batch_ratio))
                print ("iteration %s / %s" %(iteration, max_iter))
                print ('Total losses %s \n' %losses_reduced)
                print (loss_dict_reduced)

            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)

            scheduler.step()

            # evaluate the model on the validation set
            if (cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter and minibatch == 0):

                results, loss_val = do_test(cfg, model)
                patience_list.append(loss_val)
                #Compared to "train_net.py", the test results are not dumped to EventStorage

                if loss_val < best_loss:
                    print('saving best model')
                    best_loss = loss_val
                    array_loss = np.array([best_loss])

                    # save the best model and its validation loss
                    checkpointer_best.save('best_model')
                    np.savetxt(os.path.join(cfg.OUTPUT_DIR, "best_validation_loss.txt"), array_loss, delimiter=',')


                if len(patience_list) > cfg.patience + cfg.warm_up_patience:
                    print('Checking val losses...')

                    # value obtained (patience) evaluations ago
                    item_patience = patience_list[-cfg.patience]
                    continue_training = False

                    # check whether the val loss has improved since then
                    for i in range(1, cfg.patience):
                        item_to_check = patience_list[-i]
                        if item_to_check < item_patience:
                            continue_training = True

                    if continue_training:
                        print('The val loss has improved')

                    else:
                        print('The val loss has not improved. Stopping training')
                        # print the validation losses
                        print(patience_list)

                        # plot the evolution of the validation loss
                        plt.plot(range(1, len(patience_list) + 1, 1), patience_list)
                        plt.xlabel('iterations')
                        plt.ylabel('validation loss')
                        plt.title('Evolution validation loss: \n min val loss: '
                                  + str(min(patience_list)))

                        # save the plot
                        plt.savefig(os.path.join(cfg.OUTPUT_DIR, 'evolution_val_loss.png'))
                        break

                comm.synchronize()

            # if iteration - start_iter > cfg.TEST.EVAL_PERIOD and (iteration % cfg.TEST.EVAL_PERIOD == 0 or iteration == max_iter):
            #   for writer in writers:
            #     writer.write()

            if minibatch == 1:
                periodic_checkpointer.step(iteration)