Ejemplo n.º 1
0
    def compute(self) -> Any:
        """
        Build the final confusion matrix from accumulated statistics.

        Returns:
            Confusion matrix of K rows and K columns, where rows correspond
            to ground-truth targets and columns correspond to predicted
            targets.
        """
        # ddp hotfix, could be done better
        # but metric must handle DDP on it's own
        if self._ddp_backend == "xla":
            # if you have "RuntimeError: Aborted: Session XXX is not found" here
            # please, ask Google for a more powerful TPU setup ;)
            local = torch.tensor([self.conf], device=get_device())
            self.conf = xm.all_gather(local).sum(0).cpu().detach().numpy()
        elif self._ddp_backend == "ddp":
            gathered: List[np.ndarray] = all_gather(self.conf)
            self.conf = np.sum(np.stack(gathered, axis=0), axis=0)

        if not self.normalized:
            return self.conf
        # row-normalize; clip protects empty rows from division by zero
        matrix = self.conf.astype(np.float32)
        return matrix / matrix.sum(1).clip(min=1e-12)[:, None]
Ejemplo n.º 2
0
    def compute(self) -> Tuple[torch.Tensor, float, float, float]:
        """Computes per-class, micro, macro and weighted AUC from saved statistics."""
        all_targets = torch.cat(self.targets)
        all_scores = torch.cat(self.scores)

        # ddp hotfix, could be done better
        # but metric must handle DDP on it's own
        if self._ddp_backend == "xla":
            # if you have "RuntimeError: Aborted: Session XXX is not found" here
            # please, ask Google for a more powerful TPU setup ;)
            device = get_device()
            all_scores = xm.all_gather(all_scores.to(device)).cpu().detach()
            all_targets = xm.all_gather(all_targets.to(device)).cpu().detach()
        elif self._ddp_backend == "ddp":
            all_scores = torch.cat(all_gather(all_scores))
            all_targets = torch.cat(all_gather(all_targets))

        all_scores, all_targets, _, _ = process_multilabel_components(
            outputs=all_scores, targets=all_targets
        )
        per_class = auc(scores=all_scores, targets=all_targets)
        micro = binary_auc(
            scores=all_scores.view(-1), targets=all_targets.view(-1)
        )[0]
        macro = per_class.mean().item()
        # class frequencies used as weights for the weighted average
        weights = all_targets.sum(axis=0) / len(all_targets)
        weighted = (per_class * weights).sum().item()
        if not self.compute_per_class_metrics:
            return [], micro, macro, weighted
        return per_class, micro, macro, weighted
Ejemplo n.º 3
0
    def __init__(self, config: dict):
        """
        Set up the inference experiment from a configuration dict.

        Args:
            config (dict): experiment configuration; reads the keys
                "io_params" (in_dir, test_size, split_seed, batch_size,
                num_workers) and "with_masks".

        Attributes:
            config (dict): raw configuration, kept for reuse
            io_params (dict): the "io_params" sub-dict of ``config``
            case_list: case ids discovered by ``setup_im_ids``
            test_dset (torch.data.Dataset): dataset over the test ids
            loaders (dict): loaders built by ``get_loaders``
            model (torch.nn.Module): model moved to the detected device
        """
        # for reuse
        self.config = config
        self.io_params = config["io_params"]
        # initializing the experiment components
        self.case_list = self.setup_im_ids()
        if config["with_masks"]:
            # last split element holds the held-out test ids
            test_ids = self.get_split()[-1]
        else:
            test_ids = self.case_list
        print(f"Inferring on {len(test_ids)} test cases")
        self.test_dset = self.get_datasets(test_ids)
        self.loaders = self.get_loaders()
        self.model = self.get_model().to(get_device())
        self.load_weights()
        print(f"Device: {get_device()}")
Ejemplo n.º 4
0
    def compute(self) -> Any:
        """
        Compute precision, recall, f1 score and support.
        Compute micro, macro and weighted average for the metrics.

        Returns:
            list of aggregated metrics: per-class, micro, macro and weighted
                averaging of precision, recall, f1 score and support metrics
        """
        # ddp hotfix, could be done better
        # but metric must handle DDP on it's own
        if self._ddp_backend == "xla":
            device = get_device()
            self.statistics = {
                key: xm.all_gather(torch.tensor([stats], device=device))
                .sum(dim=0)
                .cpu()
                .numpy()
                for key, stats in self.statistics.items()
            }
        elif self._ddp_backend == "ddp":
            self.statistics = {
                key: np.sum(np.vstack(all_gather(stats)), axis=0)
                for key, stats in self.statistics.items()
            }

        per_class, micro, macro, weighted = get_aggregated_metrics(
            tp=self.statistics["tp"],
            fp=self.statistics["fp"],
            fn=self.statistics["fn"],
            support=self.statistics["support"],
            zero_division=self.zero_division,
        )
        return per_class, micro, macro, weighted
Ejemplo n.º 5
0
    def __init__(
            self,
            agent: Union[ActorSpec, CriticSpec],
            env: EnvironmentSpec,
            db_server: DBSpec = None,
            exploration_handler: ExplorationHandler = None,
            logdir: str = None,
            id: int = 0,
            mode: str = "infer",  # train/valid/infer
            deterministic: bool = None,
            weights_sync_period: int = 1,
            weights_sync_mode: str = None,
            sampler_seed: int = 42,
            trajectory_seeds: List = None,
            trajectory_limit: int = None,
            force_store: bool = False,
            gc_period: int = 10,
            monitoring_params: Dict = None,
            **kwargs):
        """
        Trajectory sampler worker.

        Args:
            agent: actor or critic used to produce actions
            env: environment specification to sample trajectories from
            db_server: optional database server for synchronization
            exploration_handler: optional exploration strategy handler
            logdir: directory used by the internal logger
            id: sampler identifier (note: parameter shadows the builtin)
            mode: one of "train"/"valid"/"infer"
            deterministic: if None, defaults to True for "valid"/"infer"
                modes and False otherwise
            weights_sync_period: period for weight synchronization
            weights_sync_mode: mode of weight synchronization
            sampler_seed: init seed for the internal ``Seeder``
            trajectory_seeds: optional fixed list of trajectory seeds
            trajectory_limit: maximum number of trajectories; None means
                effectively unlimited (int32 max)
            force_store: NOTE(review): presumably forces storing
                trajectories to the db — confirm against the run loop
            gc_period: NOTE(review): period for garbage collection;
                units assumed to be trajectories — confirm in sampling loop
            monitoring_params: optional params for external monitoring
            **kwargs: extra arguments forwarded to ``self._init``
        """
        self._device = utils.get_device()
        self._sampler_id = id

        # deterministic sampling by default for validation/inference modes
        self._deterministic = deterministic \
            if deterministic is not None \
            else mode in ["valid", "infer"]
        self.trajectory_seeds = trajectory_seeds
        self._seeder = tools.Seeder(init_seed=sampler_seed)

        # logging
        self._prepare_logger(logdir, mode)
        # shared flags for cross-process control of sampling/training
        self._sampling_flag = mp.Value(c_bool, False)
        self._training_flag = mp.Value(c_bool, True)

        # environment, model, exploration & action handlers
        self.env = env
        self.agent = agent
        self.exploration_handler = exploration_handler
        self.trajectory_index = 0
        self.trajectory_sampler = TrajectorySampler(
            env=self.env,
            agent=self.agent,
            device=self._device,
            deterministic=self._deterministic,
            sampling_flag=self._sampling_flag)

        # synchronization configuration
        self.db_server = db_server
        self._weights_sync_period = weights_sync_period
        self._weights_sync_mode = weights_sync_mode
        self._trajectory_limit = trajectory_limit or np.iinfo(np.int32).max
        self._force_store = force_store
        self._gc_period = gc_period
        self._db_loop_thread = None
        self.checkpoint = None

        #  special
        self.monitoring_params = monitoring_params
        self._init(**kwargs)
Ejemplo n.º 6
0
def main(args, unknown_args):
    """Assemble an RL training run from the parsed configs and start it."""
    args, config = parse_args_uargs(args, unknown_args)
    set_global_seed(args.seed)
    prepare_cudnn(args.deterministic, args.benchmark)

    if args.logdir is not None:
        os.makedirs(args.logdir, exist_ok=True)
        dump_environment(config, args.logdir, args.configs)

    if args.expdir is not None:
        # importing the experiment module registers user code
        module = import_module(expdir=args.expdir)  # noqa: F841
        if args.logdir is not None:
            dump_code(args.expdir, args.logdir)

    env = ENVIRONMENTS.get_from_params(**config["environment"])

    algorithm_name = config["algorithm"].pop("algorithm")
    if algorithm_name in OFFPOLICY_ALGORITHMS_NAMES:
        ALGORITHMS, trainer_fn, sync_epoch = (
            OFFPOLICY_ALGORITHMS,
            OffpolicyTrainer,
            False,
        )
    elif algorithm_name in ONPOLICY_ALGORITHMS_NAMES:
        ALGORITHMS, trainer_fn, sync_epoch = (
            ONPOLICY_ALGORITHMS,
            OnpolicyTrainer,
            True,
        )
    else:
        # @TODO: add registry for algorithms, trainers, samplers
        raise NotImplementedError()

    db_server = DATABASES.get_from_params(
        **config.get("db", {}), sync_epoch=sync_epoch
    )

    algorithm = ALGORITHMS.get(algorithm_name).prepare_for_trainer(
        env_spec=env, config=config
    )

    if args.resume is not None:
        # restore algorithm weights (optimizer state is not restored)
        checkpoint = utils.load_checkpoint(filepath=args.resume)
        checkpoint = utils.any2device(checkpoint, utils.get_device())
        algorithm.unpack_checkpoint(
            checkpoint=checkpoint,
            with_optimizer=False,
        )

    trainer = trainer_fn(
        algorithm=algorithm,
        env_spec=env,
        db_server=db_server,
        logdir=args.logdir,
        monitoring_params=config.get("monitoring_params", None),
        **config["trainer"],
    )

    trainer.run()
Ejemplo n.º 7
0
 def __init__(self, path: Union[str, Path], inputs: torch.Tensor):
     """
     Store tracing configuration for the callback.

     Args:
         path (Union[str, Path]): Path to traced model.
         inputs: Input samples.
     """
     super().__init__(CallbackOrder.external)
     self.device = get_device()
     self.inputs: torch.Tensor = inputs
     self.path: Path = Path(path)
Ejemplo n.º 8
0
    def __init__(
        self,
        agent: Union[ActorSpec, CriticSpec],
        env: EnvironmentSpec,
        db_server: DBSpec = None,
        exploration_handler: ExplorationHandler = None,
        logdir: str = None,
        id: int = 0,
        mode: str = "infer",  # train/valid/infer
        weights_sync_period: int = 1,
        weights_sync_mode: str = None,
        seeds: List = None,
        trajectory_limit: int = None,
        force_store: bool = False,
        gc_period: int = 10,
    ):
        """
        Trajectory sampler worker.

        Args:
            agent: actor or critic used to produce actions
            env: environment specification to sample trajectories from
            db_server: optional database server for synchronization
            exploration_handler: optional exploration strategy handler
            logdir: directory used by the internal logger
            id: sampler identifier; also offsets the seeder's init seed
                (note: parameter shadows the builtin)
            mode: one of "train"/"valid"/"infer"; deterministic sampling
                is enabled iff mode == "infer"
            weights_sync_period: period for weight synchronization
            weights_sync_mode: mode of weight synchronization
            seeds: optional fixed list of seeds; bounds the seeder's
                max seed when given
            trajectory_limit: maximum number of trajectories; None means
                effectively unlimited (int32 max)
            force_store: NOTE(review): presumably forces storing
                trajectories to the db — confirm against the run loop
            gc_period: NOTE(review): period for garbage collection;
                units assumed to be trajectories — confirm in sampling loop
        """
        self._device = utils.get_device()
        self._sampler_id = id

        self._infer = mode == "infer"
        self.seeds = seeds
        self._seeder = Seeder(
            init_seed=42 + id,
            max_seed=len(seeds) if seeds is not None else None)

        # logging
        self._prepare_logger(logdir, mode)
        # shared flag for cross-process control of sampling
        self._sample_flag = mp.Value(c_bool, False)

        # environment, model, exploration & action handlers
        self.env = env
        self.agent = agent
        self.exploration_handler = exploration_handler
        self.trajectory_index = 0
        self.trajectory_sampler = TrajectorySampler(
            env=self.env,
            agent=self.agent,
            device=self._device,
            deterministic=self._infer,
            sample_flag=self._sample_flag)

        # synchronization configuration
        self.db_server = db_server
        self._weights_sync_period = weights_sync_period
        self._weights_sync_mode = weights_sync_mode
        self._trajectory_limit = trajectory_limit or np.iinfo(np.int32).max
        self._force_store = force_store
        self._gc_period = gc_period
        self._db_loop_thread = None
Ejemplo n.º 9
0
    def compute(self):
        """
        Compute metrics with accumulated statistics.

        Returns:
            tuple of metrics: per_class, micro_metric, macro_metric,
                weighted_metric (None if ``self.weights`` is None)
        """
        per_class = []
        total_statistics = {}
        macro_metric = 0
        weighted_metric = 0
        # ddp hotfix, could be done better
        # but metric must handle DDP on it's own
        # TODO: optimise speed
        if self._ddp_backend == "xla":
            device = get_device()
            for _, statistics in self.statistics.items():
                for key in statistics:
                    gathered = torch.tensor([statistics[key]], device=device)
                    statistics[key] = xm.all_gather(gathered).sum(dim=0)
        elif self._ddp_backend == "ddp":
            for _, statistics in self.statistics.items():
                for key in statistics:
                    gathered: List[torch.Tensor] = all_gather(statistics[key])
                    statistics[key] = torch.sum(torch.vstack(gathered), dim=0)

        for class_idx, statistics in self.statistics.items():
            # per-class metric value produced by the configured metric_fn
            metric_value = self.metric_fn(**statistics)
            per_class.append(metric_value)
            macro_metric += metric_value
            if self.weights is not None:
                weighted_metric += metric_value * self.weights[class_idx]
            # accumulate the raw statistics for the micro-averaged metric;
            # distinct name (stat_value) so it cannot shadow metric_value
            for stats_name, stat_value in statistics.items():
                total_statistics[stats_name] = (
                    total_statistics.get(stats_name, 0) + stat_value)

        macro_metric /= len(self.statistics)
        micro_metric = self.metric_fn(**total_statistics)

        if self.weights is None:
            weighted_metric = None
        if self.compute_per_class_metrics:
            return per_class, micro_metric, macro_metric, weighted_metric
        else:
            return [], micro_metric, macro_metric, weighted_metric
Ejemplo n.º 10
0
 def get_criterion(self):
     """
     Fetches the criterion. (Only one loss.)

     Reads ``self.criterion_params``: the "loss" key names the loss class
     (resolved through this module's globals) and the sub-dict under that
     name holds its constructor kwargs. A list-valued "weight" kwarg is
     converted in place to a tensor on the current device.

     Returns:
         the instantiated loss object
     """
     loss_name = self.criterion_params["loss"]
     loss_kwargs = self.criterion_params[loss_name]
     # membership test directly on the dict; no need to materialize keys
     if "weight" in loss_kwargs:
         if isinstance(loss_kwargs["weight"], list):
             weight_tensor = torch.tensor(loss_kwargs["weight"])
             weight_tensor = any2device(weight_tensor, get_device())
             print(f"Converted the `weight` argument in {loss_name}",
                   f" to a {weight_tensor.type()}...")
             loss_kwargs["weight"] = weight_tensor
     loss_cls = globals()[loss_name]
     loss = loss_cls(**loss_kwargs)
     print(f"Criterion: {loss}")
     return loss
Ejemplo n.º 11
0
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script."""
    global IMG_SIZE

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    if args.traced_model is not None:
        # load a pre-traced TorchScript model directly onto the device
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
        model = model.eval()
        model, _, _, _, device = utils.process_components(model=model)

    # one dict record per csv row
    records = pd.read_csv(args.in_csv)
    records = records.reset_index().drop("index", axis=1)
    records = list(records.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", rootpath=args.rootpath
    )

    dataloader = utils.get_loader(
        records,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    if args.verbose:
        dataloader = tqdm(dataloader)
    with torch.no_grad():
        features = [
            model(batch["image"].to(device)).cpu().detach().numpy()
            for batch in dataloader
        ]

    np.save(args.out_npy, np.concatenate(features, axis=0))
Ejemplo n.º 12
0
def main(args):
    """Train a ResNet-18 classifier on the ImageNetK dataset with catalyst."""
    logdir = "./logdir"
    num_epochs = 42

    # detect gpu
    device = utils.get_device()
    # removed stray `utils.fp` expression: a bare attribute access with no
    # effect that raises AttributeError if `fp` does not exist on utils
    print(f"device: {device}")

    # dataset
    trainset = ImageNetK(
        '/run/media/mooziisp/仓库/datasets/Kaggle-ILSVRC/ILSVRC',
        split='train',
        transform=transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.RandomVerticalFlip(),
            transforms.ToTensor()
        ]))
    trainloader = torch.utils.data.DataLoader(trainset,
                                              batch_size=64,
                                              shuffle=True,
                                              num_workers=2,
                                              pin_memory=True)

    loaders = {"train": trainloader}

    # define net
    net = models.resnet18(pretrained=False, num_classes=1000)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=1e-4)

    # trainer
    runner = SupervisedRunner(device=device)
    runner.train(model=net,
                 criterion=criterion,
                 optimizer=optimizer,
                 loaders=loaders,
                 logdir=logdir,
                 callbacks=[AccuracyCallback(num_classes=1000)],
                 num_epochs=num_epochs,
                 verbose=True)
Ejemplo n.º 13
0
def setup_runtime(cfg_env: DictConfig):
    """
    Setup runtime environment.

    Runtime options:
        ["cuda", "cuda:0", "cuda:<index>", "cpu", ""]

    Args:
        cfg_env (DictConfig): configuration providing ``runtime`` and
            ``seed``

    Returns:
        the torch device selected by ``get_device()`` (the original
        docstring incorrectly said None)
    """
    runtime: str = cfg_env.runtime

    # split "cuda:<index>" into name + device spec; bare "cuda"/"cpu"/""
    # keep an empty device spec
    runtime_name, runtime_devices = \
        runtime.split(":") if ":" in runtime else [runtime, ""]

    if runtime_name == "cuda" and runtime_devices:
        os.environ["CUDA_VISIBLE_DEVICES"] = f"{runtime_devices}"
        logger.info(f"[Environment] Configuration: CUDA_VISIBLE_DEVICES="
                    f"{os.environ['CUDA_VISIBLE_DEVICES']}")
    elif runtime_name == "cpu":
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        logger.info(f"[Environment] Configuration: CUDA_VISIBLE_DEVICES="
                    f"{os.environ['CUDA_VISIBLE_DEVICES']}")

    # imported after CUDA_VISIBLE_DEVICES is set so torch/catalyst see
    # the intended device visibility
    from catalyst.utils import set_global_seed, prepare_cudnn, get_device

    seed: int = cfg_env.seed

    set_global_seed(seed)
    logger.info(f"[Environment] Configuration. Seed: {seed}")

    prepare_cudnn(deterministic=True, benchmark=False)
    logger.info("[Environment] Configuration. CUDNN: "
                "deterministic=True, benchmark=False")

    device = get_device()

    logger.info(f"[Environment] Runtime: {device}")

    return device
Ejemplo n.º 14
0
# This function removes weight_decay for biases and applies our layerwise_params
model_params = utils.process_model_params(model,
                                          layerwise_params=layerwise_params)

# Catalyst has new SOTA optimizers out of box
base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
optimizer = Lookahead(base_optimizer)

# shrink LR by 4x after 2 epochs without improvement
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                 factor=0.25,
                                                 patience=2)

num_epochs = 3
logdir = "./logs/segmentation"

device = utils.get_device()
print(f"device: {device}")

# by default SupervisedRunner uses "features" and "targets",
# in our case we get "image" and "mask" keys in dataset __getitem__
runner = SupervisedRunner(device=device,
                          input_key="image",
                          input_target_key="mask")

# NOTE(review): seeding happens after model/optimizer construction here —
# confirm this ordering is intended for reproducibility
SEED = config.SEED
utils.set_global_seed(SEED)
utils.prepare_cudnn(deterministic=True)

runner.train(
    model=model,
    criterion=criterion,
Ejemplo n.º 15
0
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Wraps the model for distributed (apex or native DDP) or DataParallel
    execution depending on rank and GPU count, and moves it to ``device``.

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 methond
        device (Device, optional): device

    Returns:
        tuple of (model, criterion, optimizer, scheduler, device)
    """
    distributed_params = distributed_params or {}
    # deepcopy so pops below do not mutate the caller's dict
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())
    if device is None:
        device = utils.get_device()

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    if utils.is_wrapped_with_ddp(model):
        # already distributed-wrapped: nothing to do
        pass
    elif get_rank() >= 0:
        # we are inside a distributed run: pin to this process's GPU
        assert isinstance(model, nn.Module)
        local_rank = distributed_params.pop("local_rank", 0)
        device = f"cuda:{local_rank}"
        model = utils.maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)
        use_apex = distributed_params.pop("apex", True) and is_apex_available()

        if use_apex:
            import apex
            # start from apex defaults, then overlay user-provided values
            amp_params = get_default_params(apex.amp.initialize,
                                            ["models", "optimizers"])
            amp_params["opt_level"] = "O0"
            for dp in distributed_params:
                if dp in amp_params:
                    amp_params[dp] = distributed_params[dp]

            # apex returns (model, optimizer) only when an optimizer is given
            amp_result = apex.amp.initialize(model, optimizer, **amp_params)
            if optimizer is not None:
                model, optimizer = amp_result
            else:
                model = amp_result

            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
    elif torch.cuda.device_count() > 1:
        # single-node multi-GPU: plain DataParallel (also for model dicts)
        if isinstance(model, nn.Module):
            model = torch.nn.DataParallel(model)
        elif isinstance(model, dict):
            model = {k: torch.nn.DataParallel(v) for k, v in model.items()}

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
def run_ml_pipeline(sampler_inbatch: data.IInbatchTripletSampler) -> float:
    """
    Full metric learning pipeline, including train and val.

    This function is also used as minimal example in README.md, section name:
    'CV - MNIST with Metric Learning'.

    Args:
        sampler_inbatch: sampler to forming triplets

    Returns:
        best metric value
    """
    # 1. train and valid datasets
    dataset_root = "./data"
    mnist_transforms = t.Compose(
        [t.ToTensor(), t.Normalize((0.1307, ), (0.3081, ))]
    )

    dataset_train = datasets.MnistMLDataset(
        root=dataset_root,
        train=True,
        download=True,
        transform=mnist_transforms,
    )
    batch_sampler = data.BalanceBatchSampler(
        labels=dataset_train.get_labels(), p=5, k=10
    )
    train_loader = DataLoader(
        dataset=dataset_train,
        sampler=batch_sampler,
        batch_size=batch_sampler.batch_size,
    )

    dataset_val = datasets.MnistQGDataset(
        root=dataset_root, transform=mnist_transforms, gallery_fraq=0.2
    )
    val_loader = DataLoader(dataset=dataset_val, batch_size=1024)

    # 2. model and optimizer
    model = models.SimpleConv(features_dim=16)
    optimizer = Adam(model.parameters(), lr=0.0005)

    # 3. criterion with triplets sampling
    criterion = nn.TripletMarginLossWithSampler(
        margin=0.5, sampler_inbatch=sampler_inbatch
    )

    # 4. training with catalyst Runner
    callbacks = [
        dl.ControlFlowCallback(dl.CriterionCallback(), loaders="train"),
        dl.ControlFlowCallback(
            dl.CMCScoreCallback(topk_args=[1]), loaders="valid"
        ),
        dl.PeriodicLoaderCallback(valid=100),
    ]

    runner = dl.SupervisedRunner(device=utils.get_device())
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        callbacks=callbacks,
        loaders={
            "train": train_loader,
            "valid": val_loader
        },
        minimize_metric=False,
        verbose=True,
        valid_loader="valid",
        num_epochs=100,
        main_metric="cmc01",
    )
    return runner.best_valid_metrics["cmc01"]
Ejemplo n.º 17
0
def main():
    """Cross-validated training of Bengali grapheme classifiers with catalyst."""
    # set your params
    DATA_PATH = '/content/drive/My Drive/kaggle/bengaliai-cv19/dataset'
    # MODEL_PATH = '/content/drive/My Drive/kaggle/bengaliai-cv19/model/se_resnext50_32x4d-a260b3a4.pth'
    # MODEL_PATH='/content/drive/My Drive/kaggle/bengaliai-cv19/model/efficientnet-b3-5fb5a3c3.pth'
    BASE_LOGDIR = '/content/drive/My Drive/kaggle/bengaliai-cv19/logs'
    NUM_FOLDS = 5
    BATCH_SIZE = 64
    EPOCHS = 20
    SEED = 1234
    SIZE = 224
    LR = 0.003
    HOLD_OUT = False

    # fix seed
    set_global_seed(SEED)

    # read dataset
    train, _, _ = read_data(DATA_PATH)
    train_all_images = prepare_image(DATA_PATH,
                                     data_type='train',
                                     submission=False)

    # init
    target_col = ['grapheme_root', 'consonant_diacritic', 'vowel_diacritic']
    device = get_device()
    # light augmentation for training only; validation gets no transforms
    train_data_transforms = albu.Compose([
        albu.ShiftScaleRotate(rotate_limit=10, scale_limit=.1),
        albu.Cutout(p=0.5),
    ])
    test_data_transforms = None

    # cross validation (multilabel-stratified over the three targets)
    kf = MultilabelStratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED)
    ids = kf.split(X=train_all_images, y=train[target_col].values)
    # fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print("Current Fold: ", fold + 1)
        logdir = os.path.join(BASE_LOGDIR, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        train_df, valid_df = train.iloc[train_idx], train.iloc[valid_idx]
        print("Train and Valid Shapes are", train_df.shape, valid_df.shape)

        print("Preparing train datasets....")
        train_dataset = BengaliAIDataset(images=train_all_images[train_idx],
                                         labels=train_df[target_col].values,
                                         size=SIZE,
                                         transforms=train_data_transforms)

        print("Preparing valid datasets....")
        valid_dataset = BengaliAIDataset(images=train_all_images[valid_idx],
                                         labels=valid_df[target_col].values,
                                         size=SIZE,
                                         transforms=test_data_transforms)

        print("Preparing dataloaders datasets....")
        train_loader = DataLoader(train_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False)
        loaders = {'train': train_loader, 'valid': valid_loader}

        # release memory
        del train_df, valid_df, train_dataset, valid_dataset
        gc.collect()
        torch.cuda.empty_cache()

        # init models
        resnet34 = pretrainedmodels.__dict__["resnet34"](pretrained="imagenet")
        model = BengaliBaselineClassifier(pretrainedmodels=resnet34, hdim=512)
        # model = BengaliBaselineClassifier(pretrainedmodels=se_resnext50_32x4d(model_path=MODEL_PATH))
        # model = CustomEfficientNet.from_pretrained('efficientnet-b3', MODEL_PATH)
        model = model.to(device)
        criterions = {'train': BaselineLoss(), 'valid': BaselineLoss()}
        optimizer = AdamW(model.parameters(), lr=LR)
        scheduler = OneCycleLRWithWarmup(optimizer,
                                         num_steps=EPOCHS,
                                         lr_range=(0.001, 0.0001),
                                         warmup_steps=1)

        # catalyst trainer
        runner = BengaliRunner(device=device)
        # model training
        runner.train(model=model,
                     criterions=criterions,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     loaders=loaders,
                     logdir=logdir,
                     num_epochs=EPOCHS,
                     score_func=macro_recall)

        # release memory
        del model, runner, train_loader, valid_loader, loaders
        gc.collect()
        torch.cuda.empty_cache()

        # HOLD_OUT=True trains only the first fold (hold-out validation)
        if HOLD_OUT is True:
            break

    return True
Ejemplo n.º 18
0
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device.

    Wraps the model for distributed (apex) or DataParallel execution
    depending on ``distributed_params`` and GPU count, and moves it to
    ``device``.

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 methond
        device (Device, optional): device

    Returns:
        tuple of (model, criterion, optimizer, scheduler, device)
    """
    distributed_params = distributed_params or {}
    # deepcopy so pops below do not mutate the caller's dict
    distributed_params = copy.deepcopy(distributed_params)
    if device is None:
        device = utils.get_device()

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    if utils.is_wrapped_with_ddp(model):
        # already distributed-wrapped: nothing to do
        pass
    elif len(distributed_params) > 0:
        assert isinstance(model, nn.Module)
        # rank == -1 means "no distributed process group"
        distributed_rank = distributed_params.pop("rank", -1)
        syncbn = distributed_params.pop("syncbn", False)

        if distributed_rank > -1:
            torch.cuda.set_device(distributed_rank)
            torch.distributed.init_process_group(backend="nccl",
                                                 init_method="env://")

        if "opt_level" in distributed_params:
            # remaining distributed_params are forwarded to apex amp
            utils.assert_fp16_available()
            from apex import amp

            # apex returns (model, optimizer) only when an optimizer is given
            amp_result = amp.initialize(model, optimizer, **distributed_params)
            if optimizer is not None:
                model, optimizer = amp_result
            else:
                model = amp_result

            if distributed_rank > -1:
                from apex.parallel import DistributedDataParallel
                model = DistributedDataParallel(model)

                if syncbn:
                    from apex.parallel import convert_syncbn_model
                    model = convert_syncbn_model(model)

        if distributed_rank <= -1 and torch.cuda.device_count() > 1:
            # non-distributed multi-GPU fallback
            model = torch.nn.DataParallel(model)
    elif torch.cuda.device_count() > 1:
        # single-node multi-GPU: plain DataParallel (also for model dicts)
        if isinstance(model, nn.Module):
            model = torch.nn.DataParallel(model)
        elif isinstance(model, dict):
            model = {k: torch.nn.DataParallel(v) for k, v in model.items()}

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
Ejemplo n.º 19
0
def main():
    """Train a U-Net (EfficientNet encoder) segmentation model with Catalyst.

    Pipeline: parse dataset paths from the CLI, load hyper-parameters from
    ``config/igvc.yaml``, train with cross-entropy loss + multiclass Dice
    metric, then run inference on the held-out test set and save the result.
    """
    # Enable argument parsing for file paths
    args = vars(get_args())

    # Paths to the .npy arrays holding train/test images and masks.
    train_images_path = args["train_images"]
    train_masks_path = args["train_masks"]
    test_images_path = args["test_images"]
    test_masks_path = args["test_masks"]

    # print out yaml file configuration
    # Config lives next to this script; all hyper-parameters come from it.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(dir_path, "config/igvc.yaml")
    ARCH = yaml.safe_load(open(yaml_path, "r"))

    # Set a seed for reproducibility
    utils.set_global_seed(ARCH["train"]["seed"])
    utils.prepare_cudnn(deterministic=ARCH["train"]["cudnn"])

    # Set up U-Net with pretrained EfficientNet backbone
    model = smp.Unet(
        encoder_name=ARCH["encoder"]["name"],
        encoder_weights=ARCH["encoder"]["weight"],
        classes=ARCH["train"]["classes"],
        activation=ARCH["encoder"]["activation"],
    )

    # Get Torch loaders
    loaders = get_loaders(
        images=np.load(train_images_path),
        masks=np.load(train_masks_path),
        image_arr_path=train_images_path,
        mask_arr_path=train_masks_path,
        random_state=ARCH["train"]["random_state"],
        valid_size=ARCH["train"]["valid_size"],
        batch_size=ARCH["train"]["batch_size"],
        num_workers=ARCH["train"]["num_workers"],
    )

    # Optimize for cross entropy using Adam
    criterion = {
        "CE": CrossentropyND(),
    }

    optimizer = AdamW(
        model.parameters(),
        lr=ARCH["train"]["lr"],
        betas=(ARCH["train"]["betas_min"], ARCH["train"]["betas_max"]),
        eps=float(ARCH["train"]["eps"]),
        weight_decay=ARCH["train"]["w_decay"],
        amsgrad=ARCH["train"]["amsgrad"],
    )

    # Drop the LR when the monitored metric plateaus.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=ARCH["train"]["optim_factor"],
        patience=ARCH["train"]["optim_patience"],
    )

    device = utils.get_device()
    print("Using device: {}".format(device))
    print(f"torch: {torch.__version__}, catalyst: {catalyst.__version__}")

    # Batches are dicts: the runner feeds "image" to the model and compares
    # outputs against "mask".
    runner = SupervisedRunner(device=device,
                              input_key="image",
                              input_target_key="mask")

    # Use Catalyst callbacks for metric calculations during training
    callbacks = [
        CriterionCallback(input_key="mask", prefix="loss", criterion_key="CE"),
        MulticlassDiceMetricCallback(input_key="mask"),
    ]

    # Train and print model training logs
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=ARCH["train"]["logdir"],
        num_epochs=ARCH["train"]["epochs"],
        main_metric="loss",
        minimize_metric=ARCH["train"]["minimize_metric"],
        fp16=ARCH["train"]["fp16"],
        verbose=ARCH["train"]["verbose"],
    )

    # Test model on test dataset
    test_data = SegmentationDataset(test_images_path, test_masks_path)
    infer_loader = DataLoader(
        test_data,
        batch_size=ARCH["test"]["batch_size"],
        shuffle=ARCH["test"]["shuffle"],
        num_workers=ARCH["test"]["num_workers"],
    )

    # Get model predictions on test dataset
    # NOTE(review): the resume path is hard-coded and outside the configured
    # logdir — confirm this is the intended checkpoint.
    predictions = np.vstack(
        list(
            map(
                lambda x: x["logits"].cpu().numpy(),
                runner.predict_loader(
                    loader=infer_loader,
                    resume=f"content/full_model2/checkpoints/best.pth",
                ),
            )))

    save_result(predictions, test_data)
Ejemplo n.º 20
0
def smart_way():
    """Fine-tune a regression head on top of a pre-trained segmentation net.

    Loads the best PSPNet checkpoint, wraps it in
    ``RegressionFromSegmentation``, trains with RAdam + Lookahead and an
    LR-on-plateau schedule, then saves the best model and attempts tracing.
    """
    cli = parse_arguments()
    seed = cli.seed
    dataset_root = Path(cli.dataset)

    img_paths, targets = retrieve_dataset(dataset_root)

    # Heavy augmentation for training, deterministic preprocessing for valid.
    train_tfms = compose(
        [resize_transforms(),
         hard_transforms(),
         post_transforms()])
    valid_tfms = compose([pre_transforms(), post_transforms()])
    loaders = get_loaders(
        img_paths=img_paths,
        targets=targets,
        random_state=seed,
        batch_size=8,
        train_transforms_fn=train_tfms,
        valid_transforms_fn=valid_tfms,
    )

    logdir = './table_recognition/nn/regression/logs6/'

    # Start from the best segmentation checkpoint and attach the regression head.
    backbone = torch.load(
        f'./table_recognition/nn/segmentation/logs/resnet18_PSPNet/save/best_model.pth'
    )
    model: RegressionFromSegmentation = RegressionFromSegmentation(backbone)
    model.to(utils.get_device())

    base_lr = 0.001
    encoder_lr = 0.0005

    # Encoder layers train with a smaller LR and their own weight decay.
    per_layer = {
        "encoder*": dict(lr=encoder_lr, weight_decay=0.00003)
    }
    grouped_params = utils.process_model_params(
        model, layerwise_params=per_layer)
    inner_optimizer = RAdam(grouped_params, lr=base_lr, weight_decay=0.0003)
    optimizer = Lookahead(inner_optimizer)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)

    device = utils.get_device()
    runner = CustomRunner2(device=device)

    runner.train(model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 num_epochs=1000,
                 verbose=True,
                 load_best_on_end=True,
                 main_metric='loss')

    # Persist the best model (selected by valid loss).
    save_dir = os.path.join(logdir, 'save')
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model, os.path.join(save_dir, 'best_model.pth'))

    # Tracing is best-effort: not every model can be traced.
    sample_batch = next(iter(loaders["valid"]))
    try:
        runner.trace(model=model, batch=sample_batch, logdir=logdir,
                     fp16=False)
    except Exception:
        pass
Ejemplo n.º 21
0
def main():
    """Run GroupKFold cross-validated BERT fine-tuning for Google QUEST.

    For every fold: build train/valid dataloaders, train a
    ``CustomBertForSequenceClassification`` with Catalyst, restore the best
    checkpoint and record the Spearman-correlation CV score.

    Returns:
        bool: ``True`` on completion (per-fold scores are printed).
    """
    # hyper param
    # TODO: set your params
    num_folds = 5
    seed = 1234
    base_dataset_path = '/content/drive/My Drive/kaggle/google-quest-challenge/dataset'
    batch_size = 4
    num_epochs = 4
    bert_model = 'bert-base-uncased'
    base_logdir = '/kaggle/google_quest/bert'

    # fix seed
    set_global_seed(seed)
    device = get_device()

    # set up logdir (one timestamped directory per run)
    now = datetime.now()
    base_logdir = os.path.join(base_logdir, now.strftime("%Y%m%d%H%M%S"))
    os.makedirs(base_logdir, exist_ok=True)
    # dump this script for reproducibility.
    # BUGFIX: shutil.copyfile() requires a *file* destination; passing the
    # directory itself raises IsADirectoryError — join the script name.
    my_file_path = os.path.abspath(__file__)
    shutil.copyfile(
        my_file_path,
        os.path.join(base_logdir, os.path.basename(my_file_path)))

    # load dataset
    # TODO: set your dataset
    train, test, sample_submission = read_data(base_dataset_path)
    input_cols = list(train.columns[[1, 2, 5]])
    target_cols = list(train.columns[11:])
    num_labels = len(target_cols)

    # init Bert tokenizer matching the pretrained weights
    tokenizer = BertTokenizer.from_pretrained(bert_model)

    # execute CV
    # TODO: set your CV method
    # Grouping by question body keeps duplicated questions in one fold.
    kf = GroupKFold(n_splits=num_folds)
    ids = kf.split(train['question_body'], groups=train['question_body'])
    fold_scores = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print("Current Fold: ", fold + 1)
        logdir = os.path.join(base_logdir, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        # create dataloader
        train_df, val_df = train.iloc[train_idx], train.iloc[valid_idx]
        print("Train and Valid Shapes are", train_df.shape, val_df.shape)

        print("Preparing train datasets....")
        inputs_train = compute_input_arrays(train_df,
                                            input_cols,
                                            tokenizer,
                                            max_sequence_length=512)
        outputs_train = compute_output_arrays(train_df, columns=target_cols)
        # Sequence length = index of the first padding token (id 0);
        # rows with no padding get the full sequence width.
        lengths_train = np.argmax(inputs_train[0] == 0, axis=1)
        lengths_train[lengths_train == 0] = inputs_train[0].shape[1]

        print("Preparing valid datasets....")
        inputs_valid = compute_input_arrays(val_df,
                                            input_cols,
                                            tokenizer,
                                            max_sequence_length=512)
        outputs_valid = compute_output_arrays(val_df, columns=target_cols)
        lengths_valid = np.argmax(inputs_valid[0] == 0, axis=1)
        lengths_valid[lengths_valid == 0] = inputs_valid[0].shape[1]

        print("Preparing dataloaders datasets....")
        train_set = QuestDataset(inputs=inputs_train,
                                 lengths=lengths_train,
                                 labels=outputs_train)
        train_loader = DataLoader(train_set,
                                  batch_size=batch_size,
                                  shuffle=True)
        valid_set = QuestDataset(inputs=inputs_valid,
                                 lengths=lengths_valid,
                                 labels=outputs_valid)
        valid_loader = DataLoader(valid_set,
                                  batch_size=batch_size,
                                  shuffle=False)

        # init models
        model = CustomBertForSequenceClassification.from_pretrained(
            bert_model, num_labels=num_labels, output_hidden_states=True)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        # NOTE(review): num_warmup_steps is normally an integer step count;
        # 0.05 looks like an intended warmup *ratio* — confirm against the
        # scheduler implementation in use.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0.05,
            num_training_steps=num_epochs * len(train_loader))

        # model training
        runner = BertRunner(device=device)
        loaders = {'train': train_loader, 'valid': valid_loader}
        print("Model Training....")
        runner.train(model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     loaders=loaders,
                     logdir=logdir,
                     num_epochs=num_epochs,
                     score_func=mean_spearmanr_correlation_score)

        # calc valid score from the best checkpoint
        best_model_path = os.path.join(logdir, 'best_model.pth')
        val_preds = runner.predict_loader(model,
                                          loaders['valid'],
                                          resume=best_model_path)
        val_truth = train[target_cols].iloc[valid_idx].values
        # TODO: set your score function
        cv_score = mean_spearmanr_correlation_score(val_truth, val_preds)
        print('Fold {} CV score : {}'.format(fold + 1, cv_score))
        fold_scores.append(cv_score)

    return True
Ejemplo n.º 22
0
def run(config_file, device_id, idx_fold):
    """Train a CenterNet-FPN model for one CV fold on the given GPU.

    Args:
        config_file: path to the yaml config to load.
        device_id: GPU index, exported via CUDA_VISIBLE_DEVICES.
        idx_fold: fold index for this run.

    Raises:
        Exception: when the fold bookkeeping in the config is inconsistent
            (see branch comments below).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    print('info: use gpu No.{}'.format(device_id))

    config = load_config(config_file)

    # for n-folds loop
    if config.data.params.idx_fold == -1:
        # Fold not fixed in the config: take it from the argument and
        # suffix the work dir with the fold number.
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir + '_fold{}'.format(idx_fold)
    elif config.data.params.idx_fold == 0:
        # Work dir already carries a fold suffix; refuse to silently
        # retrain the same fold.
        original_fold = int(config.work_dir.split('_fold')[1])
        if original_fold == idx_fold:
            raise Exception(
                'if you specify fold 0, you should use train.py or resume from fold 1.'
            )
        config.data.params.idx_fold = idx_fold
        config.work_dir = config.work_dir.split('_fold')[0] + '_fold{}'.format(
            idx_fold)
    else:
        raise Exception('you should use train.py if idx_fold is specified.')
    print('info: training for fold {}'.format(idx_fold))

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)

    # Train/valid augmentation pipelines built from the config.
    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    # One dataloader per phase; both read the training dataframe and are
    # split by idx_fold inside make_loader.
    dataloaders = {
        phase: make_loader(
            df_path=config.data.train_df_path,
            data_dir=config.data.train_dir,
            features=config.data.features,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            horizontal_flip=config.train.horizontal_flip,
            model_scale=config.data.model_scale,
            debug=config.debug,
            pseudo_path=config.data.pseudo_path,
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    num_features = len(config.data.features)
    print('info: num_features =', num_features)
    model = CenterNetFPN(
        slug=config.model.encoder,
        num_classes=num_features,
    )

    optimizer = get_optimizer(model, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    # train setting
    criterion, callbacks = get_criterion_and_callback(config)

    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # Gradient accumulation: step the optimizer every accumulation_steps batches.
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [OptimizerCallback(accumulation_steps=accumulation_steps)])

    # to resume from check points if exists
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=config.train.fp16,
    )
Ejemplo n.º 23
0
def process_components(
    model: Model,
    criterion: Criterion = None,
    optimizer: Optimizer = None,
    scheduler: Scheduler = None,
    distributed_params: Dict = None,
    device: Device = None,
) -> Tuple[Model, Criterion, Optimizer, Scheduler, Device]:
    """
    Returns the processed model, criterion, optimizer, scheduler and device

    Args:
        model (Model): torch model
        criterion (Criterion): criterion function
        optimizer (Optimizer): optimizer
        scheduler (Scheduler): scheduler
        distributed_params (dict, optional): dict with the parameters
            for distributed and FP16 method
        device (Device, optional): device
    """
    distributed_params = distributed_params or {}
    # Work on a copy: env-derived settings are merged in and keys are popped.
    distributed_params = copy.deepcopy(distributed_params)
    distributed_params.update(get_distributed_params())
    if device is None:
        device = utils.get_device()

    # apex is opt-out: used by default whenever it is importable
    use_apex = distributed_params.pop("apex", True) and is_apex_available()

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    if utils.is_wrapped_with_ddp(model):
        pass
    # distributed data parallel run (ddp) (with apex support)
    elif get_rank() >= 0:
        # BUGFIX: error message previously misspelled "dixtributed".
        assert isinstance(model, nn.Module), \
            "No support for distributed KV model yet"

        local_rank = distributed_params.pop("local_rank", 0)
        device = f"cuda:{local_rank}"
        model = utils.maybe_recursive_call(model, "to", device=device)

        syncbn = distributed_params.pop("syncbn", False)

        if use_apex:
            import apex
            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)
            model = apex.parallel.DistributedDataParallel(model)

            if syncbn:
                model = apex.parallel.convert_syncbn_model(model)
        else:
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[local_rank], output_device=local_rank)
    # data parallel run (dp) (with apex support)
    else:
        # apex issue https://github.com/deepset-ai/FARM/issues/210
        # apex + DataParallel is only safe on a single GPU, or on multiple
        # GPUs with opt_level O1.
        # NOTE(review): the multi-GPU clause does not test `use_apex` —
        # with apex disabled but opt_level=="O1" it still calls
        # initialize_apex; confirm this is intended.
        can_use_apex = \
            (use_apex and torch.cuda.device_count() == 1) \
            or (
                    torch.cuda.device_count() > 1
                    and distributed_params.get("opt_level", "O0") == "O1"
            )

        if can_use_apex:
            assert isinstance(model, nn.Module), \
                "No support for apex KV model yet"

            model, optimizer = initialize_apex(model, optimizer,
                                               **distributed_params)

        if torch.cuda.device_count() > 1:
            if isinstance(model, nn.Module):
                model = nn.DataParallel(model)
            elif isinstance(model, dict):
                model = {k: nn.DataParallel(v) for k, v in model.items()}

    model: Model = utils.maybe_recursive_call(model, "to", device=device)

    return model, criterion, optimizer, scheduler, device
Ejemplo n.º 24
0
def simple_way():
    """Train a ResNet18-based table-corner regressor from ImageNet weights.

    Builds augmented train/valid loaders, optimizes only the final FC head
    with Adam + LR-on-plateau, then saves the best model and attempts tracing.
    """
    cli = parse_arguments()
    seed = cli.seed
    dataset_root = Path(cli.dataset)

    img_paths, targets = retrieve_dataset(dataset_root)

    # Heavy augmentation for training, deterministic preprocessing for valid.
    train_tfms = compose(
        [resize_transforms(),
         hard_transforms(),
         post_transforms()])
    valid_tfms = compose([pre_transforms(), post_transforms()])
    loaders = get_loaders(
        img_paths=img_paths,
        targets=targets,
        random_state=seed,
        batch_size=8,
        train_transforms_fn=train_tfms,
        valid_transforms_fn=valid_tfms,
    )

    logdir = './table_recognition/nn/regression/logs5/'

    # ImageNet-pretrained ResNet18 wrapped by the project regression head.
    model = Net(models.resnet18(pretrained=True))

    # Only the classifier head is optimized here.
    head_lr = 0.001
    optimizer = optim.Adam(model.fc.parameters(),
                           lr=head_lr,
                           weight_decay=0.00003)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     factor=0.25,
                                                     patience=2)

    device = utils.get_device()
    runner = CustomRunner(device=device)

    runner.train(model=model,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 logdir=logdir,
                 num_epochs=50,
                 verbose=True,
                 load_best_on_end=True,
                 main_metric='loss')

    # Persist the best model (selected by valid loss).
    save_dir = os.path.join(logdir, 'save')
    os.makedirs(save_dir, exist_ok=True)
    torch.save(model, os.path.join(save_dir, 'best_model.pth'))

    # Tracing is best-effort: not every model can be traced.
    sample_batch = next(iter(loaders["valid"]))
    try:
        runner.trace(model=model, batch=sample_batch, logdir=logdir,
                     fp16=False)
    except Exception:
        pass
def main(train, test, features, target):
    """K-fold train/predict template for a simple NN on tabular data.

    Args:
        train: training dataframe; overwritten by the loaded/preprocessed
            data inside the function.
        test: test dataframe; same caveat as ``train``.
        features: unused here; feature columns come from the yaml params.
        target: unused here; target columns come from the yaml params.

    Returns:
        bool: ``True`` on completion; also writes ``submission.csv``.
    """
    # get args
    args = parse_arguments()
    params = yaml_to_json(args.yaml_path)

    # hyper param
    num_folds = params.fold
    seed = params.seed
    base_path = params.base_path
    target_cols = params.target
    features_cols = params.features
    preprocessed_data_path = params.preprocessed_data
    batch_size = params.batch_size
    num_epochs = params.epochs
    # ex) '/hoge/logs'
    base_logdir = params.base_logdir

    # fix seed
    set_global_seed(seed)
    device = get_device()

    # set up logdir
    now = datetime.now()
    # BUGFIX: the timestamp was string-concatenated onto base_logdir inside a
    # single-argument os.path.join; join it as a separate path component.
    base_logdir = os.path.join(base_logdir, now.strftime("%Y%m%d%H%M%S"))
    os.makedirs(base_logdir, exist_ok=True)
    # dump yaml contents
    # NOTE(review): json.dump requires `params` to be JSON-serializable;
    # if yaml_to_json returns a namespace this needs vars(params) — confirm.
    with open(os.path.join(base_logdir, 'params.json'), mode="w") as f:
        json.dump(params, f, indent=4)
    # dump this script for reproducibility.
    # BUGFIX: shutil.copyfile needs a file destination, not a directory.
    my_file_path = os.path.abspath(__file__)
    shutil.copyfile(
        my_file_path,
        os.path.join(base_logdir, os.path.basename(my_file_path)))

    # load dataset
    if preprocessed_data_path == '':
        train, test, sample_submission = read_data(base_path)  # noqa
        # TODO: You should implement these function!!
        train, test = preprocess(train, test)  # noqa
        train, test = build_feature(train, test)  # noqa
    else:
        train = pd.read_csv(preprocessed_data_path + 'train.csv')
        test = pd.read_csv(preprocessed_data_path + 'test.csv')
        sample_submission = pd.read_csv(preprocessed_data_path +
                                        'sample_submission.csv')

    # execute CV
    # TODO: set your CV method
    # BUGFIX: sklearn requires shuffle=True when random_state is given;
    # without it KFold raises (and the seed would have no effect anyway).
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    ids = kf.split(train)
    fold_scores = []
    test_preds = []
    for fold, (train_idx, valid_idx) in enumerate(ids):
        print('Fold {}'.format(fold + 1))

        # BUGFIX: fold dir was string-concatenated; join it properly.
        logdir = os.path.join(base_logdir, 'fold_{}'.format(fold + 1))
        os.makedirs(logdir, exist_ok=True)

        # data
        X_train = train[features_cols]
        # target-variable normalization (if any) is left to preprocess()
        Y_train = train[target_cols]
        # BUGFIX: test features must come from `test`, not `train` — the
        # original predicted the "test" set on training rows.
        X_test = test[features_cols]

        # create dataloaders
        train_dls, test_dl = create_data_loader(
            X_train.iloc[train_idx].to_numpy(),
            Y_train.iloc[train_idx].to_numpy(),
            X_train.iloc[valid_idx].to_numpy(),
            Y_train.iloc[valid_idx].to_numpy(),
            X_test.to_numpy(),
            batch_size=batch_size)

        # init models
        # TODO: set your model and learning condition
        # (a factory keyed by name would make this more reusable)
        model = SampleNN(input_dim=1000, out_dim=1)
        criterion = nn.BCELoss()
        optimizer = torch.optim.AdamW(model.parameters())
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

        # init catalyst runner
        runner = SupervisedRunner(device=device)
        # model training
        runner.train(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            loaders=train_dls,
            logdir=logdir,
            num_epochs=num_epochs,
            callbacks=[EarlyStoppingCallback(patience=15, min_delta=0)],
            verbose=False)

        # calculate valid score with the best checkpoint restored
        best_model_path = logdir + '/checkpoints/best.pth'
        val_preds = runner.predict_loader(model,
                                          train_dls['valid'],
                                          resume=best_model_path,
                                          verbose=False)
        val_truth = Y_train.iloc[valid_idx].values
        # TODO: set your score function
        cv_score = mean_spearmanr_correlation_score(val_truth, val_preds)
        print('Fold {} CV score : {}'.format(fold + 1, cv_score))
        fold_scores.append(cv_score)

        # test prediction (pre-scaled by 1/num_folds for fold averaging)
        test_pred = runner.predict_loader(
            model, test_dl, resume=best_model_path, verbose=False) / num_folds
        test_preds.append(test_pred)

    # submit
    # TODO: set your submit process
    # BUGFIX: each fold prediction is already divided by num_folds, so the
    # folds must be summed (np.mean would divide by num_folds twice).
    sample_submission[target_cols] = np.sum(test_preds, axis=0)
    sample_submission.to_csv('submission.csv')
    return True
Ejemplo n.º 26
0
def run(config_file):
    """Train an smp segmentation model for the cloud-segmentation task.

    Loads the yaml config, adjusts the working dir for Colab/Kaggle, builds
    loaders/model/optimizer from the config and launches a Catalyst
    ``SupervisedRunner`` training loop with Dice/IoU metric callbacks.

    Args:
        config_file: path to the yaml configuration file.
    """
    config = load_config(config_file)
    # set up the environment flags for working with the KAGGLE GPU OR COLAB_GPU
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    # save the configuration to the working dir
    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')

    # Enter the GPUS you have,
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    # our dataset has an explicit validation folder, use that later.
    all_transforms['valid'] = get_transforms(config.transforms.test)

    print("before rajat config", config.data.height, config.data.width)
    # fetch the dataloaders we need
    dataloaders = {
        phase: make_loader(data_folder=config.data.train_dir,
                           df_path=config.data.train_df_path,
                           phase=phase,
                           img_size=(config.data.height, config.data.width),
                           batch_size=config.train.batch_size,
                           num_workers=config.num_workers,
                           idx_fold=config.data.params.idx_fold,
                           transforms=all_transforms[phase],
                           num_classes=config.data.num_classes,
                           pseudo_label_path=config.train.pseudo_label_path,
                           debug=config.debug)
        for phase in ['train', 'valid']
    }

    # create the segmentation model with a pre-trained encoder
    # (see segmentation_models_pytorch docs for the full parameter list)
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # fetch the loss; decoder and encoder get separate learning rates
    criterion = get_loss(config)
    params = [
        {
            'params': model.decoder.parameters(),
            'lr': config.optimizer.params.decoder_lr
        },
        {
            'params': model.encoder.parameters(),
            'lr': config.optimizer.params.encoder_lr
        },
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # Catalyst supervised runner
    runner = SupervisedRunner(model=model, device=get_device())

    # IoU and Dice metric callbacks
    callbacks = [DiceCallback(), IouCallback()]

    # adding patience
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    # gradient accumulation: zero_grad only after accumulation_steps batches
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend([
            CriterionCallback(),
            OptimizerCallback(accumulation_steps=accumulation_steps)
        ])

    # to resume from checkpoints if they exist
    # BUGFIX: check for the file that is actually resumed (last_full.pth);
    # the original tested best.pth and could then fail to load last_full.pth.
    if os.path.exists(config.work_dir + '/checkpoints/last_full.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir +
                               '/checkpoints/last_full.pth'))

    # mixup (https://arxiv.org/pdf/1710.09412.pdf) and cutmix are
    # config-gated augmentation callbacks.
    # BUGFIX: the original appended MixupCallback/CutMixCallback a second
    # time unconditionally, duplicating them when enabled and forcing both
    # augmentations on even when disabled in the config.
    if config.train.mixup:
        callbacks.append(MixupCallback())
    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # training loop (fp16 handled by Catalyst when enabled)
    print(config.work_dir)
    print(config.train.minimize_metric)
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
Ejemplo n.º 27
0
def train_segmentation_model(
        model: torch.nn.Module,
        logdir: str,
        num_epochs: int,
        loaders: Dict[str, DataLoader]
):
    """Train a binary segmentation model with a weighted Dice+IoU+BCE loss.

    Uses RAdam wrapped in Lookahead with a layerwise (smaller) encoder LR,
    an LR-on-plateau scheduler, and IoU as the checkpoint-selection metric.
    The best model is saved to ``<logdir>/save/best_model.pth`` and a traced
    version is attempted best-effort.

    Args:
        model: segmentation network whose encoder parameters match "encoder*".
        logdir: Catalyst log directory.
        num_epochs: number of training epochs.
        loaders: dict with "train" and "valid" dataloaders yielding dict
            batches with "image" and "mask" keys.
    """
    criterion = {
        "dice": DiceLoss(),
        "iou": IoULoss(),
        "bce": nn.BCEWithLogitsLoss()
    }

    learning_rate = 0.001
    encoder_learning_rate = 0.0005

    # Encoder parameters get a smaller LR and their own weight decay.
    layerwise_params = {"encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)}
    model_params = utils.process_model_params(model, layerwise_params=layerwise_params)
    base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
    optimizer = Lookahead(base_optimizer)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.25, patience=2)

    device = utils.get_device()
    runner = SupervisedRunner(device=device, input_key='image', input_target_key='mask')

    callbacks = [
        # Individual loss terms...
        CriterionCallback(
            input_key="mask",
            prefix="loss_dice",
            criterion_key="dice"
        ),
        CriterionCallback(
            input_key="mask",
            prefix="loss_iou",
            criterion_key="iou"
        ),
        CriterionCallback(
            input_key="mask",
            prefix="loss_bce",
            criterion_key="bce"
        ),

        # ...aggregated into the total "loss" as a weighted sum.
        MetricAggregationCallback(
            prefix="loss",
            mode="weighted_sum",
            metrics={"loss_dice": 1.0, "loss_iou": 1.0, "loss_bce": 0.8},
        ),

        # metrics
        DiceCallback(input_key='mask'),
        IouCallback(input_key='mask'),
    ]

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=callbacks,
        logdir=logdir,
        num_epochs=num_epochs,
        main_metric="iou",
        minimize_metric=False,
        verbose=True,
        load_best_on_end=True,
    )
    best_model_save_dir = os.path.join(logdir, 'save')
    # BUGFIX: the second positional argument of os.makedirs is `mode`, not
    # `exist_ok`; the original passed True (i.e. mode=0o1) and still crashed
    # on re-runs when the directory already existed.
    os.makedirs(best_model_save_dir, exist_ok=True)
    torch.save(model, os.path.join(best_model_save_dir, 'best_model.pth'))   # save best model (by valid loss)
    batch = next(iter(loaders["valid"]))
    try:
        runner.trace(model=model, batch=batch, logdir=logdir, fp16=False)  # optimized version (not all models can be traced)
    except Exception:
        pass
Ejemplo n.º 28
0
from catalyst.utils import get_device
from catalyst.dl.runner import SupervisedRunner
from catalyst.dl.callbacks import DiceCallback, EarlyStoppingCallback

from utils.dataset import CustomDataset

from utils.augmentation import get_validation_augmentation, get_training_augmentation
from utils.losses import WeightedBCEDiceLoss
from utils.callbacks import CometCallback

from models.EffUNet import EffUNet


# Restrict which GPUs are visible to this process.
# NOTE(review): `...` is a placeholder — a real value must be a string
# such as "0"; assigning Ellipsis to os.environ would raise at runtime.
os.environ["CUDA_VISIBLE_DEVICES"] = ...
device = get_device()

# Experiment hyperparameters; every `...` entry is a template placeholder
# to be filled in before running.
hyper_params = {
    "in_channels": ...,
    "num_classes": ...,
    "batch_size": ...,
    "num_epochs": ...,
    "learning_rate": 1e-3,
    # Weights for combining dice and BCE-with-logits terms of the loss
    # (presumably consumed by WeightedBCEDiceLoss — verify against its API).
    "lambda_dice": 0.5,
    "lambda_bceWithLogits": 1.5,
    "logdir": ...
}

# Comet.ml experiment for tracking; constructor args (API key, project
# name, …) are elided placeholders.
experiment = Experiment(...)

# Record all hyperparameters with the experiment tracker.
experiment.log_parameters(hyper_params)
Ejemplo n.º 29 (score: 0)
def run(config_file):
    """Train a segmentation model end-to-end from a YAML config file.

    Loads the config, builds train/valid dataloaders, instantiates an
    ``segmentation_models_pytorch`` model, assembles Catalyst callbacks
    (metrics, early stopping, gradient accumulation, checkpoint resume,
    mixup/cutmix), and launches ``runner.train``.

    Args:
        config_file: path to the YAML configuration to load.
    """
    config = load_config(config_file)
    # Redirect the work dir to persistent storage when running on
    # Colab (Google Drive) or in a Kaggle kernel.
    if 'COLAB_GPU' in os.environ:
        config.work_dir = '/content/drive/My Drive/kaggle_cloud/' + config.work_dir
    elif 'KAGGLE_WORKING_DIR' in os.environ:
        config.work_dir = '/kaggle/working/' + config.work_dir
    print('working directory:', config.work_dir)

    if not os.path.exists(config.work_dir):
        os.makedirs(config.work_dir, exist_ok=True)
    # Snapshot the effective config next to the training artifacts.
    save_config(config, config.work_dir + '/config.yml')

    # Hard-coded to the first GPU only.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # Separate augmentation pipelines: training augs vs. test-time transforms
    # (validation reuses the test transforms).
    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    # Both phases read from the training dataframe; the split is selected
    # inside make_loader via idx_fold.
    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            img_size=(config.data.height, config.data.width),
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            debug=config.debug
        )
        for phase in ['train', 'valid']
    }

    # create segmentation model with pre trained encoder
    # (config.model.arch names an smp class, e.g. "Unet"; activation=None
    # leaves raw logits for the loss).
    model = getattr(smp, config.model.arch)(
        encoder_name=config.model.encoder,
        encoder_weights=config.model.pretrained,
        classes=config.data.num_classes,
        activation=None,
    )

    # train setting
    criterion = get_loss(config)
    # Separate LR groups: decoder (randomly initialised) vs. encoder
    # (pretrained, usually a smaller LR).
    params = [
        {'params': model.decoder.parameters(), 'lr': config.optimizer.params.decoder_lr},
        {'params': model.encoder.parameters(), 'lr': config.optimizer.params.encoder_lr},
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model, device=get_device())

    callbacks = [DiceCallback(), IouCallback()]

    # Optional early stopping, driven by config.
    if config.train.early_stop_patience > 0:
        callbacks.append(EarlyStoppingCallback(
            patience=config.train.early_stop_patience))

    # Gradient accumulation: accumulation_size is the effective batch size,
    # converted here into a number of optimizer steps.
    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [CriterionCallback(),
             OptimizerCallback(accumulation_steps=accumulation_steps)]
        )

    # to resume from check points if exists
    # (best.pth existing implies a previous run; resume from last_full.pth)
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(CheckpointCallback(
            resume=config.work_dir + '/checkpoints/last_full.pth'))

    # Optional batch-level augmentations.
    if config.train.mixup:
        callbacks.append(MixupCallback())

    if config.train.cutmix:
        callbacks.append(CutMixCallback())

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric=config.train.main_metric,
        minimize_metric=config.train.minimize_metric,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )