Exemple #1
0
def train_experiment(engine=None):
    """Train an embedding network on synthetic data and check sklearn accuracy.

    A synthetic 3-class dataset is generated with ``make_classification``,
    wrapped into one ``DataLoader`` and used as both the ``"train"`` and the
    ``"valid"`` loader. A custom runner stores the network output under the
    ``"embeddings"`` batch key; ``dl.SklearnModelCallback`` is configured with
    ``RandomForestClassifier`` and writes to ``"sklearn_predict"``, which an
    accuracy callback (restricted to the ``"valid"`` loader) consumes.

    The function asserts that the best ``accuracy01`` recorded for the
    ``"valid"`` loader in ``runner.experiment_metrics`` is above 0.9.

    Args:
        engine: passed through unchanged to ``runner.train``.
    """
    with TemporaryDirectory() as logdir:
        utils.set_global_seed(RANDOM_STATE)

        # 1. synthetic data: every feature is informative, one cluster per class
        num_samples = int(1e4)
        num_features = int(30)
        num_classes = 3
        raw_features, raw_labels = make_classification(
            n_samples=num_samples,
            n_features=num_features,
            n_informative=num_features,
            n_repeated=0,
            n_redundant=0,
            n_classes=num_classes,
            n_clusters_per_class=1,
        )
        feature_tensor = torch.tensor(raw_features)
        label_tensor = torch.tensor(raw_labels)
        loader = DataLoader(
            TensorDataset(feature_tensor, label_tensor),
            batch_size=64,
            num_workers=1,
            shuffle=True,
        )

        # 2. embedding network, optimizer and LR scheduler
        hidden_size = 20
        out_features = 16
        model = nn.Sequential(
            nn.Linear(num_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_features),
        )
        optimizer = Adam(model.parameters(), lr=LR)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        # 3. triplet-margin criterion fed by an in-batch hard-triplet sampler
        criterion = TripletMarginLossWithSampler(
            margin=0.5,
            sampler_inbatch=HardTripletsSampler(norm_required=False),
        )

        # 4. runner that replaces the batch with embeddings + targets
        class CustomRunner(dl.SupervisedRunner):
            def handle_batch(self, batch) -> None:
                inputs = batch["features"].float()
                labels = batch["targets"].long()
                self.batch = {
                    "embeddings": self.model(inputs),
                    "targets": labels,
                }

        sklearn_callback = dl.SklearnModelCallback(
            feature_key="embeddings",
            target_key="targets",
            train_loader="train",
            valid_loaders="valid",
            model_fn=RandomForestClassifier,
            predict_method="predict_proba",
            predict_key="sklearn_predict",
            random_state=RANDOM_STATE,
            n_estimators=100,
        )
        # accuracy is computed only on the "valid" loader
        accuracy_callback = dl.ControlFlowCallbackWrapper(
            dl.AccuracyCallback(
                target_key="targets",
                input_key="sklearn_predict",
                topk=(1, 3),
            ),
            loaders="valid",
        )

        runner = CustomRunner(input_key="features", output_key="embeddings")
        runner.train(
            engine=engine,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            callbacks=[sklearn_callback, accuracy_callback],
            scheduler=scheduler,
            # the same loader serves as both train and valid split
            loaders={"train": loader, "valid": loader},
            verbose=False,
            valid_loader="valid",
            valid_metric="accuracy01",
            minimize_valid_metric=False,
            num_epochs=TRAIN_EPOCH,
            logdir=logdir,
        )

        # best top-1 accuracy over all recorded epochs
        valid_accuracies = [
            per_epoch["valid"]["accuracy01"]
            for per_epoch in runner.experiment_metrics.values()
        ]
        best_accuracy = max(valid_accuracies)

        assert best_accuracy > 0.9
    # NOTE(review): the statements below read `args`, which is not defined in
    # the visible enclosing scope. This looks like a fragment of a separate
    # Barlow Twins training script whose beginning and end are not shown here;
    # confirm where it belongs before relying on it.

    # define criterion
    # NOTE(review): the attribute is spelled `offdig_lambda` while the keyword
    # is `offdiag_lambda`; verify the spelling against wherever `args` is built.
    criterion = BarlowTwinsLoss(offdiag_lambda=args.offdig_lambda)

    # and callbacks
    callbacks = [
        # criterion is applied to the two projection keys and stored as "loss"
        dl.CriterionCallback(
            input_key="projection_left", target_key="projection_right", metric_key="loss"
        ),
        dl.BackwardCallback(metric_key="loss"),
        dl.OptimizerCallback(metric_key="loss"),
        # sklearn model configured on the "embedding_origin" key; its output is
        # written to "sklearn_predict"
        dl.SklearnModelCallback(
            feature_key="embedding_origin",
            target_key="target",
            train_loader="train",
            valid_loaders="valid",
            model_fn=LogisticRegression,
            predict_key="sklearn_predict",
            predict_method="predict_proba",
            # presumably forwarded to `model_fn` as constructor kwargs - TODO confirm
            C=0.1,
            solver="saga",
            max_iter=200,
        ),
        # accuracy over the sklearn predictions, restricted to the "valid" loader
        dl.ControlFlowCallbackWrapper(
            dl.AccuracyCallback(
                target_key="target", input_key="sklearn_predict", topk=(1, 3)
            ),
            loaders="valid",
        ),
    ]

    # train model
    # NOTE(review): the runner is only constructed here; the call that would
    # start training is not part of the visible fragment.
    runner = SelfSupervisedRunner()
def train_experiment(device, engine=None):
    """Train an MNIST embedding model and check sklearn accuracy on "infer".

    Builds a class-balanced train loader plus two loaders over the MNIST
    ``train=False`` split (registered as ``"valid"`` and ``"infer"``), trains
    a single-linear-layer embedding model through a custom runner, and
    configures ``dl.SklearnModelCallback`` with ``RandomForestClassifier`` on
    the ``"embeddings"`` key. After training, the ``accuracy`` column of
    ``logs/infer.csv`` inside the temporary log directory is read and its
    maximum is asserted to exceed 0.8.

    Args:
        device: used to build ``dl.DeviceEngine(device)`` when ``engine`` is
            falsy.
        engine: engine passed to ``runner.train``; takes precedence over
            ``device`` when truthy.
    """
    with TemporaryDirectory() as logdir:
        from catalyst import utils

        utils.set_global_seed(RANDOM_STATE)

        # 1. train, valid and test loaders
        transforms = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))])

        train_data = MNIST(
            os.getcwd(),
            train=True,
            download=True,
            transform=transforms,
        )
        train_labels = train_data.targets.cpu().numpy().tolist()
        balanced_sampler = data.BatchBalanceClassSampler(
            train_labels,
            num_classes=10,
            num_samples=4,
        )
        train_loader = DataLoader(train_data, batch_sampler=balanced_sampler)

        valid_dataset = MNIST(
            root=os.getcwd(),
            transform=transforms,
            train=False,
            download=True,
        )
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=32)

        # "infer" is built from the same train=False split as "valid"
        test_dataset = MNIST(
            root=os.getcwd(),
            transform=transforms,
            train=False,
            download=True,
        )
        test_loader = DataLoader(dataset=test_dataset, batch_size=32)

        # 2. model, optimizer and scheduler
        model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, 16),
            nn.LeakyReLU(inplace=True),
        )
        optimizer = Adam(model.parameters(), lr=LR)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        # 3. triplet-margin criterion fed by an in-batch hard-triplet sampler
        criterion = nn.TripletMarginLossWithSampler(
            margin=0.5,
            sampler_inbatch=data.HardTripletsSampler(norm_required=False),
        )

        # 4. runner that replaces the batch with embeddings + targets
        class CustomRunner(dl.SupervisedRunner):
            def handle_batch(self, batch) -> None:
                images = batch["features"].float()
                targets = batch["targets"].long()
                self.batch = {
                    "embeddings": self.model(images),
                    "targets": targets,
                }

        # the criterion is evaluated on the "train" loader only
        loss_callback = dl.ControlFlowCallback(
            dl.CriterionCallback(
                input_key="embeddings",
                target_key="targets",
                metric_key="loss",
            ),
            loaders="train",
        )
        sklearn_callback = dl.SklearnModelCallback(
            feature_key="embeddings",
            target_key="targets",
            train_loader="train",
            valid_loaders=["valid", "infer"],
            model_fn=RandomForestClassifier,
            predict_method="predict_proba",
            predict_key="sklearn_predict",
            random_state=RANDOM_STATE,
            n_estimators=50,
        )
        # accuracy is computed on the two evaluation loaders only
        accuracy_callback = dl.ControlFlowCallback(
            dl.AccuracyCallback(
                target_key="targets",
                input_key="sklearn_predict",
                topk_args=(1, 3),
            ),
            loaders=["valid", "infer"],
        )

        runner = CustomRunner(input_key="features", output_key="embeddings")
        runner.train(
            engine=engine or dl.DeviceEngine(device),
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            callbacks=[loss_callback, sklearn_callback, accuracy_callback],
            loaders={
                "train": train_loader,
                "valid": valid_loader,
                "infer": test_loader,
            },
            verbose=False,
            valid_loader="valid",
            valid_metric="accuracy",
            minimize_valid_metric=False,
            num_epochs=TRAIN_EPOCH,
            logdir=logdir,
        )

        # best accuracy among the rows logged for the "infer" loader
        infer_log = Path(logdir) / "logs/infer.csv"
        infer_accuracies = [float(row["accuracy"]) for row in read_csv(infer_log)]
        best_accuracy = max(infer_accuracies)

        assert best_accuracy > 0.8
    # NOTE(review): `test_data`, `batch_size`, `feature_dim`, `offdig_lambda`
    # and `Model` are not defined in the visible enclosing scope. This looks
    # like a fragment of a separate Barlow Twins script whose beginning and end
    # are not shown here; confirm where it belongs before relying on it.
    valid_loader = DataLoader(test_data,
                              batch_size=batch_size,
                              pin_memory=True)

    callbacks = [
        # criterion over the "out_1"/"out_2" keys, evaluated on "train" only
        dl.ControlFlowCallback(
            dl.CriterionCallback(input_key="out_1",
                                 target_key="out_2",
                                 metric_key="loss"),
            loaders="train",
        ),
        # sklearn model configured on the "embeddings" key; its output is
        # written to "sklearn_predict"
        dl.SklearnModelCallback(
            feature_key="embeddings",
            target_key="targets",
            train_loader="train",
            valid_loaders="valid",
            model_fn=LogisticRegression,
            predict_key="sklearn_predict",
            predict_method="predict_proba",
        ),
        dl.OptimizerCallback(metric_key="loss"),
        # accuracy over the sklearn predictions, restricted to "valid"
        dl.ControlFlowCallback(
            dl.AccuracyCallback(target_key="targets",
                                input_key="sklearn_predict",
                                topk_args=(1, 3)),
            loaders="valid",
        ),
    ]

    model = Model(feature_dim, arch="resnet50")
    # NOTE(review): the variable is spelled `offdig_lambda` while the keyword
    # is `offdiag_lambda`; verify the spelling against its definition.
    criterion = BarlowTwinsLoss(offdiag_lambda=offdig_lambda)
def train_experiment(device, engine=None):
    """Train a contrastive MNIST model and check sklearn accuracy on "valid".

    MNIST is wrapped in ``SelfSupervisedDatasetWrapper`` for both the train
    and the ``train=False`` split. An encoder plus projection head is trained
    with ``NTXentLoss`` through ``dl.SelfSupervisedRunner``, and
    ``dl.SklearnModelCallback`` is configured with ``RandomForestClassifier``
    on the ``"embedding_left"`` key. After training, the ``accuracy`` column
    of ``logs/valid.csv`` is read and its maximum is asserted to exceed 0.6.

    All run logs are written into a temporary directory that is removed when
    the function returns. The MNIST files themselves are still stored under
    ``./logdir``.

    Args:
        device: used to build ``dl.DeviceEngine(device)`` when ``engine`` is
            falsy.
        engine: engine passed to ``runner.train``; takes precedence over
            ``device`` when truthy.
    """
    with TemporaryDirectory() as logdir:

        # 1. data and transforms

        # augmentation pipeline passed as `transforms` to the dataset wrapper
        transforms = Compose([
            torchvision.transforms.ToPILImage(),
            torchvision.transforms.RandomCrop((28, 28)),
            torchvision.transforms.RandomVerticalFlip(),
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.ToTensor(),
            Normalize((0.1307, ), (0.3081, )),
        ])

        # non-augmenting pipeline passed as `transform_original`
        transform_original = Compose([
            ToTensor(),
            Normalize((0.1307, ), (0.3081, )),
        ])

        mnist = MNIST("./logdir", train=True, download=True, transform=None)
        contrastive_mnist = SelfSupervisedDatasetWrapper(
            mnist,
            transforms=transforms,
            transform_original=transform_original)
        train_loader = torch.utils.data.DataLoader(contrastive_mnist,
                                                   batch_size=BATCH_SIZE)

        mnist_valid = MNIST("./logdir",
                            train=False,
                            download=True,
                            transform=None)
        contrastive_valid = SelfSupervisedDatasetWrapper(
            mnist_valid,
            transforms=transforms,
            transform_original=transform_original)
        valid_loader = torch.utils.data.DataLoader(contrastive_valid,
                                                   batch_size=BATCH_SIZE)

        # 2. model and optimizer
        encoder = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 16),
                                nn.LeakyReLU(inplace=True))
        projection_head = nn.Sequential(
            nn.Linear(16, 16, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(16, 16, bias=True),
        )

        class ContrastiveModel(torch.nn.Module):
            """Encoder followed by a projection head; returns both outputs."""

            def __init__(self, model, encoder):
                super().__init__()
                self.model = model
                self.encoder = encoder

            def forward(self, x):
                emb = self.encoder(x)
                projection = self.model(emb)
                return emb, projection

        model = ContrastiveModel(model=projection_head, encoder=encoder)

        optimizer = Adam(model.parameters(), lr=LR)

        # 3. contrastive criterion (NT-Xent)
        criterion = NTXentLoss(tau=0.1)

        callbacks = [
            # the loss is evaluated on the "train" loader only
            dl.ControlFlowCallback(
                dl.CriterionCallback(input_key="projection_left",
                                     target_key="projection_right",
                                     metric_key="loss"),
                loaders="train",
            ),
            dl.SklearnModelCallback(
                feature_key="embedding_left",
                target_key="target",
                train_loader="train",
                valid_loaders="valid",
                model_fn=RandomForestClassifier,
                predict_method="predict_proba",
                predict_key="sklearn_predict",
                random_state=RANDOM_STATE,
                n_estimators=50,
            ),
            # accuracy over the sklearn predictions, restricted to "valid"
            dl.ControlFlowCallback(
                dl.AccuracyCallback(target_key="target",
                                    input_key="sklearn_predict",
                                    topk_args=(1, 3)),
                loaders="valid",
            ),
        ]

        runner = dl.SelfSupervisedRunner()

        # Log into the TemporaryDirectory opened above rather than a fixed
        # "./logdir" path, so the temp dir is actually used and no logs are
        # left behind in the working directory between runs.
        runner.train(
            model=model,
            engine=engine or dl.DeviceEngine(device),
            criterion=criterion,
            optimizer=optimizer,
            callbacks=callbacks,
            loaders={
                "train": train_loader,
                "valid": valid_loader
            },
            verbose=False,
            logdir=logdir,
            # "loss" is only produced on the "train" loader (see the
            # ControlFlowCallback above), so model selection uses it
            valid_loader="train",
            valid_metric="loss",
            minimize_valid_metric=True,
            num_epochs=TRAIN_EPOCH,
        )

        valid_path = Path(logdir) / "logs/valid.csv"
        # rows whose accuracy field is the literal text "accuracy" are skipped
        # (presumably repeated header lines - TODO confirm)
        best_accuracy = max(
            float(row["accuracy"]) for row in read_csv(valid_path)
            if row["accuracy"] != "accuracy")

        assert best_accuracy > 0.6
def train_experiment(engine=None):
    """Train an MNIST embedding model and check sklearn accuracy on "infer".

    Builds a class-balanced train loader plus two loaders over the MNIST
    ``train=False`` split (registered as ``"valid"`` and ``"infer"``), trains
    a single-linear-layer embedding model through a custom runner, and
    configures ``dl.SklearnModelCallback`` with ``RandomForestClassifier`` on
    the ``"embeddings"`` key. The function asserts that the best
    ``accuracy01`` recorded for the ``"infer"`` loader in
    ``runner.experiment_metrics`` is above 0.8.

    Args:
        engine: passed through unchanged to ``runner.train``.
    """
    with TemporaryDirectory() as logdir:
        utils.set_global_seed(RANDOM_STATE)

        # 1. train, valid and test loaders
        train_data = MNIST(DATA_ROOT, train=True)
        train_labels = train_data.targets.cpu().numpy().tolist()
        balanced_sampler = BatchBalanceClassSampler(
            train_labels,
            num_classes=10,
            num_samples=4,
        )
        train_loader = DataLoader(train_data, batch_sampler=balanced_sampler)

        valid_dataset = MNIST(root=DATA_ROOT, train=False)
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=32)

        # "infer" is built from the same train=False split as "valid"
        test_dataset = MNIST(root=DATA_ROOT, train=False)
        test_loader = DataLoader(dataset=test_dataset, batch_size=32)

        # 2. model, optimizer and scheduler
        model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, 16),
            nn.LeakyReLU(inplace=True),
        )
        optimizer = Adam(model.parameters(), lr=LR)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])

        # 3. triplet-margin criterion fed by an in-batch hard-triplet sampler
        criterion = TripletMarginLossWithSampler(
            margin=0.5,
            sampler_inbatch=HardTripletsSampler(norm_required=False),
        )

        # 4. runner that replaces the batch with embeddings + targets
        class CustomRunner(dl.SupervisedRunner):
            def handle_batch(self, batch) -> None:
                images = batch["features"].float()
                targets = batch["targets"].long()
                self.batch = {
                    "embeddings": self.model(images),
                    "targets": targets,
                }

        # the criterion is evaluated on the "train" loader only
        loss_callback = dl.ControlFlowCallbackWrapper(
            dl.CriterionCallback(
                input_key="embeddings",
                target_key="targets",
                metric_key="loss",
            ),
            loaders="train",
        )
        sklearn_callback = dl.SklearnModelCallback(
            feature_key="embeddings",
            target_key="targets",
            train_loader="train",
            valid_loaders=["valid", "infer"],
            model_fn=RandomForestClassifier,
            predict_method="predict_proba",
            predict_key="sklearn_predict",
            random_state=RANDOM_STATE,
            n_estimators=50,
        )
        # accuracy is computed on the two evaluation loaders only
        accuracy_callback = dl.ControlFlowCallbackWrapper(
            dl.AccuracyCallback(
                target_key="targets",
                input_key="sklearn_predict",
                topk=(1, 3),
            ),
            loaders=["valid", "infer"],
        )

        runner = CustomRunner(input_key="features", output_key="embeddings")
        runner.train(
            engine=engine,
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            callbacks=[loss_callback, sklearn_callback, accuracy_callback],
            loaders={
                "train": train_loader,
                "valid": valid_loader,
                "infer": test_loader,
            },
            verbose=False,
            valid_loader="valid",
            valid_metric="accuracy01",
            minimize_valid_metric=False,
            num_epochs=TRAIN_EPOCH,
            logdir=logdir,
        )

        # best top-1 accuracy on the "infer" loader over all recorded epochs
        infer_accuracies = [
            per_epoch["infer"]["accuracy01"]
            for per_epoch in runner.experiment_metrics.values()
        ]
        best_accuracy = max(infer_accuracies)

        assert best_accuracy > 0.8