def summary(self):
    # torchsummary only supports "cuda" or "cpu", not indexed device strings
    # such as "cuda:0", so strip any ":<index>" suffix before calling it.
    device = str(self.device).split(":")[0]
    torch_summary(
        self,
        input_size=(self._channels, self._height, self._width),
        device=device,
    )
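# For illustration only -- a minimal, hypothetical usage sketch of the device
# handling above, assuming `from torchsummary import summary as torch_summary`.
# The model and input size below are placeholders, not part of the original code.
#
#   import torch
#   import torch.nn as nn
#   from torchsummary import summary as torch_summary
#
#   model = nn.Sequential(nn.Conv2d(3, 8, kernel_size=3, padding=1), nn.ReLU())
#   device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#   model = model.to(device)
#   # "cuda:0" -> "cuda": torchsummary rejects indexed device strings.
#   torch_summary(model, input_size=(3, 224, 224), device=str(device).split(":")[0])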
def run_experiment(
    experiment: Experiment,
    debug_pipeline: bool = False,
    develop_mode: bool = False,
    data_loader_workers: int = 1,
    cross_validation_iterations: int = 3,
    device: str = "cpu",
    develop_mode_samples: int = 10,
) -> List[Result]:
    LOGGER.info("Beginning experiment: %s, %s", experiment.id(), experiment.description())

    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)
    augmentations = AugmentedCollate(experiment.augmentation_stages())

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    # File system cache with a one-to-one mapping to an experiment, used to share
    # cached data between multiple workers; it can safely be reused across
    # cross-validation runs.
    cache = joblib.Memory(f"./cachedir/{experiment.id()}", verbose=0)
    LOGGER.info("Initialised cache: %s", cache)

    LOGGER.info("Creating APTOSDataset for the following directories: %s", directories)
    dataset = TorchConcatDataset(
        [APTOSDataset(df, directory, pipeline, cache) for df, directory in zip(dfs, directories)]
    )

    # To facilitate software development, this makes running end-to-end tests feasible.
    if develop_mode:
        LOGGER.warning("Running in develop mode, using a fraction of the whole dataset")
        dataset, _ = torch_random_split(
            dataset, [develop_mode_samples, len(dataset) - develop_mode_samples]
        )

    results = []
    for cv_iteration in range(1, cross_validation_iterations + 1):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        with APTOSMonitor(experiment, cv_iteration) as monitor:
            LOGGER.info(f'tensorboard --logdir "{monitor._summary_writer.log_dir}"')

            # Size the splits from the test fraction so the two lengths always
            # sum to len(dataset), even when rounding.
            n_test = round(experiment.test_size() * len(dataset))
            train_ds, test_ds = torch_random_split(
                dataset, [len(dataset) - n_test, n_test]
            )

            sampler, sampler_kwargs = experiment.sampler()
            sampler = sampler(train_ds, **sampler_kwargs)

            train_loader = TorchDataLoader(
                train_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                # Potentially an unconventional use of collate_fn, but it makes the
                # train data loader responsible for augmentations, which is nice.
                collate_fn=augmentations,
                sampler=sampler,
            )

            test_loader = TorchDataLoader(
                test_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
            )

            # Move the model to the requested device rather than unconditionally
            # calling .cuda(), since device defaults to "cpu".
            model = experiment.model(input_shape=train_ds[0][0].shape).to(device)
            # torch_summary prints the table itself; it only accepts "cuda" or "cpu".
            torch_summary(model, train_ds[0][0].shape, device=str(device).split(":")[0])

            optimizer_class, optim_kwargs = experiment.optimizer()
            optimizer = optimizer_class(model.parameters(), **optim_kwargs)

            lr_scheduler, scheduler_kwargs = experiment.lr_scheduler()
            lr_scheduler = lr_scheduler(optimizer, **scheduler_kwargs)

            monitor.on_cv_start(train_ds, augmentations)

            for epoch in range(1, experiment.max_epochs() + 1):
                LOGGER.info("Epoch: %s", epoch)

                train(model, train_loader, optimizer, device, monitor)
                lr_scheduler.step()

                predictions_proba, predictions, targets, ids, losses = test(
                    model, test_loader, device, monitor
                )

                # Checkpoint every second epoch.
                if epoch % 2 == 0:
                    checkpoint = {
                        "model": model,
                        "state_dict": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "experiment": experiment.state_dict(),
                    }
                    checkpoint_directory = f"results/{experiment.id()}"
                    # makedirs also creates the parent "results" directory if needed.
                    os.makedirs(checkpoint_directory, exist_ok=True)
                    torch.save(
                        checkpoint,
                        os.path.join(checkpoint_directory, f"{cv_iteration}-{epoch}-checkpoint.pth"),
                    )

            monitor.on_cv_end()

        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration": [cv_iteration for _ in range(len(targets))],
            "targets": targets,
            "predictions": predictions,
            "id_code": ids,
        })

        results.append(Result(experiment, results_df))

    # Deletes cached content on disk... (until experiments have a unique hash this makes sense).
    cache.clear()

    return results
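# The AugmentedCollate used above is defined elsewhere in the repository; as a
# rough sketch of the idea (augmentation-as-collation), a minimal version might
# look like the following. The (image, target, id_code) sample layout is an
# assumption inferred from how the dataset is consumed above, and the class name
# is hypothetical.

from torch.utils.data.dataloader import default_collate

class _AugmentedCollateSketch:
    """Applies a transform to each image before delegating to default_collate."""

    def __init__(self, transform):
        # `transform` is any callable mapping an image tensor to an image tensor,
        # e.g. a composed torchvision transform pipeline.
        self.transform = transform

    def __call__(self, batch):
        # Augment each sample's image independently; leave targets/ids untouched.
        augmented = [(self.transform(image), *rest) for image, *rest in batch]
        # Fall back to PyTorch's default batching (stacking tensors, etc.).
        return default_collate(augmented)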
def run_experiment(
    experiment: Experiment,
    debug_pipeline: bool = False,
    develop_mode: bool = False,
    data_loader_workers: int = 1,
    cross_validation_iterations: int = 3,
    device: str = "cpu",
    develop_mode_samples: int = 10,
) -> List[Result]:
    LOGGER.info("Beginning experiment: %s, %s", experiment.id(), experiment.description())

    # Preprocessing
    pipeline = Pipeline(experiment.pipeline_stages(), debug=debug_pipeline)

    # Augmentations
    augmentations = AugmentedCollate(experiment.augmentation_stages())
    test_augmentations = AugmentedCollate(experiment.test_augmentation_stages())

    dfs = experiment.train_test_data_frames()
    directories = experiment.train_test_directories()

    # File system cache with a one-to-one mapping to an experiment, used to share
    # cached data between multiple workers; it can safely be reused across
    # cross-validation runs.
    cache = joblib.Memory(f"./cachedir/{experiment.id()}", verbose=0)
    LOGGER.info("Initialised cache: %s", cache)

    LOGGER.info("Creating APTOSDataset for the following directories: %s", directories)
    dataset = TorchConcatDataset(
        [APTOSDataset(df, directory, pipeline, cache) for df, directory in zip(dfs, directories)]
    )

    # To facilitate software development, this makes running end-to-end tests feasible.
    if develop_mode:
        LOGGER.warning("Running in develop mode, using a fraction of the whole dataset")
        dataset, _ = torch_random_split(
            dataset, [develop_mode_samples, len(dataset) - develop_mode_samples]
        )

    results = []

    # Stratified shuffle-split cross-validator: provides train/test indices that
    # preserve the class distribution in each split.
    sss = StratifiedShuffleSplit(
        n_splits=cross_validation_iterations,
        test_size=experiment.test_size(),
        train_size=1 - experiment.test_size(),
        random_state=0,
    )
    # TODO: will probably need debugging when more than one dataset is added.
    labels = np.asarray(dfs[0]["diagnosis"])
    split_generator = sss.split(np.zeros(labels.shape), labels)

    for cv_iteration, (train_index, test_index) in zip(
        range(1, cross_validation_iterations + 1), split_generator
    ):
        LOGGER.info("Cross validation iteration: %s", cv_iteration)

        with APTOSMonitor(experiment, cv_iteration) as monitor:
            LOGGER.info(f'tensorboard --logdir "{monitor._summary_writer.log_dir}"')

            train_ds = Subset(dataset, train_index)
            test_ds = Subset(dataset, test_index)

            LOGGER.info("Train data size: %s", len(train_ds))
            LOGGER.info("Histogram of classes: %s", np.histogram(labels[train_index], 5))

            # Inverse-frequency class weights: rarer classes receive
            # proportionally larger weights.
            class_data = np.histogram(labels[train_index], 5)[0]
            class_weights = class_data.sum() / (class_data.shape[0] * class_data)

            LOGGER.info("Test data size: %s", len(test_ds))
            LOGGER.info("Histogram of classes: %s", np.histogram(labels[test_index], 5))

            sampler, sampler_kwargs = experiment.sampler()
            sampler = sampler(train_ds, **sampler_kwargs)

            train_loader = TorchDataLoader(
                train_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                # Potentially an unconventional use of collate_fn, but it makes the
                # train data loader responsible for augmentations, which is nice.
                collate_fn=augmentations,
                sampler=sampler,
            )

            test_loader = TorchDataLoader(
                test_ds,
                batch_size=experiment.batch_size(),
                num_workers=data_loader_workers,
                collate_fn=test_augmentations,
            )

            # Move the model to the requested device rather than unconditionally
            # calling .cuda(), since device defaults to "cpu".
            model = experiment.model(input_shape=train_ds[0][0].shape).to(device)
            # torch_summary prints the table itself; it only accepts "cuda" or "cpu".
            torch_summary(model, train_ds[0][0].shape, device=str(device).split(":")[0])

            optimizer_class, optim_kwargs = experiment.optimizer()
            optimizer = optimizer_class(model.parameters(), **optim_kwargs)

            lr_scheduler, scheduler_kwargs = experiment.lr_scheduler()
            lr_scheduler = lr_scheduler(optimizer, **scheduler_kwargs)

            monitor.on_cv_start(train_ds, augmentations)

            # The alpha parameter applies the per-class weights computed above.
            criterion = FocalLoss(num_class=5, gamma=2, alpha=class_weights)

            for epoch in range(1, experiment.max_epochs() + 1):
                LOGGER.info("Epoch: %s", epoch)

                train(model, train_loader, optimizer, device, criterion, monitor)
                lr_scheduler.step()

                predictions_proba, predictions, targets, ids, losses = test(
                    model, test_loader, device, criterion, monitor
                )

                # Checkpoint every second epoch.
                if epoch % 2 == 0:
                    checkpoint = {
                        "model": model,
                        "state_dict": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "experiment": experiment.state_dict(),
                    }
                    checkpoint_directory = f"results/{experiment.id()}"
                    # makedirs also creates the parent "results" directory if needed.
                    os.makedirs(checkpoint_directory, exist_ok=True)
                    torch.save(
                        checkpoint,
                        os.path.join(checkpoint_directory, f"{cv_iteration}-{epoch}-checkpoint.pth"),
                    )

            monitor.on_cv_end()

        predictions = predictions.tolist()
        targets = targets.tolist()

        results_df = pd.DataFrame({
            "experiment_id": [experiment.id() for _ in range(len(targets))],
            "cross_validation_iteration": [cv_iteration for _ in range(len(targets))],
            "targets": targets,
            "predictions": predictions,
            "id_code": ids,
        })

        results.append(Result(experiment, results_df))

    # Deletes cached content on disk... (until experiments have a unique hash this makes sense).
    cache.clear()

    return results
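# FocalLoss is imported from elsewhere in the repository; a minimal sketch
# consistent with the constructor used above (a multi-class focal loss with
# per-class alpha weights, after Lin et al. 2017) might look like the
# following. The class name is hypothetical, and the real implementation's
# reduction and weighting details may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class _FocalLossSketch(nn.Module):
    """FL(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t), averaged over the batch."""

    def __init__(self, num_class, gamma=2.0, alpha=None):
        super().__init__()
        self.gamma = gamma
        if alpha is None:
            alpha = torch.ones(num_class)
        # Registered as a buffer so the weights follow the module across devices.
        self.register_buffer("alpha", torch.as_tensor(alpha, dtype=torch.float32))

    def forward(self, logits, targets):
        # logits: (N, C); targets: (N,) class indices.
        log_probs = F.log_softmax(logits, dim=-1)
        # Log-probability and probability of the true class for each sample.
        log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        pt = log_pt.exp()
        alpha_t = self.alpha[targets]
        # Down-weight easy examples (pt close to 1), up-weight rare classes.
        return (-alpha_t * (1.0 - pt) ** self.gamma * log_pt).mean()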