def __init__(self, logdir, **kwargs):
    """
    Get the SummaryWriter singleton instance or create a new one if not existing yet.

    This class writes summaries to TensorBoard and MLFlow. It is designed as a
    singleton, so that each part of the program uses the same instance. Once
    constructed, the constructor always returns the same instance regardless of
    the parameters passed. The summary writer will not write to disk when
    debugging is detected, to avoid cluttering the results directory.

    :param logdir: name of the log directory
    :param kwargs: kwargs for the underlying TensorBoardX summary writer
    """
    if not os.path.exists(LOG_ROOT):
        os.makedirs(LOG_ROOT)
    dirs = sorted(os.listdir(LOG_ROOT), reverse=True)
    num = 0
    for d in dirs:
        if logdir in d:
            num = int(d[:3]) + 1
            break
    logdir = str(num).zfill(3) + '_' + logdir
    logdir = os.path.join(LOG_ROOT, logdir)
    if utils.is_debugging():
        warnings.warn(
            'Debugging mode: will write to temporary TensorBoard file.',
            UserWarning)
        logdir = utils.build_tmp_dir()
    super(SummaryWriter, self).__init__(logdir, flush_secs=60, **kwargs)
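# Usage sketch for the singleton writer above (illustrative only; it assumes
# the surrounding class is called SummaryWriter and that LOG_ROOT points at
# the results directory):
#
#     writer = SummaryWriter('my_experiment')   # creates e.g. 000_my_experiment/
#     other = SummaryWriter('something_else')   # singleton: same instance again
#     assert writer is other
#     writer.add_scalar('loss/train', 0.5, global_step=1)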
def __init__(self, train_data, eval_data, batch_size, shuffle=True,
             dvc_file=None):
    """
    Construct a new dataset.

    This container holds the training and evaluation data for a dataset. It
    can return a dataloader for each of them with the predetermined batch size
    and shuffle setting. The DVC file of the dataset on disk will be logged to
    MLFlow as an artifact.

    :param train_data: torch.Dataset of the training data
    :param eval_data: torch.Dataset of the evaluation data
    :param batch_size: default batch size
    :param shuffle: default shuffle
    :param dvc_file: dvc file path of the data
    """
    self.train_data = train_data
    self.eval_data = eval_data
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.dvc_file = dvc_file
    if self.dvc_file is not None and not utils.is_debugging():
        dvc_file = dvc_file if isinstance(dvc_file, tuple) else (dvc_file,)
        for f in dvc_file:
            mlflow.log_artifact(f, artifact_path='data_version')
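# Hedged construction example for the dataset container above. The class name
# 'Dataset' and the torch datasets train_set / eval_set are assumptions; the
# loader-returning methods mentioned in the docstring are not shown here.
#
#     data = Dataset(train_set, eval_set, batch_size=32, shuffle=True,
#                    dvc_file='data/train.dvc')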
def __init__(self, file_list_path, train_data=True, has_label=True,
             transform=None, split=0.8):
    df_train = pd.read_csv(file_list_path)
    df_value = df_train.values
    if has_label:
        # Labelled data: split the file list into a train and an eval part.
        split_index = int(df_value.shape[0] * split)
        if train_data:
            split_data = df_value[:split_index]
        else:
            split_data = df_value[split_index:]
        if utils.is_debugging():
            # Use a tiny subset while debugging.
            split_data = df_value[:64]
        file_names = [None] * split_data.shape[0]
        labels = [None] * split_data.shape[0]
        for index, line in enumerate(split_data):
            f, invasive = line
            file_names[index] = os.path.join(settings.TRAIN_DIR,
                                             str(f) + '.jpg')
            labels[index] = invasive
        self.labels = np.array(labels, dtype=np.float32)
    else:
        # Unlabelled test data: only collect the file names.
        file_names = [None] * df_train.values.shape[0]
        for index, line in enumerate(df_train.values):
            f, invasive = line
            file_names[index] = os.path.join(settings.TEST_DIR,
                                             str(int(f)) + '.jpg')
        if utils.is_debugging():
            file_names = file_names[:64]
    self.transform = transform
    self.num = len(file_names)
    self.file_names = file_names
    self.train_data = train_data
    self.has_label = has_label
    self.images = []
    # Pre-read all images into memory to avoid disk I/O during training.
    print('Pre-reading images from files.')
    for file_name in tqdm.tqdm(file_names):
        self.images.append(pil_load(file_name))
    print('Loaded %d images.' % len(self.images))
def start_dashboard(self):
    if utils.is_debugging():
        # TODO: Deal with the plot UI not being in the main thread somehow
        # (move to browser?)
        log.warning('Dashboard not supported in debug mode')
        return
    q = Queue(maxsize=10)
    p = Process(target=dashboard_fn, args=(q,))
    p.start()
    self.dashboard_process = p
    self.dashboard_queue = q
def close(self):
    """Close the event file and log it, together with the model files, to MLFlow."""
    super().close()
    if not utils.is_debugging():
        files = os.listdir(self.log_dir)
        event_file = [f for f in files if f.startswith('events')][0]
        model_files = [f for f in files if f.endswith('.pth')]
        mlflow.log_artifact(os.path.join(self.log_dir, event_file),
                            artifact_path='events')
        for m in model_files:
            mlflow.log_artifact(os.path.join(self.log_dir, m),
                                artifact_path='models')
def write_results(self, results, time_step, scalar_tab=''):
    """
    Write a dictionary of results to TensorBoard and MLFlow.

    This convenience function takes a dictionary that specifies several
    different summaries to be written. Each summary is passed to the matching
    add_* function of the summary writer. Scalars are logged to MLFlow, too.

    Example:
        {'scalars': {'metric1': 0.1, 'metric2': 0.5},
         'images': {'img1': img_tensor}}

    :param results: dictionary of summaries
    :param time_step: time step to log for
    :param scalar_tab: prefix that selects the tab in the TensorBoard scalar overview
    """
    if not scalar_tab.endswith('/'):
        scalar_tab += '/'
    if 'scalars' in results:
        for tag, scalar in results['scalars'].items():
            self.add_scalar(tag=scalar_tab + tag, scalar_value=scalar,
                            global_step=time_step)
            if not utils.is_debugging():
                mlflow_tag = (scalar_tab + tag).replace('/', '_')
                mlflow.log_metric(mlflow_tag, scalar, time_step)
    if 'images' in results:
        for tag, image in results['images'].items():
            if image.dim() == 2:
                formats = 'HW'
            elif image.dim() == 3:
                formats = 'CHW' if image.shape[0] in [1, 3] else 'HWC'
            elif image.dim() == 4:
                formats = 'NCHW' if image.shape[1] in [1, 3] else 'NHWC'
            else:
                raise ValueError('Unknown image format with shape %s'
                                 % str(image.shape))
            self.add_images(tag, image, time_step, dataformats=formats)
    if 'series' in results:
        for tag, series in results['series'].items():
            plot = self._plot_series(tag, series)
            self.add_figure(tag, plot, time_step, close=True)
    if 'embeddings' in results:
        for tag, embedding in results['embeddings'].items():
            self.add_embedding(**embedding, tag=tag, global_step=time_step)
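# Illustrative call of write_results, mirroring the docstring's example. The
# tensor shape, the 'eval' tab name and the variables writer/epoch are
# assumptions for this sketch.
#
#     results = {
#         'scalars': {'loss': 0.42, 'accuracy': 0.91},
#         'images': {'reconstruction': torch.rand(8, 3, 64, 64)},  # NCHW
#     }
#     writer.write_results(results, time_step=epoch, scalar_tab='eval')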
def save(self, obj, name, tag):
    """
    Save a model to file.

    The model is saved to the log directory as '<name>_<tag>.pth'.

    :param obj: model to save
    :param name: name of the save file
    :param tag: tag appended to the file name
    """
    if utils.is_debugging():
        warnings.warn(
            'Debugging mode: will save model checkpoint to temporary dir.',
            UserWarning)
    file_name = os.path.join(self.log_dir, name + '_' + tag + '.pth')
    torch.save(obj, file_name)
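# Example checkpoint call for save() above; 'writer' (the summary-writer
# instance) and 'model' (any torch.nn.Module) are placeholders.
#
#     writer.save(model.state_dict(), name='resnet', tag='best')
#     # -> <log_dir>/resnet_best.pth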
def add_data(df_values, dir_path, label_threshold=None):
    data_set = []
    count = 0
    for line in tqdm.tqdm(df_values):
        image_name, invasive = line
        image_path = os.path.join(dir_path, str(int(image_name)) + '.jpg')
        if label_threshold is not None:
            # Binarize the label against the threshold.
            label = 1.0 if invasive >= label_threshold else 0.0
        else:
            # Keep the raw label.
            label = invasive
        image_data = ImageData(image_path, np.float32(label))
        image_data.image = pil_load(image_data.path)
        data_set.append(image_data)
        count += 1
        if utils.is_debugging() and count == 20:
            print('Stopping image pre-reads early for debugging purposes.')
            break
    return data_set
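# Sketch of the label_threshold behaviour above: with a threshold, soft labels
# are binarized; without one they are kept as-is. The DataFrame and directory
# are placeholders.
#
#     df = pd.read_csv('train_labels.csv')
#     binary_set = add_data(df.values, settings.TRAIN_DIR, label_threshold=0.5)
#     raw_set = add_data(df.values, settings.TRAIN_DIR)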
def run(config, device, epochs, replications, seed, num_data_workers):
    """
    Run an experiment with the given config.

    An MLFlow experiment will be set according to the name in the config. A
    BaseTask will be built and its train function called. Each call of the run
    function with the same config is one run of this experiment. If
    replications is set to a number greater than zero, a nested run is created
    and the task is executed that many times. When debugging, nothing is
    written to disk to avoid cluttering the results directory.

    :param config: path to the config JSON file or config dict
    :param device: device to train on
    :param epochs: epochs to train for
    :param replications: number of times to replicate this run
    :param seed: random seed to use
    :param num_data_workers: number of worker threads for data loading
    """
    # Set seed for randomization
    if seed is not None:
        # Make PyTorch and numpy deterministic
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        print('Fixed randomization. Seed %d' % seed)
        print('#' * 40)
    else:
        # Draw a default seed, as none was set
        seed = np.random.randint(np.iinfo(np.int32).max)
        torch.manual_seed(seed)
        np.random.seed(seed)

    # Load config JSON
    if isinstance(config, str):
        print('Run experiment from %s' % config)
        print('#' * 40)
        config = utils.read_config(config)
    elif isinstance(config, dict):
        print('Run experiment with dict named %s' % config['name'])
        print('#' * 40)
    else:
        raise ValueError(
            'Config has to be either a string path or a dict, but is %s.'
            % str(type(config)))

    # Extract config dicts for components
    name = config['name']
    dataset = config['dataset']
    model = config['model']
    trainer = config['trainer']
    metrics = config['metrics']

    # Setup mlflow experiment
    if utils.is_debugging():
        # Reroute mlflow to a tmp file on debugging
        warnings.warn(
            'Debugging mode: MLFlow stuff will be saved to temporary dir.',
            UserWarning)
        mlflow.set_tracking_uri('file:' + utils.build_tmp_dir())
    else:
        script_path = os.path.dirname(__file__)
        root_path = os.path.dirname(script_path)
        mlflow.set_tracking_uri('file:' + root_path)
    mlflow.set_experiment(name)

    # Start the top level run
    nest_runs = replications > 0
    with mlflow.start_run(nested=nest_runs):
        # Log parameters to run
        utils.log_config(config)
        mlflow.log_param('max_epochs', epochs)
        mlflow.log_param('seed', seed)
        mlflow.set_tag('device', device)

        if nest_runs:
            # Open child runs for each replication
            mlflow.log_param('replications', replications)
            seeds = np.random.randint(np.iinfo(np.int32).max,
                                      size=replications)
            for i, s in enumerate(seeds):
                print('Run replication %d/%d...' % (i + 1, replications))
                with mlflow.start_run(nested=True):
                    # Log params to child runs
                    utils.log_config(config)
                    mlflow.set_tag('replication', i)
                    # Set derived seed for child runs to make each reproducible
                    mlflow.log_param('seed', s)
                    torch.manual_seed(s)
                    np.random.seed(s)
                    # Execute run
                    task = BaseTask(name, device, dataset, model, trainer,
                                    metrics)
                    task.train(epochs, num_data_workers)
        else:
            # Simply execute the top level run when replications is zero
            task = BaseTask(name, device, dataset, model, trainer, metrics)
            task.train(epochs, num_data_workers)
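# Two sketched invocations of run(); the config path and dict are placeholders
# and the config is assumed to contain the keys used above
# (name, dataset, model, trainer, metrics).
#
#     run('configs/baseline.json', device='cuda:0', epochs=50,
#         replications=0, seed=42, num_data_workers=4)
#
#     # Replicate the run three times as nested MLFlow child runs:
#     run(config_dict, device='cpu', epochs=10, replications=3, seed=None,
#         num_data_workers=2)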