class BirdsBasicTrainerCV:
    '''
    Cross-validating trainer for bird-song spectrogram classifiers.

    Reads all path and training parameters from a NeuralNetConfig
    instance (or config file), trains a model across ``num_folds``
    cross-validation splits, and reports progress and final results
    to tensorboard, plus (optionally) raw predictions/labels to .csv.
    '''

    # Number of intermediate models to save
    # during training:
    MODEL_ARCHIVE_SIZE = 20

    # For some tensorboard displays:
    # for how many epochs in the past
    # to display data:
    DISPLAY_HISTORY_LEN = 10

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self, config_info, device=0, percentage=None, debugging=False):
        '''
        :param config_info: all path and training parameters
        :type config_info: NeuralNetConfig
        :param debugging: output lots of debug info
        :type debugging: bool
        :param device: number of GPU to use; default is dev 0
            if any GPU is available
        :type device: {None | int}
        :param percentage: percentage of training data to use
        :type percentage: {int | float}
        '''
        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        if percentage is not None:
            # Integrity check:
            if type(percentage) not in [int, float]:
                raise TypeError(
                    f"Percentage must be int or float, not {type(percentage)}")
            if percentage < 1 or percentage > 100:
                raise ValueError(
                    f"Percentage must be between 1 and 100, not {percentage}")

        # Bug fix: the original called torch.cuda.set_device(0)
        # unconditionally when device was None, which crashes on
        # CPU-only machines. Treat None as device 0, then apply
        # the same availability checks used for an explicit device:
        if device is None:
            device = 0
        available_gpus = torch.cuda.device_count()
        if available_gpus == 0:
            self.log.info("No GPU available; running on CPU")
        else:
            if device > available_gpus - 1:
                raise ValueError(
                    f"Asked to operate on device {device}, but only {available_gpus} are available"
                )
            torch.cuda.set_device(device)

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        # Training hyperparameters from the config:
        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.num_folds = self.config.Training.getint('num_folds')
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network {self.net_name}")
        self.log.info(f"pretrained {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze {self.freeze}")
        self.log.info(f"min epochs {self.min_epochs}")
        self.log.info(f"max epochs {self.max_epochs}")
        self.log.info(f"batch_size {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.device = self.fastest_device
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.initialize_model()

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)
        self.train_loader = self.get_dataloader(sample_width,
                                                sample_height,
                                                perc_data_to_use=percentage)
        self.log.info(f"Expecting {len(self.train_loader)} batches per epoch")
        num_train_samples = len(self.train_loader.dataset)
        num_classes = len(self.train_loader.dataset.class_names())
        self.log.info(
            f"Training set contains {num_train_samples} samples across {num_classes} classes"
        )
        self.class_names = self.train_loader.dataset.class_names()

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')
        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Num of train examples
        )

        # All ResultTally instances are
        # collected here: (num_folds * num-epochs)
        # each for training and validation steps.
        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_step = self.train()
            self.visualize_final_epoch_results(final_step)
        finally:
            self.close_tensorboard()

    #------------------------------------
    # train
    #-------------------

    def train(self):
        '''
        Run the full cross-validated training: for each of
        num_folds splits, (re)initialize the model and train
        for up to max_epochs epochs, validating and reporting
        to tensorboard at the end of each split's epoch.

        :return: the final epoch number
        :rtype: int
        '''
        overall_start_time = datetime.datetime.now()

        # Just for sanity: keep track
        # of number of batches...
        total_batch_num = 0

        # Note: since we are cross validating, the
        # data loader's set_epoch() method is only
        # called once (automatically) during instantiation
        # of the associated sampler. Moving from split
        # to split includes shuffling if the caller
        # specified that.

        # Training
        for split_num in range(self.train_loader.num_folds):
            split_start_time = datetime.datetime.now()
            self.initialize_model()
            for epoch in range(self.max_epochs):

                # Set model to train mode:
                self.model.train()

                epoch_start_time = datetime.datetime.now()
                self.log.info(f"Starting epoch {epoch} training")

                # Sanity check record: will record
                # how many samples from each class were
                # used:
                self.class_coverage = {}

                # Sanity records: will record number
                # of samples of each class that are used
                # during training and validation:
                label_distrib = {}
                batch_num = 0

                self.log.info(
                    f"Train epoch {epoch}/{self.max_epochs} split {split_num}/{self.train_loader.num_folds}"
                )
                try:
                    for batch, targets in self.train_loader:
                        # Update the sanity check
                        # num of batches seen, and distribution
                        # of samples across classes:
                        batch_num += 1
                        total_batch_num += 1

                        # Update sanity check records:
                        for lbl in targets:
                            lbl = int(lbl)
                            try:
                                label_distrib[lbl] += 1
                            except KeyError:
                                label_distrib[lbl] = 1
                            try:
                                self.class_coverage[lbl]['train'] += 1
                            except KeyError:
                                self.class_coverage[lbl] = {
                                    'train': 1,
                                    'val': 0
                                }

                        self.log.debug(
                            f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')

                        outputs = self.model(images)
                        loss = self.loss_fn(outputs, labels)
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

                        # Remember the last batch's train result of this
                        # split (results for earlier batches of
                        # the same split will be overwritten). This statement
                        # must sit before deleting output and labels:
                        step_num = self.step_number(epoch, split_num,
                                                    self.num_folds)
                        self.remember_results(LearningPhase.TRAINING,
                                              step_num, outputs, labels, loss)

                        self.log.debug(
                            f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(images, 'cpu')
                        outputs = FileUtils.to_device(outputs, 'cpu')
                        labels = FileUtils.to_device(labels, 'cpu')
                        loss = FileUtils.to_device(loss, 'cpu')

                        del images
                        del outputs
                        del labels
                        del loss
                        torch.cuda.empty_cache()

                        self.log.debug(
                            f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )
                except EndOfSplit:
                    end_time = datetime.datetime.now()
                    train_time_duration = end_time - epoch_start_time
                    # A human readable duration, down to minutes:
                    duration_str = FileUtils.time_delta_str(
                        train_time_duration, granularity=4)
                    self.log.info(
                        f"Done training epoch {epoch} of split {split_num} (duration: {duration_str})"
                    )

                    #***********
                    #print(f"****** num_batches in split: {batch_num}" )
                    #print(f"****** LblDist: {label_distrib}")
                    #***********

                    # NOTE(review): step_num is unbound here if the
                    # split yielded zero batches before EndOfSplit --
                    # presumably the loader guarantees at least one
                    # batch per split; verify against
                    # CrossValidatingDataLoader.
                    self.validate_split(step_num)
                    self.visualize_step(step_num)
                    # Save model, keeping self.model_archive_size models:
                    self.model_archive.save_model(self.model, epoch)

                    self.log.debug(
                        f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                    # Next Epoch
                    continue

            end_time = datetime.datetime.now()
            train_time_duration = end_time - split_start_time
            # A human readable duration, down to minutes:
            duration_str = FileUtils.time_delta_str(train_time_duration,
                                                    granularity=4)
            self.log.info(
                f"Done training split {split_num} (duration: {duration_str})")

            # Next split
            continue

        end_time = datetime.datetime.now()
        epoch_duration = end_time - epoch_start_time
        epoch_dur_str = FileUtils.time_delta_str(epoch_duration,
                                                 granularity=4)

        cumulative_dur = end_time - overall_start_time
        cum_dur_str = FileUtils.time_delta_str(cumulative_dur, granularity=4)

        msg = f"Done epoch {epoch} (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
        self.log.info(msg)

        #******self.scheduler.step()

        # Fresh results tallying
        #self.results.clear()

        self.log.info(
            f"Training complete after {self.train_loader.num_folds} splits")

        # Report the sanity checks.
        # NOTE(review): self.class_coverage is reset at the top of
        # every epoch, so this report only covers the final epoch
        # of the final split.
        self.log.info(f"Total batches processed: {total_batch_num}")
        for cid in self.class_coverage.keys():
            # Bug fix: the original unpacked .items() into
            # (train_use, val_use), which yields the key/value
            # *tuples* ('train', n) and ('val', n) rather than
            # the counts themselves:
            train_use = self.class_coverage[cid]['train']
            val_use = self.class_coverage[cid]['val']
            self.log.info(
                f"{self.class_names[cid]} Training: {train_use}, Validation: {val_use}"
            )

        # All seems to have gone well. Report the
        # overall result of the final epoch for the
        # hparms config used in this process:
        self.report_hparams_summary(self.latest_result)

        # The final epoch number:
        return epoch

    #------------------------------------
    # validate_split
    #-------------------

    def validate_split(self, step):
        '''
        Validate one split, using that split's validation
        fold. Return time taken. Record results for tensorboard
        and other record keeping.

        :param step: current combination of epoch and split
        :type step: int
        :return: number of epoch seconds needed for the validation
        :rtype: int
        '''
        # Validation
        self.log.debug(
            f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        start_time = datetime.datetime.now()
        self.log.info(f"Starting validation for step {step}")

        self.model.eval()
        with torch.no_grad():
            for img_tensor, target in self.train_loader.validation_samples():
                # Samples arrive singly; add a batch dimension:
                expanded_img_tensor = unsqueeze(img_tensor, dim=0)
                expanded_target = unsqueeze(target, dim=0)

                # Update sanity record:
                self.class_coverage[int(target)]['val'] += 1

                images = FileUtils.to_device(expanded_img_tensor, 'gpu')
                label = FileUtils.to_device(expanded_target, 'gpu')

                outputs = self.model(images)
                loss = self.loss_fn(outputs, label)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                label = FileUtils.to_device(label, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                self.remember_results(LearningPhase.VALIDATING, step, outputs,
                                      label, loss)
                del images
                del outputs
                del label
                del loss
                torch.cuda.empty_cache()

        end_time = datetime.datetime.now()
        val_time_duration = end_time - start_time
        # A human readable duration, down to minutes:
        duration_str = FileUtils.time_delta_str(val_time_duration,
                                                granularity=4)
        self.log.info(f"Done validation (duration: {duration_str})")

        return val_time_duration

    # ------------- Utils -----------

    #------------------------------------
    # report_acc_loss
    #-------------------

    def report_acc_loss(self, phase, epoch, accumulated_loss):
        '''
        Write one accumulated-loss scalar for the given phase
        and epoch to tensorboard.
        '''
        self.writer.add_scalar(f"loss/{phase}", accumulated_loss, epoch)

    #------------------------------------
    # remember_results
    #-------------------

    def remember_results(
        self,
        phase,
        step,
        outputs,
        labels,
        loss,
    ):
        '''
        Wrap one step's outputs/labels/loss in a ResultTally and
        record it both in the per-step collection self.results
        (overwriting any earlier tally for the same step/phase)
        and in the session-wide self.step_results.
        '''
        # Add the results
        tally = ResultTally(step, phase, outputs, labels, loss,
                            self.num_classes, self.batch_size)
        # Add result to intermediate results collection of
        # tallies:
        self.results[step] = tally

        # Same with the session-wide
        # collection:
        self.step_results.add(tally)

    #------------------------------------
    # visualize_step
    #-------------------

    def visualize_step(self, step):
        '''
        Take the ResultTally instances in the train and val
        ResultCollections in self.results, and report appropriate
        aggregates to tensorboard. Computes f1 scores, accuracies,
        etc. for given step.

        Separately for train and validation results: build one
        long array of predictions, and a corresponding array of
        labels. Also, average the loss across all instances.
        The preds and labels as rows to csv files.
        '''
        val_tally = self.results[(step, str(LearningPhase.VALIDATING))]
        train_tally = self.results[(step, str(LearningPhase.TRAINING))]

        result_coll = ResultCollection()
        result_coll.add(val_tally, step)
        result_coll.add(train_tally, step)

        self.latest_result = {'train': train_tally, 'val': val_tally}

        # If we are to write preds and labels to
        # .csv for later additional processing:
        if self.csv_writer is not None:
            self.csv_writer.writerow([
                step, train_tally.preds, train_tally.labels, val_tally.preds,
                val_tally.labels
            ])

        TensorBoardPlotter.visualize_step(
            result_coll, self.writer,
            [LearningPhase.TRAINING, LearningPhase.VALIDATING], step,
            self.class_names)

        # History of learning rate adjustments:
        lr_this_step = self.optimizer.param_groups[0]['lr']
        self.writer.add_scalar('learning_rate',
                               lr_this_step,
                               global_step=step)

    #------------------------------------
    # visualize_final_epoch_results
    #-------------------

    def visualize_final_epoch_results(self, epoch):
        '''
        Reports to tensorboard just for the
        final epoch.

        Expect self.latest_result to be the latest
        ResultTally.
        '''
        # DISPLAY_HISTORY_LEN holds the number
        # of historic epochs we will show. Two
        # results per epochs --> need
        # 2*DISPLAY_HISTORY_LEN results. But check
        # that there are that many, and show fewer
        # if needed:
        num_res_to_show = min(len(self.step_results),
                              2 * self.DISPLAY_HISTORY_LEN)

        f1_hist = self.step_results[-num_res_to_show:]

        # First: the table of train and val f1-macro
        # scores for the past few epochs:
        #
        #      |phase|ep0  |ep1 |ep2 |
        #      |-----|-----|----|----|
        #      |train| f1_0|f1_1|f1_2|
        #      |  val| f1_0|f1_1|f1_2|

        f1_macro_tbl = TensorBoardPlotter.make_f1_train_val_table(f1_hist)
        self.writer.add_text('f1/history', f1_macro_tbl)

        # Now, in the same tensorboard row: the
        # per_class train/val f1 scores for each
        # class separately:
        #
        # |class|weighted mean f1 train|weighted mean f1 val|
        # |-----|----------------------|--------------------|
        # |  c1 |0.1                   |0.6                 |
        # |  c2 |0.1                   |0.6                 |
        # |  c3 |0.1                   |0.6                 |
        # ------|----------------------|--------------------|

        f1_all_classes = TensorBoardPlotter.make_all_classes_f1_table(
            self.latest_result, self.class_names)
        self.writer.add_text('f1/per-class', f1_all_classes)

    #------------------------------------
    # report_hparams_summary
    #-------------------

    def report_hparams_summary(self, latest_result):
        '''
        Called at the end of training. Constructs
        a summary to report for the hyperparameters
        used in this process. Reports to the tensorboard.

        Hyperparameters reported:

           o lr
           o optimizer
           o batch_size
           o kernel_size

        Included in the measures are:

           o balanced_accuracy      (train and val)
           o mean_accuracy_train    (train and val)
           o epoch_prec_weighted
           o epoch_recall_weighted
           o epoch_mean_loss        (train and val)

        :param latest_result: dict with keys 'train' and
            'val', holding the respective most recent
            (i.e. last-epoch) ResultTally
        :type latest_result: {'train' : ResultTally,
                              'val'   : ResultTally}
        '''
        # Get the latest validation tally:
        train_tally = latest_result['train']
        val_tally = latest_result['val']

        hparms_vals = OrderedDict({
            'net': self.net_name,
            'pretrained': f"{self.pretrained}",
            'lr_initial': self.config.Training.lr,
            'optimizer': self.config.Training.opt_name,
            'batch_size': self.config.getint('Training', 'batch_size'),
            'kernel_size': self.config.getint('Training', 'kernel_size'),
            'to_grayscale': self.to_grayscale
        })

        metric_results = {
            'zz_balanced_adj_acc_train': train_tally.balanced_acc,
            'zz_balanced_adj_acc_val': val_tally.balanced_acc,
            'zz_acc_train': train_tally.accuracy,
            'zz_acc_val': val_tally.accuracy,
            'zz_epoch_weighted_prec': val_tally.prec_weighted,
            'zz_epoch_weighted_recall': val_tally.recall_weighted,
            'zz_epoch_mean_loss_train': train_tally.mean_loss,
            'zz_epoch_mean_loss_val': val_tally.mean_loss
        }

        self.writer.add_hparams(hparms_vals, metric_results)

    #------------------------------------
    # get_dataloader
    #-------------------

    def get_dataloader(self, sample_width, sample_height,
                       perc_data_to_use=None):
        '''
        Returns a cross validating dataloader.

        If perc_data_to_use is None, all samples under
        self.root_train_test_data will be used for training.
        Else percentage indicates the percentage of those
        samples to use. The selection is random.

        :param sample_width: pixel width of returned images
        :type sample_width: int
        :param sample_height: pixel height of returned images
        :type sample_height: int
        :param perc_data_to_use: amount of available training
            data to use.
        :type perc_data_to_use: {None | int | float}
        :return: a data loader that serves batches of
            images and their assiated labels
        :rtype: CrossValidatingDataLoader
        '''
        data_root = self.root_train_test_data

        # Bug fix: was hard-coded to_grayscale=True, silently
        # ignoring the Training.to_grayscale config option that
        # __init__ stores in self.to_grayscale:
        train_dataset = SingleRootImageDataset(
            data_root,
            sample_width=sample_width,
            sample_height=sample_height,
            percentage=perc_data_to_use,
            to_grayscale=self.to_grayscale)

        sampler = SKFSampler(train_dataset,
                             num_folds=self.num_folds,
                             seed=42,
                             shuffle=True,
                             drop_last=True)

        train_loader = CrossValidatingDataLoader(train_dataset,
                                                 batch_size=self.batch_size,
                                                 shuffle=True,
                                                 drop_last=True,
                                                 sampler=sampler,
                                                 num_folds=self.num_folds)
        return train_loader

    #------------------------------------
    # initialize_model
    #-------------------

    def initialize_model(self):
        '''
        (Re)create self.model from the configured network name,
        push it to the GPU if available, and build the matching
        optimizer, loss function, and LR scheduler.
        '''
        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Just after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model,
                                            self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)

    #------------------------------------
    # find_num_classes
    #-------------------

    def find_num_classes(self, data_root):
        '''
        Expect two subdirectories under data_root:
        train and validation. Underneath each are
        further subdirectories whose names are the
        classes:

                train               validation
        class1 class2 class3     class1 class2 class3
          imgs   imgs   imgs       imgs   imgs   imgs

        No error checking to confirm this structure

        :param data_root: path to parent of train/validation
        :type data_root: str
        :return: number of unique classes as obtained
            from the directory names
        :rtype: int
        '''
        self.classes = FileUtils.find_class_names(data_root)
        return len(self.classes)

    #------------------------------------
    # setup_tensorboard
    #-------------------

    def setup_tensorboard(self, logdir, raw_data_dir=True):
        '''
        Initialize tensorboard. To easily compare experiments,
        use runs/exp1, runs/exp2, etc.

        Method creates the dir if needed.

        Additionally, sets self.csv_pred_writer and self.csv_label_writer
        to None, or open CSV writers, depending on the value of raw_data_dir,
        see create_csv_writer()

        :param logdir: root for tensorboard events
        :type logdir: str
        '''
        if not os.path.isdir(logdir):
            os.makedirs(logdir)

        # For storing train/val preds/labels
        # for every epoch. Used to create charts
        # after run is finished:
        self.csv_writer = self.create_csv_writer(raw_data_dir)

        # Place to store intermediate models:
        self.model_archive = \
            self.create_model_archive(self.config,
                                      self.num_classes
                                      )

        # Use SummaryWriterPlus to avoid confusing
        # directory creations when calling add_hparams()
        # on the writer:
        self.writer = SummaryWriterPlus(log_dir=logdir)

        # Intermediate storage for train and val results:
        self.results = ResultCollection()

        self.log.info(
            f"To view tensorboard charts: in shell: tensorboard --logdir {logdir}; then browser: localhost:6006"
        )

    #------------------------------------
    # create_csv_writer
    #-------------------

    def create_csv_writer(self, raw_data_dir):
        '''
        Create a csv_writer that will fill a csv
        file during training/validation as follows:

            epoch  train_preds   train_labels  val_preds  val_labels

        Cols after the integer 'epoch' col will each be
        an array of ints:

                  train_preds    train_lbls   val_preds  val_lbls
            2,   "[2,5,1,2,3]","[2,6,1,2,1]","[1,2]",    "[1,3]"

        If raw_data_dir is provided as a str, it is taken
        as the directory where csv file with predictions and
        labels are to be written. The dir is created if necessary.

        If the arg is instead set to True, a dir 'runs_raw_results' is
        created under this script's directory if it does not
        exist. Then a subdirectory is created for this run,
        using the hparam settings to build a file name. The dir
        is created if needed. Result ex.:

              <script_dir>
                   runs_raw_results
                       Run_lr_0.001_br_32
                           run_2021_05_ ... _lr_0.001_br_32.csv

        Then file name is created, again from the run
        hparam settings. If this file exists, user is asked whether
        to remove or append. The inst var self.csv_writer is
        initialized to:

           o None if csv file exists, but is not to
             be overwritten nor appended-to
           o A filed descriptor for a file open for either
             'write' or 'append.

        :param raw_data_dir: If simply True, create dir and file names
            from hparams, and create as needed. If a string, it is
            assumed to be the directory where a .csv file is to be
            created. If None, self.csv_writer is set to None.
        :type raw_data_dir: {None | True | str|
        :return: CSV writer ready for action. Set either to
            write a fresh file, or append to an existing file.
            Unless file exists, and user decided not to overwrite
        :rtype: {None | csv.writer}
        '''
        # Ensure the csv file root dir exists if
        # we'll do a csv dir and run-file below it:
        if type(raw_data_dir) == str:
            raw_data_root = raw_data_dir
        else:
            raw_data_root = os.path.join(self.curr_dir, 'runs_raw_results')

        if not os.path.exists(raw_data_root):
            os.mkdir(raw_data_root)

        # Can rely on raw_data_root being defined and existing:
        if raw_data_dir is None:
            return None

        # Create both a raw dir sub-directory and a .csv file
        # for this run.
        # Bug fix: the subdirectory was created relative to the
        # current working directory (and crashed when it already
        # existed), while the .csv landed directly in raw_data_root,
        # contradicting the layout documented above. Place the run
        # subdir under raw_data_root and the .csv inside it:
        csv_subdir_name = FileUtils.construct_filename(self.config.Training,
                                                       prefix='Run',
                                                       incl_date=True)
        csv_subdir_path = os.path.join(raw_data_root, csv_subdir_name)
        os.makedirs(csv_subdir_path, exist_ok=True)

        # Create a csv file name:
        csv_file_nm = FileUtils.construct_filename(self.config.Training,
                                                   prefix='run',
                                                   suffix='.csv',
                                                   incl_date=True)
        csv_path = os.path.join(csv_subdir_path, csv_file_nm)

        # Get csv_raw_fd appropriately:
        if os.path.exists(csv_path):
            do_overwrite = FileUtils.user_confirm(
                f"File {csv_path} exists; overwrite?", default='N')
            if not do_overwrite:
                do_append = FileUtils.user_confirm("Append instead?",
                                                   default='N')
                if not do_append:
                    return None
                else:
                    mode = 'a'
        else:
            mode = 'w'

        csv_writer = CSVWriterCloseable(csv_path, mode=mode, delimiter=',')

        header = [
            'epoch', 'train_preds', 'train_labels', 'val_preds', 'val_labels'
        ]
        csv_writer.writerow(header)

        return csv_writer

    #------------------------------------
    # create_model_archive
    #-------------------

    def create_model_archive(self, config, num_classes):
        '''
        Creates facility for saving partially trained
        models along the way.

        :param config: the run's configuration structure
        :type config: NeuralNetConfig
        :param num_classes: number of target classes
        :type num_classes: int
        :return: ModelArchive instance ready
            for calls to save_model()
        :rtype: ModelArchive
        '''
        model_archive = ModelArchive(config,
                                     num_classes,
                                     history_len=self.MODEL_ARCHIVE_SIZE,
                                     log=self.log)
        return model_archive

    #------------------------------------
    # close_tensorboard
    #-------------------

    def close_tensorboard(self):
        '''
        Close the csv writer (if any) and the tensorboard
        writer. Safe to call even if setup failed part-way.
        '''
        if self.csv_writer is not None:
            try:
                self.csv_writer.close()
            except Exception as e:
                self.log.warn(f"Could not close csv file: {repr(e)}")
        try:
            self.writer.close()
        except AttributeError:
            self.log.warn(
                "Method close_tensorboard() called before setup_tensorboard()?"
            )
        except Exception as e:
            raise RuntimeError(
                f"Problem closing tensorboard: {repr(e)}") from e

    #------------------------------------
    # get_optimizer
    #-------------------

    def get_optimizer(self, optimizer_name, model, lr):
        '''
        Construct one of the supported optimizers (Adam, SGD,
        RMSprop; case-insensitive) over the model's parameters.

        :raises ValueError: for an unsupported optimizer name
        '''
        optimizer_name = optimizer_name.lower()
        if optimizer_name == 'adam':
            optimizer = optim.Adam(model.parameters(),
                                   lr=lr,
                                   eps=1e-3,
                                   amsgrad=True)
            return optimizer

        if optimizer_name == 'sgd':
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
            return optimizer

        if optimizer_name == 'rmsprop':
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=lr,
                                      momentum=0.9)
            return optimizer

        raise ValueError(f"Optimizer {optimizer_name} not supported")

    #------------------------------------
    # initialize_config_struct
    #-------------------

    def initialize_config_struct(self, config_info):
        '''
        Initialize a config dict of dict with
        the application's configurations. Sections
        will be:

          config['Paths']       -> dict[attr : val]
          config['Training']    -> dict[attr : val]
          config['Parallelism'] -> dict[attr : val]

        The config read method will handle config_info
        being None.

        If config_info is a string, it is assumed either
        to be a file containing the configuration, or
        a JSON string that defines the config.

        Else config_info is assumed to be a NeuralNetConfig.
        The latter is relevant only if using this file's
        class as a library, rather than a command line tool.

        If given a NeuralNetConfig instance, it is returned
        unchanged.

        :param config_info: the information needed to construct
            the structure
        :type config_info: {NeuralNetConfig | str}
        :return a NeuralNetConfig instance with all parms
            initialized
        :rtype NeuralNetConfig
        '''
        if isinstance(config_info, str):
            # Is it a JSON str? Should have a better test!
            if config_info.startswith('{'):
                # JSON String:
                config = NeuralNetConfig.from_json(config_info)
            else:
                config = self.read_configuration(config_info)
        elif isinstance(config_info, NeuralNetConfig):
            config = config_info
        else:
            msg = f"Error: must have a config file, not {config_info}. See config.cfg.Example in project root"
            # Since logdir may be in config, need to use print here:
            print(msg)
            raise ConfigError(msg)

        return config

    #------------------------------------
    # read_configuration
    #-------------------

    def read_configuration(self, conf_file):
        '''
        Parses config file that describes training parameters,
        various file paths, and how many GPUs different machines have.
        Syntax follows Python's configfile package, which includes
        sections, and attr/val pairs in each section.

        Expected sections:

           o Paths: various file paths for the application
           o Training: holds batch sizes, number of epochs, etc.
           o Parallelism: holds number of GPUs on different machines

        For Parallelism, expect entries like:

           foo.bar.com  = 4
           127.0.0.1    = 5
           localhost    = 3
           172.12.145.1 = 6

        Method identifies which of the entries is
        'localhost' by comparing against local hostname.
        Though 'localhost' or '127.0.0.1' may be provided.

        Returns a dict of dicts:
            config[section-names][attr-names-within-section]

        Types of standard entries, such as epochs, batch_size,
        etc. are coerced, so that, e.g. config['Training']['epochs']
        will be an int. Clients may add non-standard entries.
        For those the client must convert values from string
        (the type in which values are stored by default) to the
        required type. This can be done the usual way: int(...),
        or using one of the configparser's retrieval methods
        getboolean(), getint(), and getfloat():

            config['Training'].getfloat('learning_rate')

        :param conf_file: path to configuration file
        :type conf_file: str
        :return: a dict of dicts mirroring the config file sections/entries
        :rtype: dict[dict]
        :raises ValueError
        :raises TypeError
        '''
        if conf_file is None:
            return self.init_defaults()

        config = DottableConfigParser(conf_file)

        if len(config.sections()) == 0:
            # Config file exists, but empty.
            # NOTE(review): init_defaults() is called without an
            # argument above but with `config` here -- verify the
            # intended signature of init_defaults().
            return (self.init_defaults(config))

        # Do type conversion also in other entries that
        # are standard:
        types = {
            'epochs': int,
            'batch_size': int,
            'kernel_size': int,
            'sample_width': int,
            'sample_height': int,
            'seed': int,
            'pytorch_comm_port': int,
            'num_pretrained_layers': int,
            'root_train_test_data': str,
            'net_name': str,
        }
        for section in config.sections():
            for attr_name in config[section].keys():
                try:
                    str_val = config[section][attr_name]
                    required_type = types[attr_name]
                    config[section][attr_name] = required_type(str_val)
                except KeyError:
                    # Current attribute is not standard;
                    # users of the corresponding value need
                    # to do their own type conversion when
                    # accessing this configuration entry:
                    continue
                except TypeError:
                    raise ValueError(
                        f"Config file error: {section}.{attr_name} should be convertible to {required_type}"
                    )

        return config

    #------------------------------------
    # set_seed
    #-------------------

    def set_seed(self, seed):
        '''
        Set the seed across all different necessary platforms
        to allow for comparison of different models
        and runs

        :param seed: random seed to set for all random num generators
        :type seed: int
        '''
        torch.manual_seed(seed)
        cuda.manual_seed_all(seed)
        # Make cudnn deterministic (at some speed cost):
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)

    #------------------------------------
    # time_delta_str
    #-------------------

    def time_delta_str(self, epoch_delta, granularity=2):
        '''
        Takes the difference between two datetime times:

               start_time = datetime.datetime.now()
               <some time elapses>
               end_time = datetime.datetime.now()

               delta = end_time - start_time
               time_delta_str(delta)

        Depending on granularity, returns a string like:

            Granularity:
                      1  '160.0 weeks'
                      2  '160.0 weeks, 4.0 days'
                      3  '160.0 weeks, 4.0 days, 6.0 hours'
                      4  '160.0 weeks, 4.0 days, 6.0 hours, 42.0 minutes'
                      5  '160.0 weeks, 4.0 days, 6.0 hours, 42.0 minutes, 13.0 seconds'

        For smaller time deltas, such as 10 seconds,
        does not include leading zero times. For any
        granularity:

                      '10.0 seconds'

        If duration is less than second, returns '< 1sec>'

        :param epoch_delta: time difference to convert
        :type epoch_delta: datetime.timedelta
        :param granularity: number of units to include
        :type granularity: int
        '''
        intervals = (
            ('weeks', 604800),  # 60 * 60 * 24 * 7
            ('days', 86400),  # 60 * 60 * 24
            ('hours', 3600),  # 60 * 60
            ('minutes', 60),
            ('seconds', 1),
        )
        secs = epoch_delta.total_seconds()
        result = []
        for name, count in intervals:
            value = secs // count
            if value:
                secs -= value * count
                if value == 1:
                    # Singular unit name:
                    name = name.rstrip('s')
                result.append("{} {}".format(value, name))
        dur_str = ', '.join(result[:granularity])
        if len(dur_str) == 0:
            dur_str = '< 1sec>'
        return dur_str

    #------------------------------------
    # step_number
    #-------------------

    def step_number(self, epoch, split_num, num_folds):
        '''
        Combines an epoch with a split number into
        a single integer series as epochs increase,
        and split_num cycles from 0 to num_folds.

        :param epoch: epoch to encode
        :type epoch: int
        :param split_num: split number to encode
        :type split_num: int
        :param num_folds: number of folds for CV splitting;
            must be constant!
        :type num_folds: int
        :return: an integer the combines epoch and split-num
        :rtype: int
        '''
        step_num = epoch * num_folds + split_num
        return step_num

    #------------------------------------
    # cleanup
    #-------------------

    def cleanup(self):
        '''
        Recover resources taken by collaborating
        processes. OK to call multiple times.
        '''
        # self.clear_gpu()
        try:
            self.writer.close()
        except Exception as e:
            self.log.err(f"Could not close tensorboard writer: {repr(e)}")
class SpectrogramChopper:
    '''
    Processes directories of .wav or .mp3 files,
    chopping them into window_len second snippets.
    Each audio snippet is saved, and spectrograms
    are created for each.

    Assumes:

                         self.in_dir
            Species1        Species2   ...     Speciesn
             smpl1_1.mp3     smpl2_1.mp3        smpln_1.mp3
             smpl1_2.mp3     smpl2_2.mp3        smpln_2mp3
                                ...

    Saves the snippets in a new directory.
    Creates a spectrogram for each snippet, and
    saves those in a different, new directory.
    Resulting directories under self.out_dir will be:

                     self.out_dir
          spectrograms               wav-files

    Because many spectrograms are created, speed
    requirements call for the use of parallelism.
    Since each audio file's processing is independent
    from the others, the multiprocessing library
    is used as follows:

        - If command line arg --workers is set to 1,
          no parallelism is used.
        - If multiple cores are available, some percentage
          of them will be deployed to chopping. Each core
          runs a separate copy of this file. The percentage
          is controlled by MAX_PERC_OF_CORES_TO_USE.

    Method chop_all() is used in the single core scenario.
    Method chop_from_file_list() is used when multiprocessing.
    This method is the 'target' in the multiprocessing
    library's sense.
    '''

    # If multiple cores are available,
    # only use some percentage of them to
    # be nice:
    MAX_PERC_OF_CORES_TO_USE = 50

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 in_dir_or_spectro_file,
                 outdir,
                 specific_species=None,
                 overwrite_policy=WhenAlreadyDone.ASK,
                 generate_wav_files=False
                 ):
        '''
        The overwrite_policy is one of the WhenAlreadyDone
        enum members: ASK, OVERWRITE, SKIP. If ASK, request
        user's permission for each encountered destination file.
        SKIP should be used when resuming an interrupted
        chopping session. Any sound file whose destination
        spectrogram exists is not processed again.

        If generate_wav_files is True, a .wav file is created
        for every window of the source soundfile. Usually
        not necessary.

        The window_len is the number of seconds by which a
        sliding window is moved across the source soundfile
        before a spectrogram is created.

        :param in_dir_or_spectro_file: location of soundfile root
        :type in_dir_or_spectro_file: str
        :param outdir: root of spectrograms/wav_files to create
        :type outdir: str
        :param specific_species: process only a specific list of species
        :type specific_species: {None | [str]}
        :param overwrite_policy: what to do when an output file already exists
        :type overwrite_policy: WhenAlreadyDone
        :param generate_wav_files: whether to also save audio snippets
        :type generate_wav_files: bool
        '''

        self.in_dir = in_dir_or_spectro_file
        self.out_dir = outdir
        self.specific_species = specific_species
        self.overwrite_policy = overwrite_policy
        self.generate_wav_files = generate_wav_files

        self.log = LoggingService()
        self.num_chopped = 0

        # Don't show the annoying deprecation
        # librosa.display() warnings about renaming
        # 'basey' to 'base' to match matplotlib:
        warnings.simplefilter("ignore", category=MatplotlibDeprecationWarning)

        # Hide the UserWarning: PySoundFile failed. Trying audioread instead.
        warnings.filterwarnings(action="ignore",
                                message="PySoundFile failed. Trying audioread instead.",
                                category=UserWarning,
                                module='',
                                lineno=0)

        if self.specific_species is None:
            self.species_list = os.listdir(self.in_dir)
        else:
            self.species_list = self.specific_species

        # Create directories for new audio snippets
        # and spectrograms:
        self.wav_dir_path, self.spectrogram_dir_path = self.create_dest_dirs(self.species_list)

        # Allow others outside the instance to
        # find the audio snippet destination:
        SpectrogramChopper.wav_dir_path = self.wav_dir_path
        SpectrogramChopper.spectrogram_dir_path = self.spectrogram_dir_path

    #------------------------------------
    # chop_all
    #-------------------

    def chop_all(self):
        '''
        Workhorse: Assuming self.in_dir is root of all
        species audio samples:

                         self.in_dir
            Species1        Species2   ...     Speciesn
             smpl1_1.mp3     smpl2_1.mp3        smpln_1.mp3
             smpl1_2.mp3     smpl2_2.mp3        smpln_2mp3
                                ...

        Chops each .mp3 (or .wav) file into window_len snippets.
        Saves those snippets in a new directory. Creates a
        spectrogram for each snippet, and saves those in a
        different, new directory. Resulting directories under
        self.out_dir will be:

                     self.out_dir
          spectrograms               wav-files

        If self.specific_species is None, audio files under all
        species are chopped. Else, self.specific_species is
        expected to be a list of species names that correspond
        to the names of species directories above: Species1,
        Species2, etc.

        :return: a 2-tuple: (number of created .wav audio snippet
            files, number of created .png spectrogram snippet files)
        :rtype: (int, int)
        '''
        for species in self.species_list:
            audio_files = os.listdir(os.path.join(self.in_dir, species))
            num_files = len(audio_files)
            for i, sample_name in enumerate(audio_files):
                # Chop one audio file:
                self.log.info(f"Chopping {species} audio {i}/{num_files}")
                self.chop_one_audio_file(self.in_dir,
                                         species,
                                         sample_name,
                                         self.out_dir)
            self.num_chopped += num_files

        num_spectros = utils.find_in_dir_tree(self.spectrogram_dir_path,
                                              pattern='*.png')
        num_audios = utils.find_in_dir_tree(self.wav_dir_path,
                                            pattern='*.wav')
        return (num_audios, num_spectros)

    #------------------------------------
    # chop_from_file_list
    #-------------------

    def chop_from_file_list(self, assignments, return_bool, env=None):
        '''
        Takes a list like:

           [(s1,f1),(s1,f2),(s4,f3)]

        where s_n is a species name, and f_m is the basename
        of an audio file to chop. Example: foobar.mp3.

        Returns True if all went well, else raises exception.

        Wrinkle: this method is called under two very different
        scenarios (S1/S2). S1 is when the process started by the
        user calls this method. That happens when the command
        line arg --workers is set to 1, or on a machine where few
        enough cores are available that only one is used. In that
        case, env is left at None, and all is as normal.

        S2 occurs when the initial process (the one started from
        the command line) starts a new Process. That process
        normally contains a new environment, i.e. some default
        value for all the environment variables. In particular,
        DISPLAY and PYTHONPATH will not be what is needed. The
        result is that all spectrogram creating methods fail,
        because they cannot find a graphics backend.

        In that case kwarg env is set to the environment of the
        initiating process. At the start of this method this
        process' default environ is then set to match that of the
        initiating process.

        :param assignments: list of species/filename pairs
        :type assignments: [(str,str)]
        :param return_bool: shared boolean slot through which
            success/failure is communicated back to the parent
        :type return_bool: multiprocessing.Value
        :param env: if provided, the environment of the parent
            process. If None, the current env is retained
        :type env: {str : Any}
        '''
        # During multiprocessing this method is
        # the target, i.e. the entry point for
        # each child. In that case env will be
        # the environment of the initiating process.
        # We adopt that environment for this new,
        # forked process as well:
        if env is not None:
            os.environ = env

        for species_name, fname in assignments:
            try:
                self.chop_one_audio_file(self.in_dir,
                                         species_name,
                                         fname,
                                         self.out_dir
                                         )
            except Exception as e:
                return_bool.value = False
                raise e

        return_bool.value = True

    #------------------------------------
    # chop_one_audio_file
    #-------------------

    def chop_one_audio_file(self,
                            in_dir,
                            species,
                            spectro_fname,
                            out_dir,
                            window_len=5):
        """
        Generates window_len second sound file snippets
        and associated spectrograms from sound files of
        arbitrary length.

        Performs a time shift on all the wav files in the
        species directories. The shift is 'rolling' such that
        no information is lost.

        :param in_dir: directory of the audio file to chop
        :type in_dir: str
        :param species: the directory names of the species to
            modify the wav files of. If species=None, all
            subdirectories will be processed.
        :type species: {None | [str]}
        :param spectro_fname: basefile name of audio file to chop
        :type spectro_fname: str
        :param out_dir: root directory under which spectrogram
            and audio snippets will be saved (in different subdirs)
        :type out_dir: str
        :param window_len: number of seconds per snippet
        :type window_len: int
        """
        orig, sample_rate = librosa.load(os.path.join(in_dir, species, spectro_fname))
        length = int(librosa.get_duration(orig, sample_rate))
        for start_time in range(length - window_len):
            fpath = Path(spectro_fname)
            window_name = f"{fpath.stem}_sw-start{str(start_time)}"
            window_file_name = str(Path.joinpath(fpath.parent, window_name))
            outfile_spectro = os.path.join(out_dir,
                                           'spectrograms/',
                                           species,
                                           f"{window_file_name}.png")
            outfile_audio = os.path.join(out_dir,
                                         'wav-files',
                                         species,
                                         f"{window_file_name}.{'wav'}")

            spectro_done = os.path.exists(outfile_spectro)
            audio_done = os.path.exists(outfile_audio)

            # BUG FIX: the original condition tested the truthiness of
            # the enum member WhenAlreadyDone.SKIP itself (always True),
            # which made every fully-done window be skipped regardless
            # of the configured policy. Compare the instance's policy:
            if spectro_done and audio_done \
               and self.overwrite_policy == WhenAlreadyDone.SKIP:
                # No brainer no need to even read the audio excerpt:
                continue

            if spectro_done and not audio_done and not self.generate_wav_files:
                continue

            # Need an audio snippet either for
            # a spectrogram or wav file:
            window_audio, sr = librosa.load(os.path.join(in_dir, species, spectro_fname),
                                            offset=start_time,
                                            duration=window_len)

            if not spectro_done or (spectro_done and self.overwrite_policy != WhenAlreadyDone.SKIP):
                SoundProcessor.create_spectrogram(window_audio, sr, outfile_spectro)

            if self.generate_wav_files:
                if audio_done and self.overwrite_policy == WhenAlreadyDone.SKIP:
                    continue
                else:
                    sf.write(outfile_audio, window_audio, sr)

    #------------------------------------
    # create_dest_dirs
    #-------------------

    def create_dest_dirs(self, species_list):
        '''
        Creates all directories that will hold new
        audio snippets and spectrograms for each species.
        For each directory:

            if dir exists:
               o if overwrite_policy is True, wipe the dir
               o else ask user.
                    If response is Yes, wipe the dir
                    else raise FileExistsError

        :param species_list: names of species to process
        :type species_list: [str]
        :return: top level dirs for audio snippets and spectrograms
        :rtype: (str, str)
        :raise FileExistsError: if a dest dir exists and not
            allowed to wipe it.
        '''

        # Root dir of the two dirs that will hold new
        # audio snippet and spectrogram files:
        utils.create_folder(self.out_dir, overwrite_policy=self.overwrite_policy)

        # Below the root:
        spectrogram_dir_path = os.path.join(self.out_dir, 'spectrograms/')
        wav_dir_path = os.path.join(self.out_dir, 'wav-files/')

        if not utils.create_folder(spectrogram_dir_path,
                                   overwrite_policy=self.overwrite_policy):
            raise FileExistsError(f"Target dir {spectrogram_dir_path} exists; aborting")

        # BUG FIX: the original error message named the spectrogram
        # dir here, even though the wav dir is the one that failed:
        if not utils.create_folder(wav_dir_path,
                                   overwrite_policy=self.overwrite_policy):
            raise FileExistsError(f"Target dir {wav_dir_path} exists; aborting")

        # One dir each for the audio and spectrogram
        # snippets of one species:
        for species in species_list:
            species_spectros_dir = os.path.join(spectrogram_dir_path, species)
            if not utils.create_folder(species_spectros_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {species_spectros_dir} exists; aborting")

            species_audio_dir = os.path.join(wav_dir_path, species)
            if not utils.create_folder(species_audio_dir,
                                       overwrite_policy=self.overwrite_policy):
                raise FileExistsError(f"Target dir {species_audio_dir} exists; aborting")

        return (wav_dir_path, spectrogram_dir_path)

    # -------------------- Class Methods ------------

    #------------------------------------
    # compute_worker_assignments
    #-------------------

    @classmethod
    def compute_worker_assignments(cls, in_dir, num_workers=None):
        '''
        Given the root directory of a set of
        directories whose names are species,
        and which contain recordings by species,
        return a multi processing worker assignment.

        Expected:
                         in_dir

          Species1        Species2   ...     Speciesn
           smpl1_1.mp3     smpl2_1.mp3        smpln_1.mp3
           smpl1_2.mp3     smpl2_2.mp3        smpln_2mp3
                            ...

        Collects number of recordings available for
        each species. Creates a list of species name
        buckets such that all workers asked to process
        one of the buckets will have roughly equal
        amounts of work.

        Example return:

            [['Species1', 'Species2'], ['Species3', 'Species4', 'Species5']]

        The caller can then assign the first list to
        one worker, and the second list to another worker.

        The number of buckets, and therefore the number
        of eventual workers may be passed in. If None,
        a percentage of the cores available on the current
        machine will be assumed. If num_workers is provided
        and the number is larger than the number of available
        cores, the number is reduced to the number of cores.

        Also returned is the number of workers on which the
        computation is based. This number is always the same
        as the number of species name lists in the return.
        But for clarity, the number is returned explicitly.

        :param in_dir: root of species recordings
        :type in_dir: str
        :param num_workers: number of buckets into which to partition
        :type num_workers: {int | None}
        :return: list of species name lists, and number of workers.
        :rtype: ([[(str, str)]], int)
        '''

        # Create:
        #     {species : num-recordings}
        #     {species : recordings_dir}
        #     [(species1, fpath1), (species1, fpath2), (species2, fpath3)...]

        sample_size_distrib = OrderedDict({})
        sample_dir_dict = {}
        species_file_tuples = []

        for _dir_name, subdir_list, _file_list in os.walk(in_dir):
            for species_name in subdir_list:
                species_recordings_dir = os.path.join(in_dir, species_name)
                rec_paths = os.listdir(species_recordings_dir)
                sample_size_distrib[species_name] = len(rec_paths)
                sample_dir_dict[species_name] = species_recordings_dir
                species_file_pairs = list(zip([species_name] * len(rec_paths), rec_paths))
                species_file_tuples.extend(species_file_pairs)
            # Only the top level of in_dir is relevant:
            break

        num_cores = mp.cpu_count()
        # Use only a percentage of the cores:
        if num_workers is None:
            num_workers = round(num_cores * SpectrogramChopper.MAX_PERC_OF_CORES_TO_USE / 100)
        elif num_workers > num_cores:
            # Limit pool size to number of cores:
            num_workers = num_cores

        # Create a partitioning into equal sized files,
        # regardless of species association:
        assignments = cls.partition_by_recordings(species_file_tuples,
                                                  num_workers)
        num_workers_used = len(assignments)
        return assignments, num_workers_used

    #------------------------------------
    # partition_by_recordings
    #-------------------

    @classmethod
    def partition_by_recordings(cls, species_file_pairs, num_workers):
        '''
        Given a list of species-name/file-path tuples,
        partition that list into num_workers sublists,
        such that each list contains roughly the same
        number of tuples. The ceiling-sized chunks mean
        the last sublist(s) may be shorter; empty
        sublists are removed.

        :param species_file_pairs: list of species/file pairs to partition
        :type species_file_pairs: [(str, str)]
        :param num_workers: number of sublists to produce
        :type num_workers: int
        :return: partitioning of the species_file_pairs tuples
        :rtype: [[(str, str)]]
        '''

        # Compute near-equal number of files per worker:
        num_recordings = len(species_file_pairs)
        recs_per_worker = int(np.ceil(num_recordings / num_workers))

        # Create list of species-file pair lists:
        #    [[(s1,f1), (s1,f2)], [(s1,f3), (s2,f4)], ...]
        # Each inner list will be handled by one worker:
        assignments = []
        assign_idx = 0
        for _worker_idx in range(num_workers):
            assign_sublist = species_file_pairs[assign_idx:assign_idx + recs_per_worker]
            assignments.append(assign_sublist)
            assign_idx += recs_per_worker

        # BUG FIX: the original additionally appended the last
        # num_recordings % num_workers pairs to the first sublists.
        # Since the ceiling-sized slices above already cover every
        # pair, that redistributed pairs a *second* time, causing
        # duplicate processing whenever the count was not evenly
        # divisible by num_workers. The redistribution is removed.

        # Remove empty assignments:
        assignments = [ass for ass in assignments if len(ass) > 0]
        return assignments

    #------------------------------------
    # run_workers
    #-------------------

    @classmethod
    def run_workers(cls, args, overwrite_policy=WhenAlreadyDone.ASK):
        '''
        Called by main to run the SpectrogramChopper in
        multiple processes at once. Partitions the audio files
        to be processed; runs the chopping while giving
        visual progress on terminal.

        Prints success/failure of each worker. Then returns.

        :param args: all arguments provided to argparse
        :type args: {str : Any}
        :param overwrite_policy: what to do when an output
            file already exists
        :type overwrite_policy: WhenAlreadyDone
        '''

        in_dir = args.input

        # Get a list of lists of species names
        # to process. The list is computed such
        # that each worker has roughly the same
        # number of recordings to chop. We let
        # the method determine the number of workers
        # by using a percentage of the available cores:
        (worker_assignments, num_workers) = SpectrogramChopper.compute_worker_assignments(
            in_dir,
            num_workers=args.workers)

        print(f"Distributing workload across {num_workers} workers.")

        # Assign each list of species to one worker:
        chopping_jobs = []
        for ass_num, assignment in enumerate(worker_assignments):
            chopper = SpectrogramChopper(in_dir,
                                         args.output_dir,
                                         overwrite_policy=overwrite_policy
                                         )
            ret_value_slot = mp.Value("b", False)
            job = ProcessWithoutWarnings(target=chopper.chop_from_file_list,
                                         args=([assignment, ret_value_slot]),
                                         name=f"ass# {ass_num}"
                                         )
            job.ret_val = ret_value_slot
            chopping_jobs.append(job)
            print(f"Starting chops for {job.name}")
            job.start()

        for job in chopping_jobs:
            job_done = False
            while not job_done:
                # Check for job done with one sec timeout:
                job.join(1)
                # Get number of generated snippets:
                num_chopped_snippets = \
                    len(utils.find_in_dir_tree(SpectrogramChopper.spectrogram_dir_path))
                # Keep printing number of done snippets in the same
                # terminal line:
                print(f"Number of audio snippets: {num_chopped_snippets}", end='\r')
                # If the call to join() timed out
                if job.exitcode is None:
                    # Job not done:
                    continue
                # BUG FIX: the original tested the mp.Value wrapper
                # itself (always truthy), so failures printed "OK".
                # Test the shared boolean's actual value:
                res = "OK" if job.ret_val.value else "Error"
                # New line after the progress msgs:
                print("")
                print(f"Chops of {job.name}/{num_workers}: {res}")
                job_done = True
class Inferencer:
    '''
    Runs one or more trained models over a directory tree
    of samples, computing predictions and information
    retrieval measures, and writing results to csv files
    and tensorboard.
    '''

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 model_paths,
                 samples_path,
                 batch_size=1,
                 labels_path=None,
                 gpu_ids=0):
        '''
        Given the path to a trained model, and the path to the
        root of a set of data, compute predictions.

        If labels_path is None, the subdirectory names between
        the samples_path root, and the samples themselves are
        used as the ground truth labels.

        By default: run batches of size 1, because we always
        have drop_last set to True. For small test sets leaving
        out any data at all isn't good. Caller can still set
        batch_size higher to gain speed if the testset is very
        large, so that not inferencing on up to batch_size - 1
        samples is OK.

        :param model_paths: path(s) to trained model file(s)
        :type model_paths: {str | [str]}
        :param samples_path: root of the sample directory tree
        :type samples_path: str
        :param batch_size: samples per inference batch
        :type batch_size: int
        :param labels_path: optional path to ground truth labels
        :type labels_path: {None | str}
        :param gpu_ids: Device number of GPU, in case one is available
        :type gpu_ids: {int | [int]}
        '''
        self.model_paths = model_paths
        self.samples_path = samples_path
        self.labels_path = labels_path
        # Normalize to a list of GPU ids:
        self.gpu_ids = gpu_ids if type(gpu_ids) == list else [gpu_ids]
        if batch_size is not None:
            self.batch_size = batch_size
        else:
            self.batch_size = 1
        self.IMG_EXTENSIONS = FileUtils.IMG_EXTENSIONS
        self.log = LoggingService()
        self.curr_dir = os.path.dirname(__file__)

    #------------------------------------
    # prep_model_inference
    #-------------------

    def prep_model_inference(self, model_path):
        '''
        1. Parses model_path into its components, and creates a dict:
           self.model_props, which contains the network type, grayscale
           or not, whether pretrained, etc.
        2. Creates self.csv_writer to write results measures into
           csv files. The destination file is determined as follows:
              <script_dir>/runs_raw_inferences/inf_csv_results_<datetime>/<model-props-derived-fname>.csv
        3. Creates self.writer(), a tensorboard writer with destination dir:
              <script_dir>/runs_inferences/inf_results_<datetime>
        4. Creates an ImageFolder classed dataset to self.samples_path
        5. Creates a shuffling DataLoader
        6. Initializes self.num_classes and self.class_names
        7. Creates self.model from the passed-in model_path name

        :param model_path: path to model that will be used for
            inference by this instance of Inferencer
        :type model_path: str
        '''

        model_fname = os.path.basename(model_path)

        # Extract model properties
        # from the model filename:
        self.model_props = FileUtils.parse_filename(model_fname)

        csv_results_root = os.path.join(self.curr_dir, 'runs_raw_inferences')
        #self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{uuid.uuid4().hex}")
        ts = FileUtils.file_timestamp()
        self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{ts}")
        os.makedirs(self.csv_dir, exist_ok=True)

        csv_file_nm = FileUtils.construct_filename(self.model_props,
                                                   prefix='inf',
                                                   suffix='.csv',
                                                   incl_date=True)
        csv_path = os.path.join(self.csv_dir, csv_file_nm)
        self.csv_writer = CSVWriterCloseable(csv_path)

        ts = FileUtils.file_timestamp()
        tensorboard_root = os.path.join(self.curr_dir, 'runs_inferences')
        tensorboard_dest = os.path.join(tensorboard_root,
                                        f"inf_results_{ts}")
        #f"inf_results_{ts}{uuid.uuid4().hex}")
        os.makedirs(tensorboard_dest, exist_ok=True)

        self.writer = SummaryWriterPlus(log_dir=tensorboard_dest)

        dataset = SingleRootImageDataset(
            self.samples_path,
            to_grayscale=self.model_props['to_grayscale'])

        # Make reproducible:
        Utils.set_seed(42)
        #********Utils.set_seed(56)
        self.loader = DataLoader(dataset,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 drop_last=True)
        self.class_names = dataset.class_names()
        self.num_classes = len(self.class_names)

        # Get the right type of model.
        # Don't bother getting it pretrained,
        # or freezing it, b/c we will overwrite
        # the weights:
        self.model = NetUtils.get_net(
            self.model_props['net_name'],
            num_classes=self.num_classes,
            pretrained=False,
            freeze=0,
            to_grayscale=self.model_props['to_grayscale'])

        self.log.info(f"Tensorboard info written to {tensorboard_dest}")
        self.log.info(f"Result measurement CSV file(s) written to {csv_path}")

    #------------------------------------
    # __call__
    #-------------------

    def __call__(self, gpu_id_model_path_pair):
        # Entry point for one (gpu, model) pairing; used both
        # directly and as a multiprocessing Pool target.
        gpu_id, self.model_path = gpu_id_model_path_pair
        self.prep_model_inference(self.model_path)
        self.log.info(
            f"Begining inference with model {FileUtils.ellipsed_file_path(self.model_path)} on gpu_id {gpu_id}"
        )
        #****************
        # NOTE(review): debug scaffolding below runs inference 3 times
        # and prints the per-run prediction dicts, instead of returning
        # a single run's result (original single-run call commented out):
        #return self.run_inference(gpu_to_use=gpu_id)
        dicts_from_runs = []
        for i in range(3):
            self.curr_dict = {}
            dicts_from_runs.append(self.curr_dict)
            self.run_inference(gpu_to_use=gpu_id)
        print(dicts_from_runs)
        #****************

    #------------------------------------
    # go
    #-------------------

    def go(self):
        # Pair models to GPUs; example for
        # self.gpu_ids == [0,4], and three models:
        #   [(gpu0, model0) (gpu4, model1), (gpu0, model3)]
        repeats = int(np.ceil(len(self.model_paths) / len(self.gpu_ids)))
        gpu_model_pairings = list(zip(self.gpu_ids * repeats, self.model_paths))

        #************* No parallelism for debugging
        # NOTE(review): early return makes the Pool code below
        # unreachable; only the first pairing is ever run:
        self(gpu_model_pairings[0])
        return
        #************* END No parallelism for debugging

        with Pool(len(self.gpu_ids)) as inf_pool:
            # Run as many inferences in parallel as
            # there are models to try. The first arg,
            # (self): means to invoke the __call__() method
            # on self.
            result_it = inf_pool.imap(self,
                                      gpu_model_pairings,
                                      chunksize=len(self.gpu_ids))
            # NOTE(review): Pool.imap yields results directly, not
            # AsyncResult objects; res.get() here looks wrong — verify
            # before re-enabling this path:
            results = [res.get() for res in result_it]
            print(f"******Results: {results}")

    #------------------------------------
    # run_inferencer
    #-------------------

    def run_inference(self, gpu_to_use=0):
        '''
        Runs model over dataloader. Along
        the way: creates ResultTally for each
        batch, and maintains dict instance variable
        self.raw_results for later conversion of
        logits to class IDs under different threshold
        assumptions.

             self.raw_results:
                    {'all_outputs' : <arr>,
                     'all_labels'  : <arr>
                     }

        Returns a ResultCollection with the
        ResultTally instances of each batch.

        :param gpu_to_use: which GPU to deploy to (if it is available)
        :type gpu_to_use: int
        :return: collection of tallies, one for each batch,
            or None if something went wrong.
        :rtype: {None | ResultCollection}
        '''
        # Just in case the loop never runs:
        batch_num = -1
        overall_start_time = datetime.datetime.now()

        try:
            try:
                if torch.cuda.is_available():
                    self.model.load_state_dict(torch.load(self.model_path))
                    FileUtils.to_device(self.model, 'gpu', gpu_to_use)
                else:
                    self.model.load_state_dict(
                        torch.load(self.model_path,
                                   map_location=torch.device('cpu')))
            except RuntimeError as e:
                emsg = repr(e)
                if emsg.find("size mismatch for conv1") > -1:
                    emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
                raise RuntimeError(emsg) from e

            loss_fn = nn.CrossEntropyLoss()

            result_coll = ResultCollection()

            # Save all per-class logits for ability
            # later to use different thresholds for
            # conversion to class IDs:
            all_outputs = []
            all_labels = []

            self.model.eval()
            num_test_samples = len(self.loader.dataset)
            self.log.info(
                f"Begin inference ({num_test_samples} test samples)...")
            samples_processed = 0

            loop_start_time = overall_start_time
            with torch.no_grad():
                for batch_num, (batch, targets) in enumerate(self.loader):
                    if torch.cuda.is_available():
                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')
                    else:
                        images = batch
                        labels = targets

                    outputs = self.model(images)
                    loss = loss_fn(outputs, labels)

                    # Bring everything back to CPU for tallying:
                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    #**********
                    # NOTE(review): debug code; relies on a monkey-patched
                    # attribute sample_id_seq on torch's dataloader module,
                    # and assumes batch_size == 1 (indexes outputs[0],
                    # labels[0]) — confirm before keeping:
                    max_logit = outputs[0].max().item()
                    max_idx = (outputs.squeeze() == max_logit).nonzero(
                        as_tuple=False).item()
                    smpl_id = torch.utils.data.dataloader.sample_id_seq[-1]
                    lbl = labels[0].item()
                    pred_cl = max_idx
                    self.curr_dict[smpl_id] = (smpl_id, lbl, pred_cl)
                    #**********

                    # Specify the batch_num in place
                    # of an epoch, which is not applicable
                    # during testing:
                    tally = ResultTally(batch_num,
                                        LearningPhase.TESTING,
                                        outputs,
                                        labels,
                                        loss,
                                        self.num_classes,
                                        self.batch_size)
                    result_coll.add(tally, step=None)

                    all_outputs.append(outputs)
                    all_labels.append(labels)

                    samples_processed += len(labels)

                    del images
                    del outputs
                    del labels
                    del loss
                    torch.cuda.empty_cache()

                    time_now = datetime.datetime.now()
                    # Sign of life every 6 seconds:
                    if (time_now - loop_start_time).seconds >= 5:
                        self.log.info(
                            f"GPU{gpu_to_use} processed {samples_processed}/{num_test_samples} samples"
                        )
                        loop_start_time = time_now
        finally:
            #*********
            # NOTE(review): debug code; same monkey-patched
            # dataloader attribute as above:
            print(f"Sample seq: {torch.utils.data.dataloader.sample_id_seq}")
            torch.utils.data.dataloader.sample_id_seq = []
            #*********
            time_now = datetime.datetime.now()
            test_time_duration = time_now - overall_start_time
            # A human readable duration st down to minutes:
            duration_str = FileUtils.time_delta_str(test_time_duration,
                                                    granularity=4)
            self.log.info(
                f"Done with inference: {samples_processed} test samples; {duration_str}"
            )
            # Total number of batches we ran:
            num_batches = 1 + batch_num  # b/c of zero-base

            # If loader delivered nothing, the loop
            # never ran; warn, and get out:
            if num_batches == 0:
                self.log.warn(
                    f"Dataloader delivered no data from {self.samples_path}")
                self.close()
                return None

            # Var all_outputs is now:
            #  [tensor([pred_cl0, pred_cl1, pred_cl<num_classes - 1>],   # For sample0
            #   tensor([pred_cl0, pred_cl1, pred_cl<num_classes - 1>],   # For sample1
            #   ...
            #  ]
            # Make into one tensor: (num_batches, batch_size, num_classes),
            # unless an exception was raised at some point,
            # throwing us into this finally clause:
            if len(all_outputs) == 0:
                self.log.info(
                    f"No outputs were produced; thus no results to report")
                return None

            self.all_outputs_tn = torch.stack(all_outputs)
            # Be afraid...be very afraid:
            assert(self.all_outputs_tn.shape == \
                   torch.Size([num_batches,
                               self.batch_size,
                               self.num_classes])
                   )

            # Var all_labels is now num-batches tensors,
            # each containing batch_size labels:
            assert (len(all_labels) == num_batches)

            # list of single-number tensors. Make
            # into one tensor:
            self.all_labels_tn = torch.stack(all_labels)
            assert(self.all_labels_tn.shape == \
                   torch.Size([num_batches, self.batch_size])
                   )
            # And equivalently:
            assert(self.all_labels_tn.shape == \
                   (self.all_outputs_tn.shape[0],
                    self.all_outputs_tn.shape[1]
                    )
                   )

            self.report_results(result_coll)
            self.close()

            return result_coll

    #------------------------------------
    # report_results
    #-------------------

    def report_results(self, tally_coll):
        # Fan out to textual, confusion-matrix, and
        # precision/recall chart reporting:
        self._report_textual_results(tally_coll, self.csv_dir)
        self._report_conf_matrix(tally_coll, show_in_tensorboard=True)
        self._report_charted_results()

    #------------------------------------
    # _report_conf_matrix
    #-------------------

    def _report_conf_matrix(self, tally_coll, show=True, show_in_tensorboard=None):
        '''
        Computes the confusion matrix CM from tally collection.
        Creates an image from CM, and displays it via matplotlib,
        if show arg is True. If show_in_tensorboard is a Tensorboard
        SummaryWriter instance, the figure is posted to tensorboard,
        no matter the value of the show arg.

        Returns the Figure object.

        :param tally_coll: all ResultTally instances to be included
            in the confusion matrix
        :type tally_coll: result_tallying.ResultCollection
        :param show: whether or not to call show() on the
            confusion matrix figure, or only return the Figure instance
        :type show: bool
        :param show_in_tensorboard: whether or not to post the image
            to tensorboard
        :type show_in_tensorboard: bool
        :return: Figure instance containing confusion matrix heatmap
            with color legend.
        :rtype: matplotlib.pyplot.Figure
        '''
        all_preds = []
        all_labels = []
        for tally in tally_coll.tallies(phase=LearningPhase.TESTING):
            all_preds.extend(tally.preds)
            all_labels.extend(tally.labels)

        conf_matrix = Charter.compute_confusion_matrix(all_labels,
                                                       all_preds,
                                                       self.class_names,
                                                       normalize=True)

        # Normalization in compute_confusion_matrix() is
        # to 0-1. Turn those values into percentages:
        conf_matrix_perc = (100 * conf_matrix).astype(int)

        # Decide whether or not to write
        # confusion cell values into the cells.
        # The decision depends on how many species
        # are represented in the conf matrix; too many,
        # and having numbers in all cells is too cluttered:
        if len(self.class_names
               ) > CELL_LABELING.CONF_MATRIX_CELL_LABEL_LIMIT.value:
            write_in_fields = CELL_LABELING.DIAGONAL
        else:
            write_in_fields = CELL_LABELING.ALWAYS

        fig = Charter.fig_from_conf_matrix(
            conf_matrix_perc,
            supertitle='Confusion Matrix\n',
            subtitle='Normalized to percentages',
            write_in_fields=write_in_fields)
        if show_in_tensorboard:
            self.writer.add_figure('Inference Confusion Matrix',
                                   fig,
                                   global_step=0)

        if show:
            # Something above makes fig lose its
            # canvas manager. Add that back in:
            Utils.add_pyplot_manager_to_fig(fig)
            fig.show()
        return fig

    #------------------------------------
    # _report_charted_results
    #-------------------

    def _report_charted_results(self, thresholds=None):
        '''
        Computes and (pyplot-)shows a set of precision-recall
        curves in one plot. If precision and/or recall are
        undefined (b/c of division by zero) for all curves, then
        returns False, else True. If no curves are defined,
        logs a warning.

        :param thresholds: list of cutoff thresholds
            for turning logits into class ID predictions.
            If None, the default at Charters.compute_multiclass_pr_curves()
            is used.
        :type thresholds: [float]
        :return: True if curves were computed and shown. Else False
        :rtype: bool
        '''

        # Obtain a dict of CurveSpecification instances,
        # one for each class, plus the mean Average Precision
        # across all curves. The dict will be keyed
        # by class ID:
        (all_curves_info, mAP) = \
            Charter.compute_multiclass_pr_curves(
                self.all_labels_tn,
                self.all_outputs_tn,
                thresholds
            )

        # Separate out the curves without
        # ill defined prec, rec, or f1:
        well_defined_curves = list(filter(
            lambda crv_obj: not(crv_obj['undef_prec'] or\
                                crv_obj['undef_rec'] or\
                                crv_obj['undef_f1']
                                ),
            all_curves_info.values()
            )
            )

        if len(well_defined_curves) == 0:
            self.log.warn(
                f"For all thresholds, one or more of precision, recall or f1 are undefined. No p/r curves to show"
            )
            return False

        # Too many curves are clutter. Only
        # show the best and worst by optimal f1:
        f1_sorted = sorted(well_defined_curves,
                           key=lambda obj: obj['best_op_pt']['f1'])
        curves_to_show = {
            crv_obj['class_id']: crv_obj
            for crv_obj in (f1_sorted[0], f1_sorted[-1])
        }
        #********** Mixup with objs blurring together

        (_num_classes, fig) = \
            ClassificationPlotter.chart_pr_curves(curves_to_show)

        fig.show()
        return True

    #------------------------------------
    # _report_textual_results
    #-------------------

    def _report_textual_results(self, tally_coll, res_dir):
        '''
        Given a sequence of tallies with results
        from a series of batches, create long
        outputs, and inputs lists from all tallies.

        Computes information retrieval type values:
             precision (macro/micro/weighted/by-class)
             recall    (macro/micro/weighted/by-class)
             f1        (macro/micro/weighted/by-class)
             accuracy
             balanced_accuracy

        Combines these results into a Pandas series,
        and writes them to a csv file. That file is
        constructed from the passed-in res_dir, appended
        with 'ir_results.csv'.

        Finally, constructs Github flavored tables from the
        above results, and posts them to the 'text' tab of
        tensorboard.

        Returns the results measures Series.

        :param tally_coll: collect of tallies from batches
        :type tally_coll: ResultCollection
        :param res_dir: directory where all .csv and other
            result files are to be written
        :type res_dir: str
        :return: results of information retrieval-like measures
        :rtype: pandas.Series
        '''

        all_preds = []
        all_labels = []

        for tally in tally_coll.tallies(phase=LearningPhase.TESTING):
            all_preds.extend(tally.preds)
            all_labels.extend(tally.labels)

        res = OrderedDict({})
        # zero_division=0 avoids warnings (and NaNs) for
        # classes that never occur in the predictions:
        res['prec_macro'] = precision_score(all_labels,
                                            all_preds,
                                            average='macro',
                                            zero_division=0)
        res['prec_micro'] = precision_score(all_labels,
                                            all_preds,
                                            average='micro',
                                            zero_division=0)
        res['prec_weighted'] = precision_score(all_labels,
                                               all_preds,
                                               average='weighted',
                                               zero_division=0)
        res['prec_by_class'] = precision_score(all_labels,
                                               all_preds,
                                               average=None,
                                               zero_division=0)

        res['recall_macro'] = recall_score(all_labels,
                                           all_preds,
                                           average='macro',
                                           zero_division=0)
        res['recall_micro'] = recall_score(all_labels,
                                           all_preds,
                                           average='micro',
                                           zero_division=0)
        res['recall_weighted'] = recall_score(all_labels,
                                              all_preds,
                                              average='weighted',
                                              zero_division=0)
        res['recall_by_class'] = recall_score(all_labels,
                                              all_preds,
                                              average=None,
                                              zero_division=0)

        res['f1_macro'] = f1_score(all_labels,
                                   all_preds,
                                   average='macro',
                                   zero_division=0)
        res['f1_micro'] = f1_score(all_labels,
                                   all_preds,
                                   average='micro',
                                   zero_division=0)
        res['f1_weighted'] = f1_score(all_labels,
                                      all_preds,
                                      average='weighted',
                                      zero_division=0)
        res['f1_by_class'] = f1_score(all_labels,
                                      all_preds,
                                      average=None,
                                      zero_division=0)

        res['accuracy'] = accuracy_score(all_labels, all_preds)
        res['balanced_accuracy'] = balanced_accuracy_score(
            all_labels, all_preds)

        res_series = pd.Series(list(res.values()), index=list(res.keys()))

        # Write information retrieval type results
        # to a one-line .csv file, using pandas Series
        # as convenient intermediary:
        res_csv_path = os.path.join(res_dir, 'ir_results.csv')
        res_series.to_csv(res_csv_path)

        res_rnd = {}
        for meas_nm, meas_val in res.items():
            # Measure results are either floats (precision, recall, etc.),
            # or np arrays (e.g. precision-per-class). For both
            # cases, round each measure to one digit:
            res_rnd[meas_nm] = round(meas_val,1) if type(meas_val) == float \
                                                 else meas_val.round(1)

        ir_measures_skel = {
            'col_header': ['precision', 'recall', 'f1'],
            'row_labels': ['macro', 'micro', 'weighted'],
            'rows': [[
                res_rnd['prec_macro'], res_rnd['recall_macro'],
                res_rnd['f1_macro']
            ],
                     [
                         res_rnd['prec_micro'], res_rnd['recall_micro'],
                         res_rnd['f1_micro']
                     ],
                     [
                         res_rnd['prec_weighted'], res_rnd['recall_weighted'],
                         res_rnd['f1_weighted']
                     ]]
        }

        ir_per_class_rows = [[
            prec_class, recall_class, f1_class
        ] for prec_class, recall_class, f1_class in zip(
            res_rnd['prec_by_class'], res_rnd['recall_by_class'],
            res_rnd['f1_by_class'])]
        ir_by_class_skel = {
            'col_header': ['precision', 'recall', 'f1'],
            'row_labels': self.class_names,
            'rows': ir_per_class_rows
        }

        accuracy_skel = {
            'col_header': ['accuracy', 'balanced_accuracy'],
            'row_labels': ['Overall'],
            'rows': [[res_rnd['accuracy'], res_rnd['balanced_accuracy']]]
        }

        ir_measures_tbl = GithubTableMaker.make_table(ir_measures_skel,
                                                      sep_lines=False)
        ir_by_class_tbl = GithubTableMaker.make_table(ir_by_class_skel,
                                                      sep_lines=False)
        accuracy_tbl = GithubTableMaker.make_table(accuracy_skel,
                                                   sep_lines=False)

        # Write the markup tables to Tensorboard:
        self.writer.add_text('Information retrieval measures',
                             ir_measures_tbl,
                             global_step=0)
        self.writer.add_text('Per class measures',
                             ir_by_class_tbl,
                             global_step=0)
        self.writer.add_text('Accuracy', accuracy_tbl, global_step=0)

        return res_series

    #------------------------------------
    # close
    #-------------------

    def close(self):
        # Release the tensorboard writer; best-effort,
        # never raises:
        try:
            self.writer.close()
        except Exception as e:
            self.log.err(f"Could not close tensorboard writer: {repr(e)}")
class TrainScriptRunner(object):
    '''
    Launches one training-script subprocess per hyperparameter
    configuration, assigning each process a GPU (or the CPU
    when none is available) according to a world-map file.
    '''

    #------------------------------------
    # Constructor
    #-------------------

    def __init__(self,
                 starting_config_src,
                 hparms_spec,
                 training_script=None,
                 logfile=None,
                 quiet=False,
                 dryrun=False,
                 unittesting=False):
        '''
        Specifications expected like this *Ordered* dict
        (i.e. sequence of keys and values always the same
        for keys()/values()/items() methods):

            {<hparm1> : [val1_1, val1_2, ...],
             <hparm2> : [val2_1, val2_2, ...]
             }

        :param starting_config_src: a configuration whose
            neural net related parameters will be modified
            below for each run.
        :type starting_config_src: {str | NeuralNetConfig}
        :param hparms_spec: ordered mapping of hyperparameter
            name to the list of values to try for it
        :type hparms_spec: {str : [Any]}
        :param training_script: path to the training script
            of which to run multiple copies. If None, will
            look in config for Path:train_script.
        :type training_script: {None | str}
        :param logfile: where to log runtime information.
            If None, log to console
        :type logfile: {None | str}
        :param quiet: whether or not to report progress
        :type quiet: bool
        :param dryrun: if True, only print the configurations
            that would be run, then return
        :type dryrun: bool
        :param unittesting: set to True if unittesting so
            that __init__() will only do a minimum, and allows
            unittests to call other methods individually
        :type unittesting: bool
        '''

        if logfile is not None:
            self.log = LoggingService(logfile=logfile)
        else:
            self.log = LoggingService()

        self.quiet = quiet

        self.curr_dir = os.path.dirname(__file__)
        self.hostname = socket.getfqdn()
        # No GPUs identified so far:
        self.WORLD_SIZE = 0

        starting_config = NeuralNetConfig(starting_config_src)
        if unittesting:
            # Leave calling of the methods below
            # to the unittests
            return

        self.training_script = training_script
        if training_script is None:
            # Try to find it in config:
            try:
                self.training_script = starting_config.getpath(
                    'Paths', 'train_script', relative_to=self.curr_dir)
            except (KeyError, ValueError):
                # NOTE(review): getpath() raises ValueError for
                # missing entries elsewhere in this file; the
                # original only caught KeyError and would have
                # let that escape. Catch both.
                raise ValueError(
                    "Did not provide training script path on cmd line or in config"
                )

        self.gpu_landscape = self.obtain_world_map(starting_config)

        # Get list of dicts of hparm-name/hparm_value pairs;
        # one for each of the runs:
        the_run_dicts = self.get_runs_hparm_specs(hparms_spec)

        # Turn the run dicts into configurations
        # that modify the starting config:
        the_run_configs = self.gen_configurations(starting_config,
                                                  the_run_dicts)

        if dryrun:
            print("Dryrun:")
            print(
                f"Would run {len(the_run_dicts)} processes with these configs:"
            )
            for configs in the_run_dicts:
                print(configs)
            return

        # Provide support for cnt-c terminating the training
        # script processes nicely. Ctrl-C delivers SIGINT;
        # SIGTERM covers an external 'kill'. (BUG FIX: the
        # original registered only SIGTERM, so an actual
        # Ctrl-C was never handled.)
        self.cnt_c_received = False
        signal.signal(signal.SIGTERM, self.handle_cnt_c)
        signal.signal(signal.SIGINT, self.handle_cnt_c)

        # Start one training script for each configuration:
        self.run_configurations(the_run_configs)

    #------------------------------------
    # get_runs_hparm_specs
    #-------------------

    def get_runs_hparm_specs(self, hparms_spec):
        '''
        Create a list of dicts. Each dict holds the value
        for each of the hparms for one run.

        :param hparms_spec: client's dict of {param_name : [val1, val2, ...]}
        :type hparms_spec: {str : [Any]}
        :return: list of dicts, one per unique combination
            of parameter settings
        :rtype: [{str : Any}]
        '''

        # Running example:
        #   {'lr'         : [0.001],
        #    'optimizer'  : ['Adam','RMSprop','SGD'],
        #    'batch_size' : [32, 64, 128],
        #    'kernel_size': [3, 7]
        #    }

        # Parameters to vary:
        parm_names = list(hparms_spec.keys())

        # Iterate through the cartesian product of the
        # value lists:
        #     (0.001, 'Adam', 32, 3)
        #     (0.001, 'Adam', 32, 7)
        #     (0.001, 'Adam', 64, 3)
        #     ...
        # to get a list of dicts, each with a unique
        # combination of parameter settings:
        #
        #   [{'lr': 0.001, 'optimizer': 'Adam', 'batch_size': 32, 'kernel_size': 3},
        #    {'lr': 0.001, 'optimizer': 'Adam', 'batch_size': 32, 'kernel_size': 7},
        #    ...
        #    ]
        hparms_permutations = []
        for ordered_vals_tuple in product(*hparms_spec.values()):
            # Separate dict for each combo:
            conf_dict = dict(zip(parm_names, ordered_vals_tuple))
            hparms_permutations.append(conf_dict)

        return hparms_permutations

    #------------------------------------
    # gen_configurations
    #-------------------

    def gen_configurations(self, config, config_dicts):
        '''
        Takes a list of dicts, and returns a list of
        NeuralNetConfig instances. Each dict contains one
        hyperparameter settings combination that is to be
        tested. Such as:

            [{'lr': 0.001, 'optimizer': 'Adam', 'batch_size': 32, 'kernel_size': 3},
             {'lr': 0.001, 'optimizer': 'Adam', 'batch_size': 32, 'kernel_size': 7},
             ...
             ]

        Each returned configuration is a copy of the config,
        modified for the respective hyperparameter settings.
        All other parts of the config are kept.

        :param config: a configuration with all settings; only
            the hyperparameter settings will be modified
        :type config: NeuralNetConfig
        :param config_dicts: one dict of hyperparm-name : value
            for each process to run independently
        :type config_dicts: [{str : Any}]
        :return: list of configurations for the classifier
            script to run
        :rtype: [NeuralNetConfig]
        '''

        configs = []
        for conf_dict in config_dicts:
            conf_copy = config.copy()
            for param_name, val in conf_dict.items():
                conf_copy.add_neural_net_parm(param_name, val)
            configs.append(conf_copy)
        return configs

    #------------------------------------
    # obtain_world_map
    #-------------------

    def obtain_world_map(self, initial_config):
        '''
        Read the world map whose path is found in
        initial_config (section 'Paths', entry 'world_map'),
        verify that this machine appears in it, and build
        the GPU landscape from it.

        :param initial_config: configuration holding the
            'world_map' path entry
        :type initial_config: NeuralNetConfig
        :return: information about how many GPUs are on each node
        :rtype: OrderedDict
        :raises RuntimeError: if the config has no world_map entry
        :raises ConfigError: if this machine is not in the world map
        '''
        try:
            self.world_map_path = initial_config.getpath(
                'Paths', 'world_map', relative_to=self.curr_dir)
        except (KeyError, ValueError):
            # NOTE(review): see __init__: getpath() may raise
            # ValueError rather than KeyError for missing entries.
            raise RuntimeError(
                f"Could not find entry for 'world_map' in initial config")

        self.world_map = self.read_world_map(self.world_map_path)

        # Ensure that this machine has an
        # entry in the world_map:
        try:
            # Get this machine's info (sub)dict:
            _my_world_info = self.world_map[self.hostname]
        except KeyError:
            raise ConfigError(
                f"World map file does not contain entry for this machine ({self.hostname})"
            )

        self.compute_landscape = {}
        gpu_landscape = self.build_compute_landscape(self.world_map)
        return gpu_landscape

    #------------------------------------
    # build_compute_landscape
    #-------------------

    def build_compute_landscape(self, world_map):
        '''
        Using the world_map.json config file, build a dict
        self.gpu_landscape like this:

            {'machine_name1' : {'num_gpus' : <int>,
                                'gpu_device_ids': [<int>,<int>,...]},
             'machine_name2' : {'num_gpus' : <int>,
                                'gpu_device_ids': [<int>,<int>,...]}
             }

        Also sets
           o self.WORLD_SIZE, number of GPUs used across all machines
             (a GPU-less machine counts as one CPU process)
           o self.my_gpus, the number of GPUs on this machine

        An OrderedDict is used so that all copies of this
        launch script (one per machine) arrive at the same
        ordering of GPUs.

        :param world_map: parsed world map: machine name to spec dict
        :type world_map: {str : {str : Any}}
        :return: information about how many GPUs are on each node
        :rtype: OrderedDict
        :raises ConfigError: if this machine is missing from the
            map, or its entry lacks a 'gpus' value
        '''

        if self.hostname not in world_map.keys():
            raise ConfigError(
                f"World map does not contain an entry for this machine {self.hostname}"
            )

        gpu_landscape = OrderedDict({})

        machine_name = self.hostname
        machine_info = world_map[self.hostname]
        try:
            machine_gpus = machine_info['gpus']
        except KeyError:
            # BUG FIX: the original printed a warning here and
            # then crashed with NameError because machine_gpus
            # was left undefined. Raise a proper error instead:
            raise ConfigError(
                "World map must include a 'gpus' entry; the value may be 0")

        gpu_landscape[machine_name] = {}
        gpu_landscape[machine_name]['num_gpus'] = machine_gpus

        # List of GPU numbers to use is optional
        # in world_maps:
        machine_gpus_to_use = machine_info.get('devices', None)
        if machine_gpus_to_use is None:
            # Use all GPUs on this machine:
            machine_gpus_to_use = list(range(machine_gpus))

        gpu_landscape[machine_name]['gpu_device_ids'] = machine_gpus_to_use

        # A machine without GPUs still contributes one
        # process, which will run on its CPU:
        self.WORLD_SIZE += machine_gpus if machine_gpus > 0 else 1

        self.my_gpus = gpu_landscape[self.hostname]['num_gpus']
        self.gpu_landscape = gpu_landscape
        return gpu_landscape

    #------------------------------------
    # read_world_map
    #-------------------

    def read_world_map(self, path):
        '''
        Read the JSON5 world map file, and return a
        corresponding dict. JSON5 allows something like:

        /*
        This is a block comment.
        Notice the lacking quote chars around the keys below.
        They are optional in JSON5
        */
        {quintus.stanford.edu : {
            "master" : Yes
            "gpus" : 2
         },
         quatro.stanford.edu  : {
             "gpus" : 2,
             "devices" : [1,2]
         }
        }

        BUT: JSON5 gets angry at dots in the keys. So we
        first read the file, temporarily replace the dots
        with an acceptable marker, parse, and then convert
        back.

        :param path: path to world map file
        :type path: string
        :return: world map: machine name to spec dict
        :rtype: {str : {str : Any}}
        :raises IOError: if the file cannot be read
        :raises JSONError: if the content is not valid JSON5
        '''

        dot_substitute = '___'

        try:
            # Read all the world map file lines:
            with open(path, 'r') as world_map_fd:
                tmp_world_map = world_map_fd.readlines()
        except IOError as e:
            raise IOError(f"World map file at {path} not found") from e

        # Replace occurrences of '.' with dot_substitute,
        # and make one string from all the lines:
        json_str = '\n'.join(
            line.replace('.', dot_substitute) for line in tmp_world_map)

        try:
            # Hopefully, JSON5 will eat it now:
            world_map_almost = json5.loads(json_str)
        except JSONError as e:
            raise JSONError(
                f"World map file at {path} contains bad JSON") from e

        # Need to fix all the dot substitutions.
        # At this point the data structure is
        #     { <machine_name> : {spec_attr1 : val1,
        #                         spec_attr2 : val2,
        #                        }
        #     }

        # Fix the machine names first:
        mach_names_fixed = [
            machine_name.replace(dot_substitute, '.')
            for machine_name in world_map_almost.keys()
        ]

        # Now dig into each of the nested machine spec
        # dicts, and fix attrs and values there:
        machine_specs_fixed = []
        for spec in world_map_almost.values():
            # Spec is a dict nested inside the outer one:
            spec_fixed = {
                key.replace(dot_substitute, '.'):
                val.replace(dot_substitute, '.') if isinstance(val, str) else val
                for key, val in spec.items()
            }
            machine_specs_fixed.append(spec_fixed)

        # Put it all together:
        world_map = {
            machine_name: spec_dict
            for machine_name, spec_dict in zip(mach_names_fixed,
                                               machine_specs_fixed)
        }
        return world_map

    #------------------------------------
    # run_configurations
    #-------------------

    def run_configurations(self, run_configs):
        '''
        Takes a list of run configurations that specify the
        details of a training run (lr, optimizer to use, etc.)
        Spawns independent training script processes, one with
        each of the configurations.

        If fewer CPUs/GPUs are available than the number of
        configs in run_configs, waits for processes to finish,
        then launches more.

        Configs may take one of three forms:

            o File path to a config file
            o JSON string with all the config info
            o A NeuralNetConfig instance

        Uses world_map.json to know how many, and which GPUs
        this machine is to use.

        Each copy of the training script is told:

         o LOCAL_RANK  # Which of this machine's GPU to use (0-origin)
         o WORLD_SIZE  # How many GPUs are used on all machines together

        :param run_configs: list of configurations. Each config
            may either be a JSON string, the file name of
            a config file, or a NeuralNetConfig instance
        :type run_configs: [str | NeuralNetConfig]
        :return: None; failures are reported to stdout and
            can be inspected via self.gpu_manager.failures()
        '''

        gpu_ids_to_use = self.gpu_landscape[self.hostname]['gpu_device_ids']
        cpu_only = len(gpu_ids_to_use) == 0

        self.gpu_manager = GPUManager(gpu_ids_to_use)

        for config in run_configs:
            # Get next available GPU ID, waiting
            # for one to free up, if necessary:
            local_rank = self.gpu_manager.obtain_gpu()

            # Create a command that is fit for passing to
            # Popen; it will start one training script
            # process:
            cmd = self.training_script_start_cmd(local_rank, config)

            # Copy stdin, and give the copy to the subprocess.
            # This enables the subprocess to ask user whether
            # to save training state in case of a cnt-C.
            # NOTE(review): the dup'ed fd is never closed in
            # this (parent) process:
            newstdin = os.fdopen(os.dup(sys.stdin.fileno()))

            # Spawn one training script. Use psutil's
            # Popen instead of subprocess.Popen to get
            # the wait_procs() method on the resulting
            # process instances:
            process = psutil.Popen(
                cmd,
                stdin=newstdin,
                stdout=None,  # Script inherits this launch
                stderr=None   # ... script's stdout/stderr
            )

            if cpu_only:
                process.wait()
                # CPU op is for debugging only;
                # bail right away if something
                # went wrong:
                if process.returncode != 0:
                    print("CPU job ran with errors; see log")
                    return
                continue

            # Associate process instance with
            # the configuration it was to run:
            self.gpu_manager.process_register(
                RunInfo(local_rank, process, config, cmd))

        # Launched all configurations; wait for
        # the last of them to be done:

        if cpu_only:
            print("CPU job(s) ran OK")
            return

        # Ask for GPUs until we accounted for all that we
        # were allowed to use; that will be indication that
        # all processes finished.
        # BUG FIX: original iterated over the int
        # 'len(gpu_ids_to_use)' itself, which raises TypeError:
        for _i in range(len(gpu_ids_to_use)):
            self.gpu_manager.obtain_gpu()

        if not self.quiet:
            print(f"Node {self.hostname} {os.path.basename(sys.argv[0])}: " \
                  f"Processed {len(run_configs)} configurations")

        failed_processes = self.gpu_manager.failures()
        if len(failed_processes) > 0:
            print(
                f"Failures: {len(failed_processes)} (Check log for error entries):"
            )
            for failed_proc in failed_processes:
                failed_config = self.gpu_manager.process_info(failed_proc)
                train_script = self.training_script
                msg = (f"Training script {train_script}: {str(failed_config)}")
                print(msg)

    #------------------------------------
    # training_script_start_cmd
    #-------------------

    def training_script_start_cmd(self, local_rank, config):
        '''
        From provided information, creates a legal command
        (argv list) for starting the training script.

        :param local_rank: GPU identifier (between 0 and num
            of GPUs in this machine)
        :type local_rank: int
        :param config: additional information in a config instance,
            or a path to a configuration file
        :type config: {NeuralNetConfig | str}
        :return: command components suitable for Popen
        :rtype: [str]
        '''

        # Build the shell command line,
        # starting with 'python -u':
        cmd = [sys.executable, "-u", f"{self.training_script}"]

        # Add the 'secret' args that tell the training
        # script all the communication parameters:
        cmd.extend([
            f"--LOCAL_RANK={local_rank}",
            f"--WORLD_SIZE={self.WORLD_SIZE}",
        ])

        # Finally, the obligatory non-option arg
        # to the training script: the configuration.
        # Could be a file, a json string, or a
        # NeuralNetConfig instance:
        if isinstance(config, NeuralNetConfig):
            # Turn into a JSON str for communicating
            # to the script:
            config_arg = config.to_json()
            self.log.info(f"\nLAUNCHING TRAINING: " +\
                          f"{NeuralNetConfig.json_human_readable(config_arg)}")
        else:
            config_arg = config
            self.log.info(f"\nLAUNCHING TRAINING from file: {config_arg}")

        cmd.append(config_arg)
        return cmd

    # ------------------- Utils --------------

    #------------------------------------
    # handle_cnt_c
    #-------------------

    def handle_cnt_c(self, signum=None, frame=None):
        '''
        Signal handler: sends SIGTERM to every still-running
        registered training script process. On a second
        invocation (e.g. second Ctrl-C), exits hard.

        BUG FIX: the OS invokes signal handlers as
        handler(signum, frame); the original zero-argument
        signature raised TypeError whenever the signal actually
        arrived. Both parameters default to None so any direct
        calls keep working.

        :param signum: signal number delivered by the OS (unused)
        :type signum: {None | int}
        :param frame: stack frame at delivery time (unused)
        :type frame: {None | frame}
        '''
        if self.cnt_c_received:
            # Just quit after a second
            # cnt-c:
            print(
                f"Hard quit. May wish to check for stray {self.training_script} processes"
            )
            sys.exit(1)

        self.cnt_c_received = True
        for process in self.gpu_manager.process_list():
            # If process is no longer running,
            # forget about it.
            # BUG FIX: original tested 'process.poll is not None',
            # i.e. the bound method object (always true), so no
            # process was ever signaled; poll() must be called:
            if process.poll() is not None:
                # Process dead:
                continue
            process.send_signal(signal.SIGTERM)
            process.wait()

    #------------------------------------
    # am_master_node
    #-------------------

    def am_master_node(self):
        '''
        This method allows this script to stay somewhat
        close to the Distributed Data Parallel sibling
        launch_birds_parallel(). For this script, though,
        every process is its own master.

        :return: always True
        :rtype: bool
        '''
        return True

    #------------------------------------
    # is_json_str
    #-------------------

    def is_json_str(self, str_to_check):
        '''
        Very primitive test whether a passed-in string is
        (legal) JSON or not.

        :param str_to_check: string to examine
        :type str_to_check: str
        :return: True/False
        :rtype: bool
        '''
        try:
            json5.loads(str_to_check)
        except JSONError:
            return False
        return True