def save_updated_snippet(self, outdir, species, snippet_path, spectro_arr, metadata):
    '''
    Create path name outdir/species/snippet-fname, and save
    the spectro_arr to that path as a .png file with embedded
    metadata.

    :param outdir: destination directory
    :type outdir: str
    :param species: species subdirectory under outdir
    :type species: str
    :param snippet_path: file name or absolute path to snippet
    :type snippet_path: str
    :param spectro_arr: image data
    :type spectro_arr: np.array
    :param metadata: auxiliary info to include in the .png file
    :type metadata: {str : str}
    '''

    # Save the updated snippet_path:
    species_subdir = os.path.join(outdir, species)
    snip_outname = os.path.join(species_subdir, os.path.basename(snippet_path))
    FileUtils.ensure_directory_existence(snip_outname)
    SoundProcessor.save_image(spectro_arr, snip_outname, metadata)
def materialize_model(self, model_path, gpu_to_use=0):
    model_fname = os.path.basename(model_path)

    # Extract model properties
    # from the model filename:
    self.model_props = FileUtils.parse_filename(model_fname)

    model = NetUtils.get_net(
        self.model_props['net_name'],
        num_classes=self.model_props['num_classes'],
        pretrained=False,
        freeze=0,
        to_grayscale=self.model_props['to_grayscale']
        )
    try:
        if torch.cuda.is_available():
            model.load_state_dict(torch.load(model_path))
            FileUtils.to_device(model, 'gpu', gpu_to_use)
        else:
            model.load_state_dict(torch.load(
                model_path,
                map_location=torch.device('cpu')
                ))
    except RuntimeError as e:
        emsg = repr(e)
        if emsg.find("size mismatch for conv1") > -1:
            emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
        raise RuntimeError(emsg) from e

    return model
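# Usage sketch (hypothetical, for illustration only): the model file name below is
# made up, and materialize_model() is assumed to be called on an Inferencer-style
# instance that owns it. The file name must encode the run properties that
# FileUtils.parse_filename() expects.
#
#     inferencer = Inferencer(...)          # hypothetical owner instance
#     model = inferencer.materialize_model(
#         'model_2021-03-11T10_59_02_net_resnet18_pretrain_0_lr_0.01_opt_SGD_bs_64_ks_7_folds_0_gray_True_classes_10.pth',
#         gpu_to_use=0)
#     model.eval()                          # switch to inference mode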
def get_dataloader(self, sample_width, sample_height):
    '''
    Returns a train and a validate dataloader
    '''
    IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp',
                      '.pgm', '.tif', '.tiff', '.webp')
    data_root = self.root_train_test_data

    transformation = FileUtils.get_image_transforms(sample_width,
                                                    sample_height,
                                                    to_grayscale=False)

    train_dataset = ImageFolder(
        os.path.join(data_root, 'train'),
        transformation,
        is_valid_file=lambda file: Path(file).suffix in IMG_EXTENSIONS)

    val_dataset = ImageFolder(
        os.path.join(data_root, 'validation'),
        transformation,
        is_valid_file=lambda file: Path(file).suffix in IMG_EXTENSIONS)

    train_loader = DataLoader(train_dataset,
                              batch_size=self.batch_size,
                              shuffle=True,
                              drop_last=True)

    val_loader = DataLoader(val_dataset,
                            batch_size=self.batch_size,
                            shuffle=True,
                            drop_last=True)

    return train_loader, val_loader
def sign_of_life(cls, job, num_already_present_imgs, outdir, start_time, force_rewrite=False):
    # Time for sign of life?
    now_time = datetime.datetime.now()
    time_duration = now_time - start_time
    # Print a sign of life every three seconds
    # (or whenever force_rewrite is set):
    if force_rewrite \
       or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):

        # A human readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(time_duration, granularity=4)

        # Get current and new spectro imgs in outdir:
        num_now_present_imgs = len(Utils.find_in_dir_tree(outdir, pattern="*.png"))
        num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs

        # Keep printing number of done snippets in the same
        # terminal line:
        print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
               f"({num_newly_present_imgs} new) after {duration_str}"),
              end='\r')
        return num_newly_present_imgs
    else:
        return num_already_present_imgs
def validate_split(self, step):
    '''
    Validate one split, using that split's validation
    fold. Return time taken. Record results for
    tensorboard and other record keeping.

    :param step: current combination of epoch and split
    :type step: int
    :return: time taken by the validation
    :rtype: datetime.timedelta
    '''
    # Validation

    self.log.debug(
        f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

    start_time = datetime.datetime.now()
    self.log.info(f"Starting validation for step {step}")

    self.model.eval()
    with torch.no_grad():
        for img_tensor, target in self.train_loader.validation_samples():
            expanded_img_tensor = unsqueeze(img_tensor, dim=0)
            expanded_target = unsqueeze(target, dim=0)

            # Update sanity record:
            self.class_coverage[int(target)]['val'] += 1

            images = FileUtils.to_device(expanded_img_tensor, 'gpu')
            label = FileUtils.to_device(expanded_target, 'gpu')

            outputs = self.model(images)
            loss = self.loss_fn(outputs, label)

            images = FileUtils.to_device(images, 'cpu')
            outputs = FileUtils.to_device(outputs, 'cpu')
            label = FileUtils.to_device(label, 'cpu')
            loss = FileUtils.to_device(loss, 'cpu')

            self.remember_results(LearningPhase.VALIDATING,
                                  step,
                                  outputs,
                                  label,
                                  loss)
            del images
            del outputs
            del label
            del loss
            torch.cuda.empty_cache()

    end_time = datetime.datetime.now()
    val_time_duration = end_time - start_time
    # A human readable duration string, down to minutes:
    duration_str = FileUtils.time_delta_str(val_time_duration, granularity=4)
    self.log.info(f"Done validation (duration: {duration_str})")

    return val_time_duration
def test_load_preds_and_labels(self):
    tally_coll = FileUtils.load_preds_and_labels(self.csv_data_path)

    # Expect four tallies from the two
    # rows in the csv file: each row has
    # a train and a val:
    self.assertEqual(len(tally_coll), 4)

    tally = tally_coll[0]
    self.assertEqual(tally.batch_size, 64)
    self.assertEqual(str(tally.phase), 'TRAINING')
def test_parse_filename(self):
    prop_dict = FileUtils.parse_filename(self.csv_data_path)

    self.assertEqual(prop_dict['timestamp'], '2021-03-11T10_59_02')
    self.assertEqual(prop_dict['net_name'], 'resnet18')
    self.assertEqual(prop_dict['pretrained'], True)
    self.assertEqual(prop_dict['lr'], 0.01)
    self.assertEqual(prop_dict['opt_name'], 'SGD')
    self.assertEqual(prop_dict['batch_size'], 64)
    self.assertEqual(prop_dict['kernel_size'], 7)
    self.assertEqual(prop_dict['num_folds'], 0)
    self.assertEqual(prop_dict['num_classes'], 10)
def _instantiate_model(self, run_path_str=None, config=None):
    '''
    Returns a model based on information in the
    config structure, or the info encoded in the
    run_path_str file name.

    One of run_path_str or config must be non-None.
    If both are non-None, uses config.

    File paths that encode run parameters look like
    this horror:

    model_2021-03-11T10_59_02_net_resnet18_pretrain_0_lr_0.01_opt_SGD_bs_64_ks_7_folds_0_gray_True_classes_10.pth

    :param run_path_str: a path name associated with a model.
    :type run_path_str: str
    :param config: run configuration structure
    :type config: NeuralNetConfig
    :return: a model
    :rtype: torch.nn.module
    '''
    if config is None:
        # Get a dict with info
        # in a standard (horrible) file name:
        fname_props = FileUtils.parse_filename(run_path_str)
    else:
        fname_props = config.Training
        data_root = config.Paths.root_train_test_data
        class_names = FileUtils.find_class_names(data_root)
        fname_props['classes'] = len(class_names)
        fname_props['pretrain'] = config.Training.getint('freeze', 0)

    model = NetUtils.get_net(net_name=fname_props['net_name'],
                             num_classes=fname_props['classes'],
                             freeze=fname_props['pretrain'],
                             to_grayscale=fname_props['to_grayscale']
                             )
    return model
def initialize_model(self):
    self.model = NetUtils.get_net(self.net_name,
                                  num_classes=self.num_classes,
                                  pretrained=self.pretrained,
                                  freeze=self.freeze,
                                  to_grayscale=self.to_grayscale)
    self.log.debug(
        f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

    FileUtils.to_device(self.model, 'gpu')

    self.log.debug(
        f"After model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

    self.opt_name = self.config.Training.get('optimizer', 'Adam')  # Default
    self.optimizer = self.get_optimizer(self.opt_name,
                                        self.model,
                                        self.lr)

    self.loss_fn = nn.CrossEntropyLoss()
    self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                          self.min_epochs)
def get_weights(cls, file_root):
    '''
    Given the root of a data file subtree, return a
    tensor of class weights. The order of the weights
    corresponds to the naturally sorted class names.

    :param file_root: full path to root of data file subtree
    :type file_root: str
    :return: weights in naturally sorted class order
    :rtype: Tensor
    '''
    # Full paths of all the non-dot-starting
    # dirs under file_root:
    #    OrderedDict{class_name : [Path(dir1), Path(dir2)]}
    # The class names are already sorted:
    class_name_paths_dir = FileUtils.find_class_paths(file_root)

    # Create:
    #    {'class1' : <num_samples>,
    #     'class2' : <num_samples>,
    #     ...
    #    }
    class_populations = {}
    for class_name in class_name_paths_dir.keys():
        num_samples = 0
        # Each class may have samples in multiple
        # directories; add them up:
        for class_dir in class_name_paths_dir[class_name]:
            num_samples += len([file_name
                                for file_name in os.listdir(class_dir)
                                if Path(file_name).suffix in FileUtils.IMG_EXTENSIONS
                                ])
        class_populations[class_name] = num_samples

    if len(class_populations) == 0:
        LoggingService().err(f"No target classes found under {file_root}")
        sys.exit(1)

    majority_class_population = max(class_populations.values())
    weights = []
    for class_name in class_name_paths_dir.keys():
        weights.append(class_populations[class_name] / majority_class_population)

    return torch.tensor(weights)
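# Usage sketch (hypothetical): the owning class name and data path below are made
# up for illustration; nn.CrossEntropyLoss(weight=...) is the standard PyTorch way
# to apply per-class weights, ordered like the naturally sorted class names above.
#
#     class_weights = SomeDatasetUtils.get_weights('/path/to/root_train_test_data/train')
#     weighted_loss_fn = nn.CrossEntropyLoss(weight=class_weights)
#     # weighted_loss_fn could then stand in for the unweighted loss_fn used elsewhere.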
def test_ellipsed_file_path(self):
    # File name too long even without
    # leading dirs:
    self.assertEqual(
        FileUtils.ellipsed_file_path(
            '/home/yatagait/birds/src/birdsong/recordings/CALL_XC482431-R024 white ruffed manakin.mp3'
            ),
        '/home/...CALL_XC482431-R024 white ruffed manakin.mp3')
    # Same without leading slash
    self.assertEqual(
        FileUtils.ellipsed_file_path(
            'home/yatagait/birds/src/birdsong/recordings/CALL_XC482431-R024 white ruffed manakin.mp3'
            ),
        'home/...CALL_XC482431-R024 white ruffed manakin.mp3')
    self.assertEqual(FileUtils.ellipsed_file_path('foobar'),
                     'foobar')
    self.assertEqual(FileUtils.ellipsed_file_path('foobar/fum'),
                     'foobar/fum')
    # Uneven num of dirs:
    self.assertEqual(
        FileUtils.ellipsed_file_path('foobar/bluebell/grayhound/'),
        'foobar/.../grayhound')
    # Even num of dirs
    self.assertEqual(
        FileUtils.ellipsed_file_path('blue/foobar/bluebell/grayhound/'),
        'blue/.../bluebell/grayhound')
    # Length just acceptable
    self.assertEqual(
        FileUtils.ellipsed_file_path('blue/foobar/grayhound/bar'),
        'blue/foobar/grayhound/bar')
    # Length one over acceptable
    self.assertEqual(
        FileUtils.ellipsed_file_path('Bblue/foobar/grayhound/bar'),
        'Bblue/.../grayhound/bar')
    # Absolute path:
    self.assertEqual(
        FileUtils.ellipsed_file_path('/Bblue/foobar/grayhound/bar'),
        '/Bblue/.../grayhound/bar')
def test_make_run_props_dict(self):
    conf = NeuralNetConfig(self.config_path)
    training_section = conf.Training

    # Create the expected result for ground truth:
    expected_dict = {}
    for short_name, long_name in FileUtils.fname_short_2_long.items():
        try:
            val = training_section[long_name]
            expected_dict[short_name] = val
        except KeyError:
            # Config file happens not to
            # have an entry for the long_name:
            expected_dict[short_name] = 'na'
            continue

    prop_dict = FileUtils.make_run_props_dict(training_section)
    self.assertDictEqual(prop_dict, expected_dict)
def find_num_classes(self, data_root):
    '''
    Expect two subdirectories under data_root:
    train and validation. Underneath each are
    further subdirectories whose names are the
    classes:

                train                     validation
        class1 class2 class3        class1 class2 class3
         imgs   imgs   imgs          imgs   imgs   imgs

    No error checking to confirm this structure.

    :param data_root: path to parent of train/validation
    :type data_root: str
    :return: number of unique classes as obtained
        from the directory names
    :rtype: int
    '''
    self.classes = FileUtils.find_class_names(data_root)
    return len(self.classes)
def test_chart_pr_curves(self):
    recall_axis = np.array([1, 2, 3, 4, 5, 6])
    curve_info = {
        1: {
            'recalls': recall_axis,
            'precisions': 2 * recall_axis,
            'avg_prec': 0.6,
            'best_op_pt': {
                'threshold': 0.6,
                'f1': 0.82,
                'rec': 2,
                'prec': 4,
                }
            },
        2: {
            'recalls': recall_axis,
            'precisions': 0.5 * recall_axis,
            'avg_prec': 0.8,
            }
        }

    num_classes, fig = ClassificationPlotter.chart_pr_curves(curve_info)

    self.assertEqual(num_classes, 2)
    self.assertEqual(len(fig.axes), 1)
    ax = fig.axes[0]
    self.assertEqual(ax.get_xlabel(), 'recall')
    self.assertEqual(ax.get_ylabel(), 'precision')

    # Allow the fig to show
    # before asking user to
    # check it (the pause()):
    fig.show()
    plt.pause(0.001)

    fig_ok = FileUtils.user_confirm(f"Fig should have 2 lines, one point, and a legend\n"
                                    f"Looks OK? (Y/n)")
    if not fig_ok:
        self.fail("PR curve was not correct")
def test_construct_filename(self):
    props = {'net_name': 'resnet18',
             'min_epochs': '3',
             'max_epochs': '6',
             'batch_size': '2',
             'num_folds': '3',
             'seed': '42',
             'kernel_size': '7',
             'sample_width': '400',
             'sample_height': '400',
             'lr': '0.01',
             'to_grayscale': 'False'
             }
    fname = FileUtils.construct_filename(props,
                                         prefix='model',
                                         suffix='.pth',
                                         incl_date=False)
    expected = 'model_net_resnet18_bs_2_folds_3_ks_7_lr_0.01_gray_False.pth'
    self.assertEqual(fname, expected)
def create_csv_writer(self, raw_data_dir):
    '''
    Create a csv_writer that will fill a csv file
    during training/validation as follows:

        epoch  train_preds  train_labels  val_preds  val_labels

    Cols after the integer 'epoch' col will each be
    an array of ints:

            train_preds      train_lbls   val_preds  val_lbls
        2, "[2,5,1,2,3]",  "[2,6,1,2,1]",  "[1,2]",  "[1,3]"

    If raw_data_dir is provided as a str, it is taken
    as the directory where csv file with predictions
    and labels are to be written. The dir is created if
    necessary.

    If the arg is instead set to True, a dir 'runs_raw_results'
    is created under this script's directory if it does not
    exist. Then a subdirectory is created for this run,
    using the hparam settings to build a file name. The dir
    is created if needed. Result ex.:

        <script_dir>
            runs_raw_results
                Run_lr_0.001_br_32
                    run_2021_05_ ... _lr_0.001_br_32.csv

    Then file name is created, again from the run
    hparam settings. If this file exists, user is asked
    whether to remove or append.

    The inst var self.csv_writer is initialized to:
       o None if csv file exists, but is not to
         be overwritten nor appended-to
       o A file descriptor for a file open for either
         'write' or 'append'.

    :param raw_data_dir: If simply True, create dir and file names
        from hparams, and create as needed. If a string, it is
        assumed to be the directory where a .csv file is to be
        created. If None, self.csv_writer is set to None.
    :type raw_data_dir: {None | True | str}
    :return: CSV writer ready for action. Set either to
        write a fresh file, or append to an existing file.
        Unless file exists, and user decided not to overwrite.
    :rtype: {None | csv.writer}
    '''
    # Ensure the csv file root dir exists if
    # we'll do a csv dir and run-file below it:

    if type(raw_data_dir) == str:
        raw_data_root = raw_data_dir
    else:
        raw_data_root = os.path.join(self.curr_dir, 'runs_raw_results')

    if not os.path.exists(raw_data_root):
        os.mkdir(raw_data_root)

    # Can rely on raw_data_root being defined and existing:

    if raw_data_dir is None:
        return None

    # Create both a raw dir sub-directory and a .csv file
    # for this run:
    csv_subdir_name = FileUtils.construct_filename(self.config.Training,
                                                   prefix='Run',
                                                   incl_date=True)
    os.makedirs(csv_subdir_name)

    # Create a csv file name:
    csv_file_nm = FileUtils.construct_filename(self.config.Training,
                                               prefix='run',
                                               suffix='.csv',
                                               incl_date=True)

    csv_path = os.path.join(raw_data_root, csv_file_nm)

    # Get csv_raw_fd appropriately:

    if os.path.exists(csv_path):
        do_overwrite = FileUtils.user_confirm(f"File {csv_path} exists; overwrite?",
                                              default='N')
        if not do_overwrite:
            do_append = FileUtils.user_confirm(f"Append instead?", default='N')
            if not do_append:
                return None
            else:
                mode = 'a'
        else:
            mode = 'w'
    else:
        mode = 'w'

    csv_writer = CSVWriterCloseable(csv_path,
                                    mode=mode,
                                    delimiter=',')

    header = ['epoch', 'train_preds', 'train_labels', 'val_preds', 'val_labels']
    csv_writer.writerow(header)

    return csv_writer
args = parser.parse_args()

if type(args.device) != list:
    args.device = [args.device]

# Expand Unix wildcards, tilde, and env
# vars in the model paths:
if type(args.model_paths) != list:
    model_paths_raw = [args.model_paths]
else:
    model_paths_raw = args.model_paths

model_paths = []
for fname in model_paths_raw:
    model_paths.extend(FileUtils.expand_filename(fname))

# Same for samples path, though we only allow
# one of those paths.
samples_path = FileUtils.expand_filename(args.samples_path)[0]

# Ensure that the file arrangements are as required by
# the ImageFolder class:
#
#         <root_dir>
#     img_folder_1   img_folder_2  ...  img_folder_n
#       img_file       img_file          img_file
#       img_file       img_file          img_file
#         ...            ...               ...

dir_struct_desc = f"Samples must be in *sub*directories with image files under {samples_path}"
for root, dirs, _files in os.walk(samples_path):
def train(self):

    overall_start_time = datetime.datetime.now()
    # Just for sanity: keep track
    # of number of batches...
    total_batch_num = 0

    # Note: since we are cross validating, the
    # data loader's set_epoch() method is only
    # called once (automatically) during instantiation
    # of the associated sampler. Moving from split
    # to split includes shuffling if the caller
    # specified that.

    # Training
    for split_num in range(self.train_loader.num_folds):

        split_start_time = datetime.datetime.now()
        self.initialize_model()
        for epoch in range(self.max_epochs):

            # Set model to train mode:
            self.model.train()

            epoch_start_time = datetime.datetime.now()

            self.log.info(f"Starting epoch {epoch} training")

            # Sanity check record: will record
            # how many samples from each class were
            # used:
            self.class_coverage = {}

            # Sanity records: will record number
            # of samples of each class that are used
            # during training and validation:
            label_distrib = {}
            batch_num = 0

            self.log.info(
                f"Train epoch {epoch}/{self.max_epochs} split {split_num}/{self.train_loader.num_folds}"
                )
            try:
                for batch, targets in self.train_loader:
                    # Update the sanity check
                    # num of batches seen, and distribution
                    # of samples across classes:
                    batch_num += 1
                    total_batch_num += 1

                    # Update sanity check records:
                    for lbl in targets:
                        lbl = int(lbl)
                        try:
                            label_distrib[lbl] += 1
                        except KeyError:
                            label_distrib[lbl] = 1
                        try:
                            self.class_coverage[lbl]['train'] += 1
                        except KeyError:
                            self.class_coverage[lbl] = {'train': 1, 'val': 0}

                    self.log.debug(
                        f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                    images = FileUtils.to_device(batch, 'gpu')
                    labels = FileUtils.to_device(targets, 'gpu')

                    outputs = self.model(images)
                    loss = self.loss_fn(outputs, labels)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                    # Remember the last batch's train result of this
                    # split (results for earlier batches of
                    # the same split will be overwritten). This statement
                    # must sit before deleting outputs and labels:
                    step_num = self.step_number(epoch, split_num, self.num_folds)
                    self.remember_results(LearningPhase.TRAINING,
                                          step_num,
                                          outputs,
                                          labels,
                                          loss)

                    self.log.debug(
                        f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    del images
                    del outputs
                    del labels
                    del loss
                    torch.cuda.empty_cache()

                    self.log.debug(
                        f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

            except EndOfSplit:

                end_time = datetime.datetime.now()
                train_time_duration = end_time - epoch_start_time
                # A human readable duration string, down to minutes:
                duration_str = FileUtils.time_delta_str(train_time_duration,
                                                        granularity=4)

                self.log.info(
                    f"Done training epoch {epoch} of split {split_num} (duration: {duration_str})"
                    )

                #***********
                #print(f"****** num_batches in split: {batch_num}" )
                #print(f"****** LblDist: {label_distrib}")
                #***********
                self.validate_split(step_num)
                self.visualize_step(step_num)
                # Save model, keeping self.model_archive_size models:
                self.model_archive.save_model(self.model, epoch)

                self.log.debug(
                    f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                # Next Epoch
                continue

        end_time = datetime.datetime.now()
        train_time_duration = end_time - split_start_time
        # A human readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(train_time_duration,
                                                granularity=4)

        self.log.info(f"Done training split {split_num} (duration: {duration_str})")

        # Next split
        continue

    end_time = datetime.datetime.now()
    epoch_duration = end_time - epoch_start_time
    epoch_dur_str = FileUtils.time_delta_str(epoch_duration, granularity=4)

    cumulative_dur = end_time - overall_start_time
    cum_dur_str = FileUtils.time_delta_str(cumulative_dur, granularity=4)

    msg = f"Done epoch {epoch} (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
    self.log.info(msg)

    #******self.scheduler.step()

    # Fresh results tallying
    #self.results.clear()

    self.log.info(f"Training complete after {self.train_loader.num_folds} splits")

    # Report the sanity checks:
    self.log.info(f"Total batches processed: {total_batch_num}")
    for cid in self.class_coverage.keys():
        train_use, val_use = self.class_coverage[cid].values()
        self.log.info(
            f"{self.class_names[cid]} Training: {train_use}, Validation: {val_use}"
            )

    # All seems to have gone well. Report the
    # overall result of the final epoch for the
    # hparams config used in this process:

    self.report_hparams_summary(self.latest_result)

    # The final epoch number:
    return epoch
def prep_model_inference(self, model_path):
    '''
    1. Parses model_path into its components, and creates
       a dict: self.model_props, which contains the network
       type, grayscale or not, whether pretrained, etc.
    2. Creates self.csv_writer to write results measures
       into csv files. The destination file is determined
       as follows:
           <script_dir>/runs_raw_inferences/inf_csv_results_<datetime>/<model-props-derived-fname>.csv
    3. Creates self.writer, a tensorboard writer with destination dir:
           <script_dir>/runs_inferences/inf_results_<datetime>
    4. Creates an ImageFolder-style dataset over self.samples_path
    5. Creates a shuffling DataLoader
    6. Initializes self.num_classes and self.class_names
    7. Creates self.model from the passed-in model_path name

    :param model_path: path to model that will be used for
        inference by this instance of Inferencer
    :type model_path: str
    '''
    model_fname = os.path.basename(model_path)

    # Extract model properties
    # from the model filename:
    self.model_props = FileUtils.parse_filename(model_fname)

    csv_results_root = os.path.join(self.curr_dir, 'runs_raw_inferences')
    #self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{uuid.uuid4().hex}")
    ts = FileUtils.file_timestamp()
    self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{ts}")
    os.makedirs(self.csv_dir, exist_ok=True)

    csv_file_nm = FileUtils.construct_filename(self.model_props,
                                               prefix='inf',
                                               suffix='.csv',
                                               incl_date=True)
    csv_path = os.path.join(self.csv_dir, csv_file_nm)

    self.csv_writer = CSVWriterCloseable(csv_path)

    ts = FileUtils.file_timestamp()
    tensorboard_root = os.path.join(self.curr_dir, 'runs_inferences')
    tensorboard_dest = os.path.join(tensorboard_root,
                                    f"inf_results_{ts}")
                                    #f"inf_results_{ts}{uuid.uuid4().hex}")
    os.makedirs(tensorboard_dest, exist_ok=True)

    self.writer = SummaryWriterPlus(log_dir=tensorboard_dest)

    dataset = SingleRootImageDataset(self.samples_path,
                                     to_grayscale=self.model_props['to_grayscale'])

    # Make reproducible:
    Utils.set_seed(42)
    #********Utils.set_seed(56)
    self.loader = DataLoader(dataset,
                             batch_size=self.batch_size,
                             shuffle=True,
                             drop_last=True)
    self.class_names = dataset.class_names()
    self.num_classes = len(self.class_names)

    # Get the right type of model;
    # don't bother getting it pretrained
    # or freezing it, b/c we will overwrite
    # the weights:
    self.model = NetUtils.get_net(
        self.model_props['net_name'],
        num_classes=self.num_classes,
        pretrained=False,
        freeze=0,
        to_grayscale=self.model_props['to_grayscale'])

    self.log.info(f"Tensorboard info written to {tensorboard_dest}")
    self.log.info(f"Result measurement CSV file(s) written to {csv_path}")
def save_model(self, model, epoch):
    '''
    Saves and retains trained models
    on disk.

    Within a subdir the method maintains a queue
    of files of len history_len:

        fname_1_ep_0.pth
        fname_2_ep_1.pth
            ...
        fname_<history_len>.pth

    where ep_<n> is the epoch during training
    where the model of that moment is being saved.

    When history_len model files are already present,
    removes the oldest.

    Assumptions:
        o self.fname_els_dict contains prop/value pairs
          for use in FileUtils.construct_filename()
              {'bs' : 32,
               'lr' : 0.001,
                ...
              }
        o self.model_fnames is a deque the size of which
          indicates how many models to save
          before discarding the oldest one as new ones are added

    :param model: model to save
    :type model: nn.module
    :param epoch: the epoch that created the model
    :type epoch: int
    '''
    deque_len = len(self.model_fnames)
    if deque_len >= self.history_len:
        # Pushing a new model fname to the
        # front will pop the oldest from the
        # end. That file needs to be deleted:
        oldest_model_path = self.model_fnames[-1]
    else:
        # No file will need to be deleted.
        # Still filling our allotment:
        oldest_model_path = None

    model_fname = FileUtils.construct_filename(self.fname_els_dict,
                                               prefix='mod',
                                               suffix=f"_ep{epoch}.pth",
                                               incl_date=True)

    model_path = os.path.join(self.run_subdir, model_fname)

    # As recommended by pytorch, save the
    # state_dict for portability:
    torch.save(model.state_dict(), model_path)

    self.model_fnames.appendleft(model_path)

    if oldest_model_path is not None:
        try:
            os.remove(oldest_model_path)
        except Exception as e:
            self.log.warn(f"Could not remove old model: {repr(e)}")
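# Restore sketch (hypothetical): the checkpoint file name and net parameters below
# are made up; since save_model() stores only the state_dict, reloading requires
# first rebuilding a matching net (e.g. via NetUtils.get_net()) and then calling
# load_state_dict():
#
#     model = NetUtils.get_net('resnet18',
#                              num_classes=10,
#                              pretrained=False,
#                              freeze=0,
#                              to_grayscale=True)
#     model.load_state_dict(torch.load('mod_2021-03-23T15_38_39_net_resnet18_ep3.pth',
#                                      map_location=torch.device('cpu')))
#     model.eval()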
def _construct_run_subdir(self, config, num_classes, model_root):
    '''
    Constructs a directory name composed of
    elements specified in utility.py's FileUtils
    file/config info dicts.

    Ensures that <model_root>/subdir_name does not
    exist. If it does, keeps adding '_r<n>' to the
    end of the dir name.

    Final str will look like this:

    model_2021-03-23T15_38_39_net_resnet18_pre_True_frz_6_bs_2_folds_5_opt_SGD_ks_7_lr_0.01_gray_False

    Details will depend on the passed in configuration.

    Instance var fname_els_dict will contain all
    run attr/values needed for calls to FileUtils.construct_filename()

    :param config: run configuration
    :type config: NeuralNetConfig
    :param num_classes: number of target classes
    :type num_classes: int
    :param model_root: full path to dir where the
        subdir is to be created
    :type model_root: str
    :return: unique subdir name of self.model_root,
        which has been created
    :rtype: str
    '''
    # Using config, gather run-property/value
    # pairs to include in the dir name:

    fname_els_dict = {}

    section_dict = config.Training

    for el_name, el_abbr in FileUtils.fname_long_2_short.items():

        el_type = FileUtils.fname_el_types[el_abbr]
        if el_type == int:
            fname_els_dict[el_name] = section_dict.getint(el_name)
        elif el_type == str:
            fname_els_dict[el_name] = section_dict.get(el_name)
        elif el_type == float:
            fname_els_dict[el_name] = section_dict.getfloat(el_name)
        elif el_type == bool:
            fname_els_dict[el_name] = section_dict.getboolean(el_name)
        elif callable(el_type):
            # A lambda or func. Apply it:
            fname_els_dict[el_name] = el_type(section_dict[el_name])

    fname_els_dict['num_classes'] = num_classes

    # Save this root name:
    self.fname_els_dict = fname_els_dict

    # Get the subdir name (without leading path):
    dir_basename = FileUtils.construct_filename(fname_els_dict,
                                                prefix='models',
                                                suffix=None,
                                                incl_date=True)

    final_dir_path = os.path.join(model_root, dir_basename)

    # Disambiguate by appending '_r<n>' as needed:
    disambiguation = 1
    while os.path.exists(final_dir_path):
        new_basename = f"{dir_basename}_r{disambiguation}"
        final_dir_path = os.path.join(model_root, new_basename)
        disambiguation += 1

    os.makedirs(final_dir_path)

    return final_dir_path
def run_inference(self, gpu_to_use=0):
    '''
    Runs model over dataloader. Along
    the way: creates ResultTally for each
    batch, and maintains dict instance variable
    self.raw_results for later conversion of
    logits to class IDs under different threshold
    assumptions.

        self.raw_results:
            {'all_outputs' : <arr>,
             'all_labels'  : <arr>
             }

    Returns a ResultCollection with the
    ResultTally instances of each batch.

    :param gpu_to_use: which GPU to deploy to (if it is available)
    :type gpu_to_use: int
    :return: collection of tallies, one for each batch,
        or None if something went wrong.
    :rtype: {None | ResultCollection}
    '''
    # Just in case the loop never runs:
    batch_num = -1
    overall_start_time = datetime.datetime.now()

    try:
        try:
            if torch.cuda.is_available():
                self.model.load_state_dict(torch.load(self.model_path))
                FileUtils.to_device(self.model, 'gpu', gpu_to_use)
            else:
                self.model.load_state_dict(
                    torch.load(self.model_path,
                               map_location=torch.device('cpu')))
        except RuntimeError as e:
            emsg = repr(e)
            if emsg.find("size mismatch for conv1") > -1:
                emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
            raise RuntimeError(emsg) from e

        loss_fn = nn.CrossEntropyLoss()

        result_coll = ResultCollection()

        # Save all per-class logits for ability
        # later to use different thresholds for
        # conversion to class IDs:
        all_outputs = []
        all_labels = []

        self.model.eval()
        num_test_samples = len(self.loader.dataset)
        self.log.info(f"Begin inference ({num_test_samples} test samples)...")

        samples_processed = 0

        loop_start_time = overall_start_time
        with torch.no_grad():

            for batch_num, (batch, targets) in enumerate(self.loader):
                if torch.cuda.is_available():
                    images = FileUtils.to_device(batch, 'gpu')
                    labels = FileUtils.to_device(targets, 'gpu')
                else:
                    images = batch
                    labels = targets

                outputs = self.model(images)
                loss = loss_fn(outputs, labels)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                labels = FileUtils.to_device(labels, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                #**********
                max_logit = outputs[0].max().item()
                max_idx = (outputs.squeeze() == max_logit).nonzero(as_tuple=False).item()
                smpl_id = torch.utils.data.dataloader.sample_id_seq[-1]
                lbl = labels[0].item()
                pred_cl = max_idx
                self.curr_dict[smpl_id] = (smpl_id, lbl, pred_cl)
                #**********

                # Specify the batch_num in place
                # of an epoch, which is not applicable
                # during testing:
                tally = ResultTally(batch_num,
                                    LearningPhase.TESTING,
                                    outputs,
                                    labels,
                                    loss,
                                    self.num_classes,
                                    self.batch_size)
                result_coll.add(tally, step=None)

                all_outputs.append(outputs)
                all_labels.append(labels)

                samples_processed += len(labels)

                del images
                del outputs
                del labels
                del loss

                torch.cuda.empty_cache()

                time_now = datetime.datetime.now()
                # Sign of life every 6 seconds:
                if (time_now - loop_start_time).seconds >= 5:
                    self.log.info(
                        f"GPU{gpu_to_use} processed {samples_processed}/{num_test_samples} samples"
                        )
                    loop_start_time = time_now
    finally:

        #*********
        print(f"Sample seq: {torch.utils.data.dataloader.sample_id_seq}")
        torch.utils.data.dataloader.sample_id_seq = []
        #*********

        time_now = datetime.datetime.now()
        test_time_duration = time_now - overall_start_time
        # A human readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(test_time_duration,
                                                granularity=4)
        self.log.info(
            f"Done with inference: {samples_processed} test samples; {duration_str}"
            )
        # Total number of batches we ran:
        num_batches = 1 + batch_num  # b/c of zero-base

        # If loader delivered nothing, the loop
        # never ran; warn, and get out:
        if num_batches == 0:
            self.log.warn(
                f"Dataloader delivered no data from {self.samples_path}")
            self.close()
            return None

        # Var all_outputs is now:
        #  [tensor([pred_cl0, pred_cl1, ..., pred_cl<num_classes - 1>]),  # For sample0
        #   tensor([pred_cl0, pred_cl1, ..., pred_cl<num_classes - 1>]),  # For sample1
        #   ...
        #  ]
        # Make into one tensor: (num_batches, batch_size, num_classes),
        # unless an exception was raised at some point,
        # throwing us into this finally clause:
        if len(all_outputs) == 0:
            self.log.info(f"No outputs were produced; thus no results to report")
            return None

        self.all_outputs_tn = torch.stack(all_outputs)
        # Be afraid...be very afraid:
        assert(self.all_outputs_tn.shape ==
               torch.Size([num_batches, self.batch_size, self.num_classes])
               )

        # Var all_labels is now num-batches tensors,
        # each containing batch_size labels:
        assert(len(all_labels) == num_batches)

        # list of single-number tensors. Make
        # into one tensor:
        self.all_labels_tn = torch.stack(all_labels)
        assert(self.all_labels_tn.shape ==
               torch.Size([num_batches, self.batch_size])
               )
        # And equivalently:
        assert(self.all_labels_tn.shape ==
               (self.all_outputs_tn.shape[0],
                self.all_outputs_tn.shape[1]
                )
               )

        self.report_results(result_coll)
        self.close()

        return result_coll
def __init__(self, config_info, debugging=False):
    '''
    Constructor
    '''
    self.log = LoggingService()
    if debugging:
        self.log.logging_level = DEBUG

    self.curr_dir = os.path.dirname(os.path.abspath(__file__))

    try:
        self.config = self.initialize_config_struct(config_info)
    except Exception as e:
        msg = f"During config init: {repr(e)}"
        self.log.err(msg)
        raise RuntimeError(msg) from e

    try:
        self.root_train_test_data = self.config.getpath(
            'Paths',
            'root_train_test_data',
            relative_to=self.curr_dir)
    except ValueError as e:
        raise ValueError(
            "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

    self.batch_size = self.config.getint('Training', 'batch_size')
    self.kernel_size = self.config.getint('Training', 'kernel_size')
    self.min_epochs = self.config.Training.getint('min_epochs')
    self.max_epochs = self.config.Training.getint('max_epochs')
    self.lr = self.config.Training.getfloat('lr')
    self.net_name = self.config.Training.net_name
    self.pretrained = self.config.Training.getboolean('pretrained', False)
    self.freeze = self.config.Training.getint('freeze', 0)
    self.to_grayscale = self.config.Training.getboolean('to_grayscale', True)

    self.set_seed(42)

    self.log.info("Parameter summary:")
    self.log.info(f"network {self.net_name}")
    self.log.info(f"pretrained {self.pretrained}")
    if self.pretrained:
        self.log.info(f"freeze {self.freeze}")
    self.log.info(f"min epochs {self.min_epochs}")
    self.log.info(f"max epochs {self.max_epochs}")
    self.log.info(f"batch_size {self.batch_size}")

    self.fastest_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.num_classes = self.find_num_classes(self.root_train_test_data)

    self.model = NetUtils.get_net(self.net_name,
                                  num_classes=self.num_classes,
                                  pretrained=self.pretrained,
                                  freeze=self.freeze,
                                  to_grayscale=self.to_grayscale)
    self.log.debug(
        f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

    FileUtils.to_device(self.model, 'gpu')

    self.log.debug(
        f"After model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

    # No cross validation:
    self.folds = 0

    self.opt_name = self.config.Training.get('optimizer', 'Adam')  # Default
    self.optimizer = self.get_optimizer(self.opt_name,
                                        self.model,
                                        self.lr)

    self.loss_fn = nn.CrossEntropyLoss()
    self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                          self.min_epochs)

    sample_width = self.config.getint('Training', 'sample_width', 400)
    sample_height = self.config.getint('Training', 'sample_height', 400)
    self.train_loader, self.val_loader = self.get_dataloader(sample_width,
                                                             sample_height)
    self.class_names = self.train_loader.dataset.classes

    log_dir = os.path.join(self.curr_dir, 'runs')
    raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')
    self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

    # Log a few example spectrograms to tensorboard;
    # one per class:
    TensorBoardPlotter.write_img_grid(self.writer,
                                      self.root_train_test_data,
                                      len(self.class_names),  # Num of train examples
                                      )

    # All ResultTally instances are
    # collected here: two per epoch, one
    # for all training loop runs, and one
    # for all val loop runs:
    self.step_results = ResultCollection()

    self.log.debug(
        f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

    try:
        final_epoch = self.train()
        self.visualize_final_epoch_results(final_epoch)
    finally:
        self.close_tensorboard()
def train(self):

    overall_start_time = datetime.datetime.now()

    for epoch in range(self.max_epochs):

        self.log.info(f"Starting epoch {epoch} training")
        start_time = datetime.datetime.now()

        # Set model to train mode:
        self.model.train()

        # Training
        for batch, targets in self.train_loader:

            self.log.debug(
                f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

            images = FileUtils.to_device(batch, 'gpu')
            labels = FileUtils.to_device(targets, 'gpu')

            outputs = self.model(images)
            loss = self.loss_fn(outputs, labels)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            self.log.debug(
                f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

            images = FileUtils.to_device(images, 'cpu')
            outputs = FileUtils.to_device(outputs, 'cpu')
            labels = FileUtils.to_device(labels, 'cpu')
            loss = FileUtils.to_device(loss, 'cpu')

            self.remember_results(LearningPhase.TRAINING,
                                  epoch,
                                  outputs,
                                  labels,
                                  loss)
            del images
            del outputs
            del labels
            del loss
            torch.cuda.empty_cache()

            self.log.debug(
                f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

        # Validation

        end_time = datetime.datetime.now()
        train_time_duration = end_time - start_time
        # A human readable duration string, down to minutes:
        duration_str = self.time_delta_str(train_time_duration, granularity=4)

        self.log.info(f"Done epoch {epoch} training (duration: {duration_str})")

        self.log.debug(
            f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
            )

        start_time = datetime.datetime.now()
        self.log.info(f"Starting epoch {epoch} validation")

        self.model.eval()
        with torch.no_grad():
            for batch, targets in self.val_loader:
                images = FileUtils.to_device(batch, 'gpu')
                labels = FileUtils.to_device(targets, 'gpu')

                outputs = self.model(images)
                loss = self.loss_fn(outputs, labels)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                labels = FileUtils.to_device(labels, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                self.remember_results(LearningPhase.VALIDATING,
                                      epoch,
                                      outputs,
                                      labels,
                                      loss)
                del images
                del outputs
                del labels
                del loss
                torch.cuda.empty_cache()

        self.log.debug(
            f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
            )

        end_time = datetime.datetime.now()
        val_time_duration = end_time - start_time
        # A human readable duration string, down to minutes:
        duration_str = self.time_delta_str(val_time_duration, granularity=4)
        self.log.info(f"Done validation (duration: {duration_str})")

        epoch_duration = train_time_duration + val_time_duration
        epoch_dur_str = self.time_delta_str(epoch_duration, granularity=4)

        cumulative_dur = end_time - overall_start_time
        cum_dur_str = self.time_delta_str(cumulative_dur, granularity=4)

        msg = f"Done epoch {epoch} (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
        self.log.info(msg)

        # Save model, keeping self.model_archive_size models:
        self.model_archive.save_model(self.model, epoch)

        self.scheduler.step()

        self.visualize_step(epoch)
        # Fresh results tallying
        self.results.clear()

        # Back around to next epoch

    self.log.info(f"Training complete after {epoch + 1} epochs")

    # All seems to have gone well. Report the
    # overall result of the final epoch for the
    # hparams config used in this process:

    self.report_hparams_summary(self.latest_result)

    # The final epoch number:
    return epoch