    def save_updated_snippet(self, outdir, species, snippet_path, spectro_arr,
                             metadata):
        '''
        Create path name:

            outdir/species/snippet-fname

        and save the spectro_arr to that path
        as a .png file with embedded metadata.

        :param outdir: destination directory
        :type outdir: str
        :param species: species name; used as the subdirectory under outdir
        :type species: str
        :param snippet_path: file name or absolute path to snippet
        :type snippet_path: str
        :param spectro_arr: image data
        :type spectro_arr: np.array
        :param metadata: auxiliary info to include in the .png file
        :type metadata: {str : str}
        '''

        # Save the updated snippet_path:
        species_subdir = os.path.join(outdir, species)
        snip_outname = os.path.join(species_subdir,
                                    os.path.basename(snippet_path))
        FileUtils.ensure_directory_existence(snip_outname)
        SoundProcessor.save_image(spectro_arr, snip_outname, metadata)
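The metadata embedding itself happens inside SoundProcessor.save_image(), which is not shown in this listing. As a rough illustration of how string metadata can be written into a .png, here is a minimal sketch using Pillow's PngInfo text chunks; the helper name and the Pillow-based approach are assumptions, not necessarily what SoundProcessor does.

import numpy as np
from PIL import Image
from PIL.PngImagePlugin import PngInfo

def save_png_with_metadata(spectro_arr, out_path, metadata):
    # Pack each key/value pair into a PNG text chunk,
    # then save the array as an 8-bit image:
    info = PngInfo()
    for key, val in metadata.items():
        info.add_text(key, str(val))
    Image.fromarray(np.uint8(spectro_arr)).save(out_path, pnginfo=info)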
Example 2
    def materialize_model(self, model_path, gpu_to_use=0):

        model_fname = os.path.basename(model_path)
        
        # Extract model properties
        # from the model filename:
        self.model_props  = FileUtils.parse_filename(model_fname)
        model = NetUtils.get_net(
            self.model_props['net_name'],
            num_classes=self.model_props['num_classes'],
            pretrained=False,
            freeze=0,
            to_grayscale=self.model_props['to_grayscale']
            )

        try:
            if torch.cuda.is_available():
                model.load_state_dict(torch.load(model_path))
                FileUtils.to_device(model, 'gpu', gpu_to_use)
            else:
                model.load_state_dict(torch.load(
                    model_path,
                    map_location=torch.device('cpu')
                    ))
        except RuntimeError as e:
            emsg = repr(e)
            if emsg.find("size mismatch for conv1") > -1:
                emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
            raise RuntimeError(emsg) from e

        return model
Example 3
    def get_dataloader(self, sample_width, sample_height):
        '''
        Returns a train and a validation dataloader, built
        from the 'train' and 'validation' subdirectories
        under self.root_train_test_data.
        '''
        IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm',
                          '.tif', '.tiff', '.webp')
        data_root = self.root_train_test_data

        transformation = FileUtils.get_image_transforms(sample_width,
                                                        sample_height,
                                                        to_grayscale=False)

        train_dataset = ImageFolder(
            os.path.join(data_root, 'train'),
            transformation,
            is_valid_file=lambda file: Path(file).suffix in IMG_EXTENSIONS)

        val_dataset = ImageFolder(
            os.path.join(data_root, 'validation'),
            transformation,
            is_valid_file=lambda file: Path(file).suffix in IMG_EXTENSIONS)
        train_loader = DataLoader(train_dataset,
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  drop_last=True)

        val_loader = DataLoader(val_dataset,
                                batch_size=self.batch_size,
                                shuffle=True,
                                drop_last=True)

        return train_loader, val_loader
Example 4
    def sign_of_life(cls,
                     job,
                     num_already_present_imgs,
                     outdir,
                     start_time,
                     force_rewrite=False):

        # Time for sign of life?
        now_time = datetime.datetime.now()
        time_duration = now_time - start_time
        # Report every 3 seconds, i.e. whenever the
        # elapsed seconds are a non-zero multiple of 3:
        if force_rewrite \
           or (time_duration.seconds > 0 and time_duration.seconds % 3 == 0):

            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(time_duration,
                                                    granularity=4)

            # Get current and new spectro imgs in outdir:
            num_now_present_imgs = len(
                Utils.find_in_dir_tree(outdir, pattern="*.png"))
            num_newly_present_imgs = num_now_present_imgs - num_already_present_imgs

            # Keep printing number of done snippets in the same
            # terminal line:
            print((f"{job.name}---Number of spectros: {num_now_present_imgs} "
                   f"({num_newly_present_imgs} new) after {duration_str}"),
                  end='\r')
            return num_newly_present_imgs
        else:
            return num_already_present_imgs
Example 5
    def validate_split(self, step):
        '''
        Validate one split, using that split's 
        validation fold. Return time taken. Record
        results for tensorboard and other record keeping.
        
        :param step: current combination of epoch and 
            split
        :type step: int
        :return: time taken by the validation
        :rtype: datetime.timedelta
        '''
        # Validation

        self.log.debug(
            f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        start_time = datetime.datetime.now()
        self.log.info(f"Starting validation for step {step}")

        self.model.eval()
        with torch.no_grad():
            for img_tensor, target in self.train_loader.validation_samples():
                expanded_img_tensor = unsqueeze(img_tensor, dim=0)
                expanded_target = unsqueeze(target, dim=0)

                # Update sanity record:
                self.class_coverage[int(target)]['val'] += 1

                images = FileUtils.to_device(expanded_img_tensor, 'gpu')
                label = FileUtils.to_device(expanded_target, 'gpu')

                outputs = self.model(images)
                loss = self.loss_fn(outputs, label)

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                label = FileUtils.to_device(label, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                self.remember_results(LearningPhase.VALIDATING, step, outputs,
                                      label, loss)
                del images
                del outputs
                del label
                del loss
                torch.cuda.empty_cache()

        end_time = datetime.datetime.now()
        val_time_duration = end_time - start_time
        # A human-readable duration string, down to minutes:
        duration_str = FileUtils.time_delta_str(val_time_duration,
                                                granularity=4)
        self.log.info(f"Done validation (duration: {duration_str})")

        return val_time_duration
Example 6
    def test_load_preds_and_labels(self):

        tally_coll = FileUtils.load_preds_and_labels(self.csv_data_path)

        # Expect four tallies from the two
        # rows in the csv file: each row has
        # a train and a val:

        self.assertEqual(len(tally_coll), 4)
        tally = tally_coll[0]
        self.assertEqual(tally.batch_size, 64)
        self.assertEqual(str(tally.phase), 'TRAINING')
Example 7
    def test_parse_filename(self):

        prop_dict = FileUtils.parse_filename(self.csv_data_path)

        self.assertEqual(prop_dict['timestamp'], '2021-03-11T10_59_02')
        self.assertEqual(prop_dict['net_name'], 'resnet18')
        self.assertEqual(prop_dict['pretrained'], True)
        self.assertEqual(prop_dict['lr'], 0.01)
        self.assertEqual(prop_dict['opt_name'], 'SGD')
        self.assertEqual(prop_dict['batch_size'], 64)
        self.assertEqual(prop_dict['kernel_size'], 7)
        self.assertEqual(prop_dict['num_folds'], 0)
        self.assertEqual(prop_dict['num_classes'], 10)
Example 8
    def _instantiate_model(self, run_path_str=None, config=None):
        '''
        Returns a model based on information in
        the config structure, or the info encoded
        in the run_path_str file name.

        One of run_path_str or config must be non-None.
        If both are non-None, uses config.

        File paths that encode run parameters look like
        this horror:

        model_2021-03-11T10_59_02_net_resnet18_pretrain_0_lr_0.01_opt_SGD_bs_64_ks_7_folds_0_gray_True_classes_10.pth

        :param run_path_str: a path name associated with
            a model.
        :type run_path_str: str
        :param config: run configuration structure
        :type config: NeuralNetConfig
        :return: a model
        :rtype: torch.nn.Module
        '''
        if config is None:
            # Get a dict with info
            # in a standard (horrible) file name:
            fname_props = FileUtils.parse_filename(run_path_str)
        else:
            fname_props = config.Training
            data_root   = config.Paths.root_train_test_data
            class_names = FileUtils.find_class_names(data_root)
            fname_props['classes'] = len(class_names)
            fname_props['pretrain'] = config.Training.getint('freeze', 0)

        model = NetUtils.get_net(net_name=fname_props['net_name'],
                                 num_classes=fname_props['classes'],
                                 freeze=fname_props['pretrain'],
                                 to_grayscale=fname_props['to_grayscale']
                                 )
        return model
Example 9
    def initialize_model(self):
        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Before after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model, self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)
Example 10
    def get_weights(cls, file_root):
        '''
        Given the root of a data directory subtree,
        return a tensor of class weights: each class'
        sample count divided by the majority class'
        count. The order of the weights corresponds
        to the naturally sorted class names.

        :param file_root: full path to root
            of data file subtree
        :type file_root: str
        :return: weights in naturally sorted class order
        :rtype: Tensor
        '''

        
        # Full paths of all the non-dot-starting 
        # dirs under file_root:

        #   OrderedDict{class_name : [Path(dir1), Path(dir2)]}
        # The class names are already sorted:
        class_name_paths_dir = FileUtils.find_class_paths(file_root)
        
        # Create:
        #  {'class1' : <num_samples>,
        #   'class2' : <num_samples>,
        #         ...
        #   }
        
        class_populations = {}
        for class_name in class_name_paths_dir.keys():
            num_samples = 0
            # Each class may have samples in multiple
            # directories; add them up:
            for class_dir in class_name_paths_dir[class_name]:
                num_samples += len([file_name 
                                     for file_name 
                                     in os.listdir(class_dir)
                                     if Path(file_name).suffix in FileUtils.IMG_EXTENSIONS
                                     ])
            class_populations[class_name] = num_samples
            
        if len(class_populations) == 0:
            LoggingService().err(f"No target classes found under {file_root}")
            sys.exit(1)
        majority_class_population = max(class_populations.values())
        weights = []
        for class_name in class_name_paths_dir.keys():
            weights.append(class_populations[class_name] / majority_class_population)

        return torch.tensor(weights) 
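Where these weights are consumed is not shown in this listing; a common destination for such a per-class weight tensor is a weighted loss. A minimal sketch, with made-up class counts, of handing the tensor to PyTorch's CrossEntropyLoss:

import torch
import torch.nn as nn

# Hypothetical class populations, in naturally sorted class order:
class_populations = {'classA': 120, 'classB': 480, 'classC': 480}
majority = max(class_populations.values())
weights = torch.tensor([n / majority for n in class_populations.values()])
# -> tensor([0.25, 1.00, 1.00])

# CrossEntropyLoss accepts a per-class weight tensor:
loss_fn = nn.CrossEntropyLoss(weight=weights)
logits = torch.randn(8, len(class_populations))           # batch of 8 samples
labels = torch.randint(0, len(class_populations), (8,))
loss = loss_fn(logits, labels)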
Example 11
    def test_ellipsed_file_path(self):

        # File name too long even without
        # leading dirs:
        self.assertEqual(
            FileUtils.ellipsed_file_path(
                '/home/yatagait/birds/src/birdsong/recordings/CALL_XC482431-R024 white ruffed manakin.mp3'
            ), '/home/...CALL_XC482431-R024 white ruffed manakin.mp3')
        # Same without leading slash
        self.assertEqual(
            FileUtils.ellipsed_file_path(
                'home/yatagait/birds/src/birdsong/recordings/CALL_XC482431-R024 white ruffed manakin.mp3'
            ), 'home/...CALL_XC482431-R024 white ruffed manakin.mp3')
        self.assertEqual(FileUtils.ellipsed_file_path('foobar'), 'foobar')
        self.assertEqual(FileUtils.ellipsed_file_path('foobar/fum'),
                         'foobar/fum')
        # Uneven num of dirs:
        self.assertEqual(
            FileUtils.ellipsed_file_path('foobar/bluebell/grayhound/'),
            'foobar/.../grayhound')

        # Even num of dirs
        self.assertEqual(
            FileUtils.ellipsed_file_path('blue/foobar/bluebell/grayhound/'),
            'blue/.../bluebell/grayhound')
        # Length just acceptable
        self.assertEqual(
            FileUtils.ellipsed_file_path('blue/foobar/grayhound/bar'),
            'blue/foobar/grayhound/bar')
        # Length one over acceptable
        self.assertEqual(
            FileUtils.ellipsed_file_path('Bblue/foobar/grayhound/bar'),
            'Bblue/.../grayhound/bar')
        # Absolute path:
        self.assertEqual(
            FileUtils.ellipsed_file_path('/Bblue/foobar/grayhound/bar'),
            '/Bblue/.../grayhound/bar')
Example 12
    def test_make_run_props_dict(self):

        conf = NeuralNetConfig(self.config_path)
        training_section = conf.Training

        # Create the expected result for ground truth:
        expected_dict = {}
        for short_name, long_name in FileUtils.fname_short_2_long.items():
            try:
                val = training_section[long_name]
                expected_dict[short_name] = val
            except KeyError:
                # Config file happens not to
                # have an entry for the long_name:
                expected_dict[short_name] = 'na'
                continue

        prop_dict = FileUtils.make_run_props_dict(training_section)
        self.assertDictEqual(prop_dict, expected_dict)
Example 13
    def find_num_classes(self, data_root):
        '''
        Expect two subdirectories under data_root:
        train and validation. Underneath each are
        further subdirectories whose names are the
        classes:

                train               validation
        class1 class2 class3     class1 class2 class3
          imgs   imgs   imgs       imgs   imgs   imgs

        No error checking to confirm this structure

        :param data_root: path to parent of train/validation
        :type data_root: str
        :return: number of unique classes as obtained
            from the directory names
        :rtype: int
        '''
        self.classes = FileUtils.find_class_names(data_root)
        return len(self.classes)
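FileUtils.find_class_names is not shown in this listing. Purely for illustration, here is a minimal pathlib sketch of how class names can be read off the directory layout pictured in the docstring above; it is an assumption, not necessarily the actual implementation:

from pathlib import Path

def find_class_names_sketch(data_root):
    # Class names are the subdirectory names under <data_root>/train,
    # skipping dot-directories and returning them naturally sorted:
    train_dir = Path(data_root) / 'train'
    return sorted(p.name for p in train_dir.iterdir()
                  if p.is_dir() and not p.name.startswith('.'))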
Example 14
    def test_chart_pr_curves(self):
        recall_axis = np.array([1, 2, 3, 4, 5, 6])
        curve_info = {
            1: {
                'recalls': recall_axis,
                'precisions': 2 * recall_axis,
                'avg_prec': 0.6,
                'best_op_pt': {
                    'threshold': 0.6,
                    'f1': 0.82,
                    'rec': 2,
                    'prec': 4,
                }
            },
            2: {
                'recalls': recall_axis,
                'precisions': 0.5 * recall_axis,
                'avg_prec': 0.8,
            }
        }

        num_classes, fig = \
            ClassificationPlotter.chart_pr_curves(curve_info)

        self.assertEqual(num_classes, 2)
        self.assertEqual(len(fig.axes), 1)
        ax = fig.axes[0]
        self.assertEqual(ax.get_xlabel(), 'recall')
        self.assertEqual(ax.get_ylabel(), 'precision')

        # Allow the fig to show
        # before asking user to
        # check it (the pause())
        fig.show()
        plt.pause(0.001)

        fig_ok = FileUtils.user_confirm("Fig should have 2 lines, one point, and a legend\n"
                                        "Looks OK? (Y/n)")
        if not fig_ok:
            self.fail("PR curve was not correct")
Example 15
    def test_construct_filename(self):

        props = {
            'net_name': 'resnet18',
            'min_epochs': '3',
            'max_epochs': '6',
            'batch_size': '2',
            'num_folds': '3',
            'seed': '42',
            'kernel_size': '7',
            'sample_width': '400',
            'sample_height': '400',
            'lr': '0.01',
            'to_grayscale': 'False'
        }

        fname = FileUtils.construct_filename(props,
                                             prefix='model',
                                             suffix='.pth',
                                             incl_date=False)
        expected = 'model_net_resnet18_bs_2_folds_3_ks_7_lr_0.01_gray_False.pth'

        self.assertEqual(fname, expected)
Example 16
    def create_csv_writer(self, raw_data_dir):
        '''
        Create a csv_writer that will fill a csv
        file during training/validation as follows:
        
            epoch  train_preds   train_labels  val_preds  val_labels
            
        Cols after the integer 'epoch' col will each be
        an array of ints:
        
                  train_preds    train_lbls   val_preds  val_lbls
                2,"[2,5,1,2,3]","[2,6,1,2,1]","[1,2]",    "[1,3]" 
        
        If raw_data_dir is provided as a str, it is
        taken as the directory where csv file with predictions
        and labels are to be written. The dir is created if necessary.
         
        If the arg is instead set to True, a dir 'runs_raw_results' is
        created under this script's directory if it does not
        exist. Then a subdirectory is created for this run,
        using the hparam settings to build a file name. The dir
        is created if needed. Result ex.:
        
              <script_dir>
                   runs_raw_results
                       Run_lr_0.001_br_32
                           run_2021_05_ ... _lr_0.001_br_32.csv
        
        
        Then file name is created, again from the run
        hparam settings. If this file exists, user is asked whether
        to remove or append. The inst var self.csv_writer is
        initialized to:
        
           o None if csv file exists, but is not to 
             be overwritten nor appended-to
           o A file descriptor for a file open for either
             'write' or 'append'.
        
        :param raw_data_dir: If simply True, create dir and file names
            from hparams, and create as needed. If a string, it is 
            assumed to be the directory where a .csv file is to be
            created. If None, self.csv_writer is set to None.
        :type raw_data_dir: {None | True | str}
        :return: CSV writer ready for action. Set either to
            write a fresh file, or append to an existing file.
            Unless file exists, and user decided not to overwrite
        :rtype: {None | csv.writer}
        '''

        # If the caller wants no csv output at all,
        # don't create any directories:
        if raw_data_dir is None:
            return None

        # Ensure the csv file root dir exists if
        # we'll do a csv dir and run-file below it:
        if type(raw_data_dir) == str:
            raw_data_root = raw_data_dir
        else:
            raw_data_root = os.path.join(self.curr_dir, 'runs_raw_results')

        if not os.path.exists(raw_data_root):
            os.mkdir(raw_data_root)

        # Can rely on raw_data_root being defined and existing:

        # Create both a run sub-directory under raw_data_root
        # and a .csv file within it for this run:
        csv_subdir_name = FileUtils.construct_filename(self.config.Training,
                                                       prefix='Run',
                                                       incl_date=True)
        csv_subdir_path = os.path.join(raw_data_root, csv_subdir_name)
        os.makedirs(csv_subdir_path, exist_ok=True)

        # Create a csv file name:
        csv_file_nm = FileUtils.construct_filename(self.config.Training,
                                                   prefix='run',
                                                   suffix='.csv',
                                                   incl_date=True)

        csv_path = os.path.join(csv_subdir_path, csv_file_nm)

        # Get csv_raw_fd appropriately:

        if os.path.exists(csv_path):
            do_overwrite = FileUtils.user_confirm(
                f"File {csv_path} exists; overwrite?", default='N')
            if do_overwrite:
                mode = 'w'
            else:
                do_append = FileUtils.user_confirm("Append instead?",
                                                   default='N')
                if not do_append:
                    return None
                mode = 'a'
        else:
            mode = 'w'

        csv_writer = CSVWriterCloseable(csv_path, mode=mode, delimiter=',')

        header = [
            'epoch', 'train_preds', 'train_labels', 'val_preds', 'val_labels'
        ]
        csv_writer.writerow(header)

        return csv_writer
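For reference, a minimal sketch of the row layout the docstring describes, using only the standard csv module and made-up numbers; the quoting of the list-valued columns is produced automatically because they contain commas:

import csv
import io

buf = io.StringIO()
writer = csv.writer(buf, delimiter=',')
writer.writerow(['epoch', 'train_preds', 'train_labels', 'val_preds', 'val_labels'])
writer.writerow([2, '[2,5,1,2,3]', '[2,6,1,2,1]', '[1,2]', '[1,3]'])
print(buf.getvalue())
# epoch,train_preds,train_labels,val_preds,val_labels
# 2,"[2,5,1,2,3]","[2,6,1,2,1]","[1,2]","[1,3]"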
Example 17
    args = parser.parse_args()

    if type(args.device) != list:
        args.device = [args.device]

    # Expand Unix wildcards, tilde, and env
    # vars in the model paths:
    if type(args.model_paths) != list:
        model_paths_raw = [args.model_paths]
    else:
        model_paths_raw = args.model_paths

    model_paths = []
    for fname in model_paths_raw:
        model_paths.extend(FileUtils.expand_filename(fname))

    # Same for samples path, though we only allow
    # one of those paths.
    samples_path = FileUtils.expand_filename(args.samples_path)[0]

    # Ensure that the file arrangements are as required by
    # the ImageFolder class:
    #                        <root_dir>
    #        img_folder_1   img_folder_2     ...   img_folder_n
    #         img_file        img_file                  img_file
    #         img_file        img_file                  img_file
    #                   ...                  ...

    dir_struct_desc = f"Samples must be in *sub*directories with image files under {samples_path}"
    for root, dirs, _files in os.walk(samples_path):
Example 18
    def train(self):

        overall_start_time = datetime.datetime.now()
        # Just for sanity: keep track
        # of number of batches...
        total_batch_num = 0

        # Note: since we are cross validating, the
        # data loader's set_epoch() method is only
        # called once (automatically) during instantiation
        # of the associated sampler. Moving from split
        # to split includes shuffling if the caller
        # specified that.

        # Training
        for split_num in range(self.train_loader.num_folds):

            split_start_time = datetime.datetime.now()
            self.initialize_model()
            for epoch in range(self.max_epochs):

                # Set model to train mode:
                self.model.train()

                epoch_start_time = datetime.datetime.now()

                self.log.info(f"Starting epoch {epoch} training")

                # Sanity check record: will record
                # how many samples from each class were
                # used:
                self.class_coverage = {}

                # Sanity records: will record number
                # of samples of each class that are used
                # during training and validation:
                label_distrib = {}
                batch_num = 0

                self.log.info(
                    f"Train epoch {epoch}/{self.max_epochs} split {split_num}/{self.train_loader.num_folds}"
                )
                try:
                    for batch, targets in self.train_loader:
                        # Update the sanity check
                        # num of batches seen, and distribution
                        # of samples across classes:
                        batch_num += 1
                        total_batch_num += 1

                        # Update sanity check records:
                        for lbl in targets:
                            lbl = int(lbl)
                            try:
                                label_distrib[lbl] += 1
                            except KeyError:
                                label_distrib[lbl] = 1
                            try:
                                self.class_coverage[lbl]['train'] += 1
                            except KeyError:
                                self.class_coverage[lbl] = {
                                    'train': 1,
                                    'val': 0
                                }

                        self.log.debug(
                            f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')

                        outputs = self.model(images)
                        loss = self.loss_fn(outputs, labels)
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

                        # Remember the last batch's train result of this
                        # split (results for earlier batches of
                        # the same split will be overwritten). This statement
                        # must sit before deleting output and labels:

                        step_num = self.step_number(epoch, split_num,
                                                    self.num_folds)
                        self.remember_results(LearningPhase.TRAINING, step_num,
                                              outputs, labels, loss)

                        self.log.debug(
                            f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )

                        images = FileUtils.to_device(images, 'cpu')
                        outputs = FileUtils.to_device(outputs, 'cpu')
                        labels = FileUtils.to_device(labels, 'cpu')
                        loss = FileUtils.to_device(loss, 'cpu')

                        del images
                        del outputs
                        del labels
                        del loss
                        torch.cuda.empty_cache()

                        self.log.debug(
                            f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                        )
                except EndOfSplit:

                    end_time = datetime.datetime.now()
                    train_time_duration = end_time - epoch_start_time
                    # A human-readable duration string, down to minutes:
                    duration_str = FileUtils.time_delta_str(
                        train_time_duration, granularity=4)

                    self.log.info(
                        f"Done training epoch {epoch} of split {split_num} (duration: {duration_str})"
                    )

                    #***********
                    #print(f"****** num_batches in split: {batch_num}" )
                    #print(f"****** LblDist: {label_distrib}")
                    #***********
                    self.validate_split(step_num)
                    self.visualize_step(step_num)
                    # Save model, keeping self.model_archive_size models:
                    self.model_archive.save_model(self.model, epoch)

                    self.log.debug(
                        f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                    )

                    # Next Epoch
                    continue

            end_time = datetime.datetime.now()
            train_time_duration = end_time - split_start_time
            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(train_time_duration,
                                                    granularity=4)

            self.log.info(
                f"Done training split {split_num} (duration: {duration_str})")

            # Next split
            continue

        end_time = datetime.datetime.now()
        epoch_duration = end_time - epoch_start_time
        epoch_dur_str = FileUtils.time_delta_str(epoch_duration, granularity=4)

        cumulative_dur = end_time - overall_start_time
        cum_dur_str = FileUtils.time_delta_str(cumulative_dur, granularity=4)

        msg = f"Done epoch {epoch}  (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
        self.log.info(msg)

        #******self.scheduler.step()

        # Fresh results tallying
        #self.results.clear()

        self.log.info(
            f"Training complete after {self.train_loader.num_folds} splits")

        # Report the sanity checks:
        self.log.info(f"Total batches processed: {total_batch_num}")
        for cid in self.class_coverage.keys():
            train_use = self.class_coverage[cid]['train']
            val_use   = self.class_coverage[cid]['val']
            self.log.info(
                f"{self.class_names[cid]} Training: {train_use}, Validation: {val_use}"
            )

        # All seems to have gone well. Report the
        # overall result of the final epoch for the
        # hparams config used in this process:

        self.report_hparams_summary(self.latest_result)

        # The final epoch number:
        return epoch
Example 19
    def prep_model_inference(self, model_path):
        '''
        1. Parses model_path into its components, and 
            creates a dict: self.model_props, which 
            contains the network type, grayscale or not,
            whether pretrained, etc.
        2. Creates self.csv_writer to write results measures
            into csv files. The destination file is determined
            as follows:
                <script_dir>/runs_raw_inferences/inf_csv_results_<datetime>/<model-props-derived-fname>.csv
        3. Creates self.writer(), a tensorboard writer with destination dir:
                <script_dir>/runs_inferences/inf_results_<datetime>
        4. Creates an ImageFolder-style dataset over self.samples_path
        5. Creates a shuffling DataLoader
        6. Initializes self.num_classes and self.class_names
        7. Creates self.model from the passed-in model_path name
        
        :param model_path: path to model that will be used for
            inference by this instance of Inferencer
        :type model_path: str
        '''

        model_fname = os.path.basename(model_path)

        # Extract model properties
        # from the model filename:
        self.model_props = FileUtils.parse_filename(model_fname)

        csv_results_root = os.path.join(self.curr_dir, 'runs_raw_inferences')
        #self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{uuid.uuid4().hex}")
        ts = FileUtils.file_timestamp()
        self.csv_dir = os.path.join(csv_results_root, f"inf_csv_results_{ts}")
        os.makedirs(self.csv_dir, exist_ok=True)

        csv_file_nm = FileUtils.construct_filename(self.model_props,
                                                   prefix='inf',
                                                   suffix='.csv',
                                                   incl_date=True)
        csv_path = os.path.join(self.csv_dir, csv_file_nm)

        self.csv_writer = CSVWriterCloseable(csv_path)

        ts = FileUtils.file_timestamp()
        tensorboard_root = os.path.join(self.curr_dir, 'runs_inferences')
        tensorboard_dest = os.path.join(tensorboard_root, f"inf_results_{ts}")
        #f"inf_results_{ts}{uuid.uuid4().hex}")
        os.makedirs(tensorboard_dest, exist_ok=True)

        self.writer = SummaryWriterPlus(log_dir=tensorboard_dest)

        dataset = SingleRootImageDataset(
            self.samples_path, to_grayscale=self.model_props['to_grayscale'])

        # Make reproducible:
        Utils.set_seed(42)
        #********Utils.set_seed(56)
        self.loader = DataLoader(dataset,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 drop_last=True)
        self.class_names = dataset.class_names()
        self.num_classes = len(self.class_names)

        # Get the right type of model.
        # Don't bother getting it pretrained
        # or freezing it, b/c we will overwrite
        # the weights:

        self.model = NetUtils.get_net(
            self.model_props['net_name'],
            num_classes=self.num_classes,
            pretrained=False,
            freeze=0,
            to_grayscale=self.model_props['to_grayscale'])

        self.log.info(f"Tensorboard info written to {tensorboard_dest}")
        self.log.info(f"Result measurement CSV file(s) written to {csv_path}")
Example 20
    def save_model(self, model, epoch):
        '''
        Saves and retains trained models
        on disk. 
        
        Within a subdir the method maintains a queue
        of files of len history_len: 
        
                 fname_1_ep_0.pth
                 fname_2_ep_1.pth
                      ...
                 fname_<history_len>.pth
        
        where ep_<n> is the epoch during training
        where the model of that moment is being 
        saved.
        
        When history_len model files are already present, 
        removes the oldest.
        
        Assumptions: 
            o self.fname_els_dict contains prop/value
              pairs for use in FileUtils.construct_filename()
                 {'bs' : 32,
                  'lr' : 0.001,
                     ...
                 }
            o self.model_fnames is a deque the size of
              which indicates how many models to save
              before discarding the oldest one as new
              ones are added

        :param model: model to save
        :type model: nn.Module
        :param epoch: the epoch that created the model
        :type epoch: int
        '''
        
        deque_len = len(self.model_fnames)
        if deque_len >= self.history_len:
            # Pushing a new model fname to the
            # front will pop the oldest from the
            # end. That file needs to be deleted:
            oldest_model_path = self.model_fnames[-1]
        else:
            # No file will need to be deleted.
            # Still filling our allotment:
            oldest_model_path = None
            
        model_fname = FileUtils.construct_filename(self.fname_els_dict,
                                                   prefix='mod', 
                                                   suffix=f"_ep{epoch}.pth", 
                                                   incl_date=True)
        
        model_path = os.path.join(self.run_subdir, model_fname)
        
        # As recommended by pytorch, save the
        # state_dict for portability:
        torch.save(model.state_dict(), model_path)

        self.model_fnames.appendleft(model_path)
        
        if oldest_model_path is not None:
            try:
                os.remove(oldest_model_path)
            except Exception as e:
                self.log.warn(f"Could not remove old model: {repr(e)}")
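A minimal standalone sketch of the rotation behavior the docstring describes, using collections.deque with a maxlen; the file names are made up and only the bookkeeping is illustrated, not the actual saving or deleting:

from collections import deque

history_len = 3
model_fnames = deque(maxlen=history_len)

for epoch in range(5):
    # Grab the about-to-be-evicted entry *before* appending,
    # because a full deque silently drops it on appendleft():
    oldest = model_fnames[-1] if len(model_fnames) == history_len else None
    model_fnames.appendleft(f"mod_ep{epoch}.pth")
    print(f"epoch {epoch}: keep {list(model_fnames)}, would delete {oldest}")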
Example 21
    def _construct_run_subdir(self, 
                             config, 
                             num_classes, 
                             model_root):
        '''
        Constructs a directory name composed of
        elements specified in utility.py's 
        FileUtils file/config info dicts.
        
        Ensures that <model_root>/subdir_name does
        not exist. If it does, keeps adding '_r<n>'
        to the end of the dir name.
        
        Final str will look like this:
        
        model_2021-03-23T15_38_39_net_resnet18_pre_True_frz_6_bs_2_folds_5_opt_SGD_ks_7_lr_0.01_gray_False
            
        Details will depend on the passed in 
        configuration.

        Instance var fname_els_dict will contain 
        all run attr/values needed for calls to 
        FileUtils.construct_filename() 
        
        :param config: run configuration
        :type config: NeuralNetConfig
        :param num_classes: number of target classes 
        :type num_classes: int
        :param model_root: full path to dir where the
            subdir is to be created
        :type model_root: str
        :return: full path of a unique subdirectory of
            model_root, which has been created
        :rtype: str
        '''

        # Using config, gather run-property/value 
        # pairs to include in the dir name:
         
        fname_els_dict = {}
        
        section_dict   = config.Training 
        
        for el_name, el_abbr in FileUtils.fname_long_2_short.items():
            
            el_type = FileUtils.fname_el_types[el_abbr]
            
            if el_type == int:
                fname_els_dict[el_name] = section_dict.getint(el_name)
            elif el_type == str:
                fname_els_dict[el_name] = section_dict.get(el_name)
            elif el_type == float:
                fname_els_dict[el_name] = section_dict.getfloat(el_name)
            elif el_type == bool:
                fname_els_dict[el_name] = section_dict.getboolean(el_name)
            elif callable(el_type):
                # A lambda or func. Apply it:
                fname_els_dict[el_name] = el_type(section_dict[el_name])

        fname_els_dict['num_classes'] = num_classes

        # Save the name elements for later calls
        # to FileUtils.construct_filename():
        self.fname_els_dict = fname_els_dict

        # Get the subdir name (without leading path):
        dir_basename = FileUtils.construct_filename(
            fname_els_dict,
            prefix='models',
            suffix=None, 
            incl_date=True)
        
        final_dir_path = os.path.join(model_root, dir_basename)
        
        # Disambiguate by appending '_r<n>' as needed: 
        disambiguation = 1
        while os.path.exists(final_dir_path):
            new_basename = f"{dir_basename}_r{disambiguation}"
            final_dir_path = os.path.join(model_root, new_basename)
            disambiguation += 1

        os.makedirs(final_dir_path)
        
        return final_dir_path 
Example 22
    def run_inference(self, gpu_to_use=0):
        '''
        Runs model over dataloader. Along
        the way: creates ResultTally for each
        batch, and maintains dict instance variable
        self.raw_results for later conversion of
        logits to class IDs under different threshold
        assumptions. 
        
        self.raw_results: 
                {'all_outputs' : <arr>,
                 'all_labels'  : <arr>
                 }
        
        Returns a ResultCollection with the
        ResultTally instances of each batch.

        :param gpu_to_use: which GPU to deploy to (if it is available)
        :type gpu_to_use: int
        :return: collection of tallies, one for each batch,
            or None if something went wrong.
        :rtype: {None | ResultCollection}
        '''
        # Just in case the loop never runs:
        batch_num = -1
        overall_start_time = datetime.datetime.now()

        try:
            try:
                if torch.cuda.is_available():
                    self.model.load_state_dict(torch.load(self.model_path))
                    FileUtils.to_device(self.model, 'gpu', gpu_to_use)
                else:
                    self.model.load_state_dict(
                        torch.load(self.model_path,
                                   map_location=torch.device('cpu')))
            except RuntimeError as e:
                emsg = repr(e)
                if emsg.find("size mismatch for conv1") > -1:
                    emsg += " Maybe model was trained with to_grayscale=False, but local net created for grayscale?"
                raise RuntimeError(emsg) from e

            loss_fn = nn.CrossEntropyLoss()

            result_coll = ResultCollection()

            # Save all per-class logits for ability
            # later to use different thresholds for
            # conversion to class IDs:

            all_outputs = []
            all_labels = []

            self.model.eval()
            num_test_samples = len(self.loader.dataset)
            self.log.info(
                f"Begin inference ({num_test_samples} test samples)...")

            samples_processed = 0

            loop_start_time = overall_start_time
            with torch.no_grad():

                for batch_num, (batch, targets) in enumerate(self.loader):
                    if torch.cuda.is_available():
                        images = FileUtils.to_device(batch, 'gpu')
                        labels = FileUtils.to_device(targets, 'gpu')
                    else:
                        images = batch
                        labels = targets

                    outputs = self.model(images)
                    loss = loss_fn(outputs, labels)

                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    #**********
                    max_logit = outputs[0].max().item()
                    max_idx = (outputs.squeeze() == max_logit).nonzero(
                        as_tuple=False).item()
                    smpl_id = torch.utils.data.dataloader.sample_id_seq[-1]
                    lbl = labels[0].item()
                    pred_cl = max_idx

                    self.curr_dict[smpl_id] = (smpl_id, lbl, pred_cl)
                    #**********

                    # Specify the batch_num in place
                    # of an epoch, which is not applicable
                    # during testing:
                    tally = ResultTally(batch_num, LearningPhase.TESTING,
                                        outputs, labels, loss,
                                        self.num_classes, self.batch_size)
                    result_coll.add(tally, step=None)

                    all_outputs.append(outputs)
                    all_labels.append(labels)

                    samples_processed += len(labels)

                    del images
                    del outputs
                    del labels
                    del loss

                    torch.cuda.empty_cache()

                    time_now = datetime.datetime.now()
                    # Sign of life every 5 seconds:
                    if (time_now - loop_start_time).seconds >= 5:
                        self.log.info(
                            f"GPU{gpu_to_use} processed {samples_processed}/{num_test_samples} samples"
                        )
                        loop_start_time = time_now
        finally:

            #*********
            print(f"Sample seq: {torch.utils.data.dataloader.sample_id_seq}")
            torch.utils.data.dataloader.sample_id_seq = []
            #*********
            time_now = datetime.datetime.now()
            test_time_duration = time_now - overall_start_time
            # A human-readable duration string, down to minutes:
            duration_str = FileUtils.time_delta_str(test_time_duration,
                                                    granularity=4)
            self.log.info(
                f"Done with inference: {samples_processed} test samples; {duration_str}"
            )
            # Total number of batches we ran:
            num_batches = 1 + batch_num  # b/c of zero-base

            # If loader delivered nothing, the loop
            # never ran; warn, and get out:
            if num_batches == 0:
                self.log.warn(
                    f"Dataloader delivered no data from {self.samples_path}")
                self.close()
                return None

            # Var all_outputs is now a list of per-batch tensors:
            #  [tensor of shape (batch_size, num_classes),  # batch 0
            #   tensor of shape (batch_size, num_classes),  # batch 1
            #                     ...
            #  ]
            # Make into one tensor: (num_batches, batch_size, num_classes),
            # unless an exception was raised at some point,
            # throwing us into this finally clause:
            if len(all_outputs) == 0:
                self.log.info(
                    f"No outputs were produced; thus no results to report")
                return None

            self.all_outputs_tn = torch.stack(all_outputs)
            # Be afraid...be very afraid:
            assert(self.all_outputs_tn.shape == \
                   torch.Size([num_batches,
                               self.batch_size,
                               self.num_classes])
                   )

            # Var all_labels is now num-batches tensors,
            # each containing batch_size labels:
            assert (len(all_labels) == num_batches)

            # list of single-number tensors. Make
            # into one tensor:
            self.all_labels_tn = torch.stack(all_labels)
            assert(self.all_labels_tn.shape == \
                   torch.Size([num_batches, self.batch_size])
                   )
            # And equivalently:
            assert(self.all_labels_tn.shape == \
                   (self.all_outputs_tn.shape[0],
                    self.all_outputs_tn.shape[1]
                    )
                   )

            self.report_results(result_coll)
            self.close()

        return result_coll
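The method stores all per-class outputs (self.all_outputs_tn) so class IDs can later be derived under different threshold assumptions; that conversion is not part of this listing. A generic sketch of what it could look like (softmax plus a confidence threshold, falling back to plain argmax); the function name and the -1 "no decision" convention are made up for illustration:

import torch

def logits_to_class_ids(logits, threshold=None):
    # logits: (num_samples, num_classes). With no threshold,
    # plain argmax; otherwise assign -1 ("no decision") when
    # the best softmax probability does not clear the threshold.
    probs = torch.softmax(logits, dim=1)
    best_prob, best_cls = probs.max(dim=1)
    if threshold is None:
        return best_cls
    return torch.where(best_prob >= threshold,
                       best_cls,
                       torch.full_like(best_cls, -1))

fake_logits = torch.tensor([[2.0, 0.1, 0.3],
                            [0.2, 0.3, 0.25]])
print(logits_to_class_ids(fake_logits))                  # tensor([0, 1])
print(logits_to_class_ids(fake_logits, threshold=0.6))   # tensor([ 0, -1])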
Example 23
    def __init__(self, config_info, debugging=False):
        '''
        Constructor
        '''

        self.log = LoggingService()
        if debugging:
            self.log.logging_level = DEBUG

        self.curr_dir = os.path.dirname(os.path.abspath(__file__))

        try:
            self.config = self.initialize_config_struct(config_info)
        except Exception as e:
            msg = f"During config init: {repr(e)}"
            self.log.err(msg)
            raise RuntimeError(msg) from e

        try:
            self.root_train_test_data = self.config.getpath(
                'Paths', 'root_train_test_data', relative_to=self.curr_dir)
        except ValueError as e:
            raise ValueError(
                "Config file must contain an entry 'root_train_test_data' in section 'Paths'"
            ) from e

        self.batch_size = self.config.getint('Training', 'batch_size')
        self.kernel_size = self.config.getint('Training', 'kernel_size')
        self.min_epochs = self.config.Training.getint('min_epochs')
        self.max_epochs = self.config.Training.getint('max_epochs')
        self.lr = self.config.Training.getfloat('lr')
        self.net_name = self.config.Training.net_name
        self.pretrained = self.config.Training.getboolean('pretrained', False)
        self.freeze = self.config.Training.getint('freeze', 0)
        self.to_grayscale = self.config.Training.getboolean(
            'to_grayscale', True)

        self.set_seed(42)

        self.log.info("Parameter summary:")
        self.log.info(f"network     {self.net_name}")
        self.log.info(f"pretrained  {self.pretrained}")
        if self.pretrained:
            self.log.info(f"freeze      {self.freeze}")
        self.log.info(f"min epochs  {self.min_epochs}")
        self.log.info(f"max epochs  {self.max_epochs}")
        self.log.info(f"batch_size  {self.batch_size}")

        self.fastest_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.num_classes = self.find_num_classes(self.root_train_test_data)

        self.model = NetUtils.get_net(self.net_name,
                                      num_classes=self.num_classes,
                                      pretrained=self.pretrained,
                                      freeze=self.freeze,
                                      to_grayscale=self.to_grayscale)
        self.log.debug(
            f"Before any gpu push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        FileUtils.to_device(self.model, 'gpu')

        self.log.debug(
            f"Before after model push: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )

        # No cross validation:
        self.folds = 0
        self.opt_name = self.config.Training.get('optimizer',
                                                 'Adam')  # Default
        self.optimizer = self.get_optimizer(self.opt_name, self.model, self.lr)

        self.loss_fn = nn.CrossEntropyLoss()
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer, self.min_epochs)

        sample_width = self.config.getint('Training', 'sample_width', 400)
        sample_height = self.config.getint('Training', 'sample_height', 400)
        self.train_loader, self.val_loader = self.get_dataloader(
            sample_width, sample_height)
        self.class_names = self.train_loader.dataset.classes

        log_dir = os.path.join(self.curr_dir, 'runs')
        raw_data_dir = os.path.join(self.curr_dir, 'runs_raw_results')

        self.setup_tensorboard(log_dir, raw_data_dir=raw_data_dir)

        # Log a few example spectrograms to tensorboard;
        # one per class:
        TensorBoardPlotter.write_img_grid(
            self.writer,
            self.root_train_test_data,
            len(self.class_names),  # Num of train examples
        )

        # All ResultTally instances are
        # collected here (two per epoch: one
        # for all training loop runs, and one
        # for all val loop runs):

        self.step_results = ResultCollection()

        self.log.debug(
            f"Just before train: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
        )
        try:
            final_epoch = self.train()
            self.visualize_final_epoch_results(final_epoch)
        finally:
            self.close_tensorboard()
Example 24
    def train(self):

        overall_start_time = datetime.datetime.now()
        for epoch in range(self.max_epochs):

            self.log.info(f"Starting epoch {epoch} training")
            start_time = datetime.datetime.now()

            # Set model to train mode:
            self.model.train()

            # Training
            for batch, targets in self.train_loader:

                self.log.debug(
                    f"Top of training loop: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

                images = FileUtils.to_device(batch, 'gpu')
                labels = FileUtils.to_device(targets, 'gpu')

                outputs = self.model(images)
                loss = self.loss_fn(outputs, labels)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

                self.log.debug(
                    f"Just before clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

                images = FileUtils.to_device(images, 'cpu')
                outputs = FileUtils.to_device(outputs, 'cpu')
                labels = FileUtils.to_device(labels, 'cpu')
                loss = FileUtils.to_device(loss, 'cpu')

                self.remember_results(LearningPhase.TRAINING, epoch, outputs,
                                      labels, loss)

                del images
                del outputs
                del labels
                del loss
                torch.cuda.empty_cache()

                self.log.debug(
                    f"Just after clearing gpu: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
                )

            # Validation

            end_time = datetime.datetime.now()
            train_time_duration = end_time - start_time
            # A human-readable duration string, down to minutes:
            duration_str = self.time_delta_str(train_time_duration,
                                               granularity=4)

            self.log.info(
                f"Done epoch {epoch} training (duration: {duration_str})")

            self.log.debug(
                f"Start of validation: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
            )

            start_time = datetime.datetime.now()
            self.log.info(f"Starting epoch {epoch} validation")

            self.model.eval()
            with torch.no_grad():
                for batch, targets in self.val_loader:
                    images = FileUtils.to_device(batch, 'gpu')
                    labels = FileUtils.to_device(targets, 'gpu')

                    outputs = self.model(images)
                    loss = self.loss_fn(outputs, labels)

                    images = FileUtils.to_device(images, 'cpu')
                    outputs = FileUtils.to_device(outputs, 'cpu')
                    labels = FileUtils.to_device(labels, 'cpu')
                    loss = FileUtils.to_device(loss, 'cpu')

                    self.remember_results(LearningPhase.VALIDATING, epoch,
                                          outputs, labels, loss)
                    del images
                    del outputs
                    del labels
                    del loss
                    torch.cuda.empty_cache()

            self.log.debug(
                f"After eval: \n{'none--on CPU' if self.fastest_device.type == 'cpu' else torch.cuda.memory_summary()}"
            )

            end_time = datetime.datetime.now()
            val_time_duration = end_time - start_time
            # A human-readable duration string, down to minutes:
            duration_str = self.time_delta_str(val_time_duration,
                                               granularity=4)
            self.log.info(f"Done validation (duration: {duration_str})")

            epoch_duration = train_time_duration + val_time_duration
            epoch_dur_str = self.time_delta_str(epoch_duration, granularity=4)

            cumulative_dur = end_time - overall_start_time
            cum_dur_str = self.time_delta_str(cumulative_dur, granularity=4)

            msg = f"Done epoch {epoch}  (epoch duration: {epoch_dur_str}; cumulative: {cum_dur_str})"
            self.log.info(msg)

            # Save model, keeping self.model_archive_size models:
            self.model_archive.save_model(self.model, epoch)

            self.scheduler.step()

            self.visualize_step(epoch)

            # Fresh results tallying
            self.results.clear()

            # Back around to next epoch

        self.log.info(f"Training complete after {epoch + 1} epochs")

        # All seems to have gone well. Report the
        # overall result of the final epoch for the
        # hparams config used in this process:

        self.report_hparams_summary(self.latest_result)

        # The final epoch number:
        return epoch