Example 1
def print_prediction(config):
    cache_m = CacheManager()

    if not os.path.isfile(cache_m.fileLocation('test_pred.pik')):
        return None

    #Load predictions
    (expected, Y_pred, nclasses) = cache_m.load('test_pred.pik')
    y_pred = np.argmax(Y_pred, axis=1)

    #Output metrics
    if nclasses > 2:
        f1 = metrics.f1_score(expected, y_pred, average='weighted')
    else:
        f1 = metrics.f1_score(expected, y_pred, pos_label=1)
    print("F1 score: {0:.2f}".format(f1))

    m_conf = PrintConfusionMatrix(y_pred, expected, nclasses, config, "TILs")

    #ROC AUC
    #Get positive scores (binary only)
    if nclasses == 2:
        scores = Y_pred.transpose()[1]
        fpr, tpr, thresholds = metrics.roc_curve(expected, scores, pos_label=1)
        print("AUC: {0:f}".format(metrics.roc_auc_score(expected, scores)))

    print("Accuracy: {0:.3f}".format(m_conf[nclasses + 2][nclasses]))

    #fpr/tpr/thresholds are only defined in the binary case
    if nclasses == 2 and config.verbose > 1:
        print("False positive rates: {0}".format(fpr))
        print("True positive rates: {0}".format(tpr))
        print("Thresholds: {0}".format(thresholds))
Example 2
    def __init__(self, config, ds, name=None):
        super().__init__(config, ds, name=name)
        if name is None:
            self.name = "VGG16_A1"
        self._modelCache = "{0}-model.h5".format(self.name)
        self._weightsCache = "{0}-weights.h5".format(self.name)
        self._mgpu_weightsCache = "{0}-mgpu-weights.h5".format(self.name)
        self.cache_m = CacheManager()
        self.cache_m.registerFile(
            os.path.join(config.model_path, self._modelCache),
            self._modelCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._weightsCache),
            self._weightsCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._mgpu_weightsCache),
            self._mgpu_weightsCache)
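CacheManager itself is project code (later examples import it from the project's Utils module). A minimal sketch of the registration pattern these constructors rely on, assuming a singleton that maps short file IDs to full paths; only the methods used in these examples are sketched:

import os
import pickle

class CacheManager(object):
    """Minimal sketch: a singleton mapping file IDs to file locations."""
    _instance = None

    def __new__(cls, locations=None, verbose=0):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._locations = {}
        if locations is not None:
            cls._instance._locations.update(locations)
        return cls._instance

    def registerFile(self, path, fid):
        self._locations[fid] = path

    def fileLocation(self, fid):
        return self._locations.get(fid)

    def checkFileExistence(self, fid):
        return fid in self._locations and os.path.isfile(self._locations[fid])

    def dump(self, obj, fid):
        with open(self._locations[fid], 'wb') as fd:
            pickle.dump(obj, fd)

    def load(self, fid):
        if not self.checkFileExistence(fid):
            return None
        with open(self._locations[fid], 'rb') as fd:
            return pickle.load(fd)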
Example 3
def preprocess_data(config, img_types):
    """
    Main function in preprocessing.

    Works through the stipulated configuration.
    """

    #Check SRC and DST directories
    if not os.path.exists(config.presrc):
        if config.verbose > 0:
            print("[Preprocess] No such directory: {0}".format(config.presrc))
        sys.exit(Exitcodes.PATH_ERROR)
    if not os.path.exists(config.predst):
        os.makedirs(config.predst)

    #If SRC dir has already been scanned, no need to redo:
    cache_m = CacheManager(verbose=config.verbose)
    datatree = None
    if config.tcga:
        datatree = cache_m.load('tcga.pik')
        if datatree is None:
            datatree = TCGAMerger.Merger(config.presrc, config.verbose)
            cache_m.dump(datatree, 'tcga.pik')
    else:
        if cache_m.checkFileExistence('datatree.pik'):
            imglist, lablist, path = cache_m.load('datatree.pik')
            if path == config.presrc:
                datatree = GenericData.ImageSource((imglist, lablist),
                                                   config.presrc, img_types)

        if datatree is None:
            datatree = GenericData.ImageSource(None, config.presrc, img_types)
            cache_m.dump(
                (datatree.getData(), datatree.getLabelsList(), config.presrc),
                'datatree.pik')

    #Produce tiles from input images
    #TODO: implement parallel tiling; choose between multiprocess tiling (multiple
    #images processed in parallel) or single process (one image at a time, with the
    #work divided among threads)
    if config.tile:
        if config.multiprocess:
            multiprocess_run(make_singleprocesstiling, (config, ),
                             datatree,
                             step_size=20)
            #make_multiprocesstiling(datatree,config)
        else:
            make_singleprocesstiling(datatree, config)
    elif config.normalize is not None:
        make_singleprocessnorm(datatree, config)
Example 4
    def __init__(self, data_path, keepImg=False, config=None, name='Generic'):
        self.path = None
        if isinstance(data_path, str) and os.path.isdir(data_path):
            self.path = data_path
        else:
            raise ValueError(
                "[GenericImage] Path does not correspond to a directory ({0}).".
                format(data_path))

        self.X = None
        self.Y = None
        self.name = name
        self.multi_dir = True
        self._cache = CacheManager()
        self._keep = keepImg
        self._cpu_count = config.cpu_count if config is not None else 1
        self._verbose = config.verbose if config is not None else 0
        self._pbar = config.progressbar if config is not None else False
        self._config = config
Example 5
def run_training(config, locations=None):
    """
    Main training function, to work as a new process
    """
    if config.info:
        print("Starting active learning process....")

    if locations is not None:
        #Instantiating CacheManager here registers the file locations for
        #later CacheManager() calls (it behaves as a singleton)
        CacheManager(locations=locations)
    trainer = ActiveLearningTrainer(config)
    trainer.run()
Example 6
    def __init__(self, config, ds, name=None):
        super().__init__(config, ds, name=name)
        if name is None:
            self.name = "ExtendedKerasNet"
        self._modelCache = "{0}-model.h5".format(self.name)
        self._weightsCache = "{0}-weights.h5".format(self.name)
        self._mgpu_weightsCache = "{0}-mgpu-weights.h5".format(self.name)

        self.cache_m = CacheManager()
        self.cache_m.registerFile(
            os.path.join(config.model_path, self._modelCache),
            self._modelCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._weightsCache),
            self._weightsCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._mgpu_weightsCache),
            self._mgpu_weightsCache)

        self.single = None
        self.parallel = None
Example 7
def run_prediction(config, locations=None):
    """
    Main prediction function, to work as a new process
    """
    if config.info:
        print("Starting prediction process....")

    if locations is not None:
        #Instantiating CacheManager here registers the file locations for
        #later CacheManager() calls (it behaves as a singleton)
        CacheManager(locations=locations)
    if config.print_pred:
        print_prediction(config)
    else:
        predictor = Predictor(config)
        predictor.run()
Example 8
    def _save_weights(self, model, single, parallel, clear_sess, save_numpy):
        #Save weights for single tower model and for multigpu model (if defined)
        cache_m = CacheManager()
        if self._config.info:
            print("Saving weights, this could take a while...")
        if save_numpy and hasattr(model, 'get_npweights_cache'):
            np.save(model.get_npweights_cache(), single.get_weights())
        else:
            single.save_weights(model.get_weights_cache())
            single.save(model.get_model_cache())

        if parallel is not None and model.get_mgpu_weights_cache() is not None:
            if save_numpy and hasattr(model, 'get_npmgpu_weights_cache'):
                np.save(model.get_npmgpu_weights_cache(),
                        parallel.get_weights())
            else:
                parallel.save_weights(model.get_mgpu_weights_cache())
        cache_m.dump(tuple(self._config.split), 'split_ratio.pik')

        if clear_sess:
            K.clear_session()

        return Exitcodes.ALL_GOOD
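The numpy branch above works because Keras' get_weights() returns a plain list of arrays. A hedged sketch of the save/load round trip (the helper names are illustrative, not from the project):

import numpy as np

def save_np_weights(model, path):
    #Weights come back as a plain list of numpy arrays of varying shapes,
    #so store them in an object array
    weights = model.get_weights()
    arr = np.empty(len(weights), dtype=object)
    arr[:] = weights
    #np.save appends '.npy' when the path has no extension
    np.save(path, arr)

def load_np_weights(model, path):
    #allow_pickle is required because the saved array holds Python objects
    model.set_weights(list(np.load(path + '.npy', allow_pickle=True)))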
Example 9
    def get_dataset_dimensions(self, X=None):
        """
        Returns the dimensions of the images in the dataset. It's possible to have different image dimensions.
        WARNING: big datasets will take forever to run. For now, checks a sample of the images.
        TODO: Reimplement this function to be fully parallel (threads in case).

        Return: SORTED list of tuples (# samples,width,height,channels)
        """

        cache_m = CacheManager()
        reload_data = False
        if cache_m.checkFileExistence('data_dims.pik'):
            try:
                dims, name = cache_m.load('data_dims.pik')
            except ValueError:
                reload_data = True
            else:
                #Only reuse cached dimensions computed for this same dataset
                if name != self.name:
                    reload_data = True
        else:
            reload_data = True

        if reload_data:
            dims = set()
            if X is None and self.X is None:
                return None
            elif X is None:
                X = self.X

            samples = len(X)
            if self._config.info:
                print(
                    "Checking a sample of dataset images for different dimensions..."
                )

            #Sample 2% of the dataset, capped at 5000 images
            s_number = int(0.02 * samples)
            upper_limit = min(s_number, 5000)
            for seg in random.sample(X, upper_limit):
                dims.add((samples, ) + seg.getImgDim())
            cache_m.dump((dims, self.name), 'data_dims.pik')

        dim_list = list(dims)
        dim_list.sort()
        return dim_list
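A quick check of the sampling arithmetic used above: 2% of the dataset is inspected, capped at 5000 images (numbers illustrative):

import random

samples = 400000
s_number = int(0.02 * samples)      #2% of the dataset -> 8000
upper_limit = min(s_number, 5000)   #but never more than 5000 images
subset = random.sample(range(samples), upper_limit)
print(len(subset))                  #5000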
Example 10
    def run(self):
        """
        Coordinates the AL process
        """
        from keras import backend as K
        import time
        from datetime import timedelta
        
        #Load CNN model and Datasource
        model = self.load_modules()
        self._rex = self._rex.format(model.name)
        #Define initial sets
        self.configure_sets()
        #AL components
        cache_m = CacheManager()
        predictor = Predictor(self._config, keepImg=True)
        function = None

        if self._config.ac_function is not None:
            acq = importlib.import_module('AL', 'AcquisitionFunctions')
            function = getattr(acq, self._config.ac_function)
        else:
            print("You should specify an acquisition function")
            sys.exit(Exitcodes.RUNTIME_ERROR)

        stime = None
        etime = None
        sw_thread = None
        end_train = False
        for r in range(self._config.acquisition_steps):
            if self._config.info:
                print("[ALTrainer] Starting acquisition step {0}/{1}".format(
                    r + 1, self._config.acquisition_steps))
                stime = time.time()

            #Save current dataset and report partial result (requires multi load for reading)
            fid = 'al-metadata-{1}-r{0}.pik'.format(r, model.name)
            cache_m.registerFile(os.path.join(self._config.logdir, fid), fid)
            cache_m.dump(
                ((self.train_x, self.train_y), (self.val_x, self.val_y),
                 (self.test_x, self.test_y)), fid)

            sw_thread = self.train_model(model, (self.train_x, self.train_y),
                                         (self.val_x, self.val_y))

            if r == (self._config.acquisition_steps - 1) or not self.acquire(
                    function, model, acquisition=r, sw_thread=sw_thread):
                if self._config.info:
                    print("[ALTrainer] No more acquisitions are in order")
                end_train = True

            #Some models may take too long to save weights
            if sw_thread is not None and sw_thread.is_alive():
                if self._config.info:
                    print("[ALTrainer] Waiting for model weights...")
                sw_thread.join()

            #Set load_full to false so dropout is disabled
            predictor.run(self.test_x, self.test_y, load_full=False)

            #Attempt to free GPU memory
            K.clear_session()

            if self._config.info:
                etime = time.time()
                td = timedelta(seconds=(etime - stime))
                print("Acquisition step took: {0}".format(td))

            if end_train:
                return None
Example 11
class Inception(GenericModel):
    """
    Implements abstract methods from GenericModel.
    Model is the same as in: https://github.com/keras-team/keras-applications/blob/master/keras_applications/inception_resnet_v2.py
    Adapted to provide a Bayesian model
    """
    def __init__(self, config, ds, name=None):
        super().__init__(config, ds, name=name)
        if name is None:
            self.name = "Inception"
        self._modelCache = "{0}-model.h5".format(self.name)
        self._weightsCache = "{0}-weights.h5".format(self.name)
        self._mgpu_weightsCache = "{0}-mgpu-weights.h5".format(self.name)

        self.cache_m = CacheManager()
        self.cache_m.registerFile(
            os.path.join(config.model_path, self._modelCache),
            self._modelCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._weightsCache),
            self._weightsCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._mgpu_weightsCache),
            self._mgpu_weightsCache)

        self.single = None
        self.parallel = None

    def get_model_cache(self):
        """
        Returns path to model cache
        """
        return self.cache_m.fileLocation(self._modelCache)

    def get_weights_cache(self):
        """
        Returns path to the weights cache
        """
        return self.cache_m.fileLocation(self._weightsCache)

    def get_mgpu_weights_cache(self):
        """
        Returns path to the multi-GPU weights cache
        """
        return self.cache_m.fileLocation(self._mgpu_weightsCache)

    def get_npweights_cache(self, add_ext=False):
        """
        Returns path to the numpy weights cache.

        @param add_ext <boolean>: add numpy file extension to file name.
        """
        if add_ext:
            return "{}.npy".format(
                self.cache_m.fileLocation(self._weightsCache).split('.')[0])
        else:
            return self.cache_m.fileLocation(self._weightsCache).split('.')[0]

    def get_npmgpu_weights_cache(self, add_ext=False):
        """
        Returns path to the numpy multi-GPU weights cache.

        @param add_ext <boolean>: add numpy file extension to file name.
        """
        if add_ext:
            return "{}.npy".format(
                self.cache_m.fileLocation(
                    self._mgpu_weightsCache).split('.')[0])
        else:
            return self.cache_m.fileLocation(
                self._mgpu_weightsCache).split('.')[0]

    def register_ensemble(self, m):
        self._model_n = m
        self._weightsCache = "{0}-EM{1}-weights.h5".format(self.name, m)
        self._mgpu_weightsCache = "{0}-EM{1}-mgpu-weights.h5".format(
            self.name, m)
        self._modelCache = "{0}-EM{1}-model.h5".format(self.name, m)

        self.cache_m.registerFile(
            os.path.join(self._config.weights_path, self._weightsCache),
            self._weightsCache)
        self.cache_m.registerFile(
            os.path.join(self._config.weights_path, self._mgpu_weightsCache),
            self._mgpu_weightsCache)
        self.cache_m.registerFile(
            os.path.join(self._config.model_path, self._modelCache),
            self._modelCache)

    def return_model_n(self):
        if hasattr(self, '_model_n'):
            return self._model_n
        else:
            return -1

    def build(self, **kwargs):
        """
        @param pre_trained <boolean>: returned model should be pre-trained or not
        @param data_size <int>: size of the training dataset
        """
        model, parallel_model = self._build(**kwargs)

        self.single = model
        self.parallel = parallel_model

        return (model, parallel_model)

    def build_extractor(self, **kwargs):
        """
        Builds a feature extractor.
        
        Weights should be loaded by caller!

        Keyword arguments:
        preload_w: return model with weights already loaded? True -> Yes
        parallel: return parallel model (overrides gpu_count evaluation)? True -> Yes
        """
        #Weight loading for feature extraction is done later by the requesting party
        kwargs['preload_w'] = False

        if 'parallel' in kwargs and not kwargs['parallel']:
            s, p = self._build(**kwargs)
            return (s, None)
        else:
            return self._build(**kwargs)

    def build_ensemble(self, **kwargs):
        """
        Builds an ensemble of M Inception models.

        Weights are loaded here because of the way ensembles should be built.

        Default build: averages the outputs of the corresponding softmaxes

        @param npfile <boolean>: loads weights from numpy files
        """

        if 'npfile' in kwargs:
            npfile = kwargs['npfile']
        else:
            npfile = False

        s_models = []
        p_models = []
        for m in range(self._config.emodels):
            self.register_ensemble(m)
            single, parallel = self._build(**kwargs)

            if parallel is not None:
                if npfile and hasattr(self, 'get_npmgpu_weights_cache'):
                    #Weights were saved with np.save, which appends the .npy extension
                    parallel.set_weights(
                        np.load(self.get_npmgpu_weights_cache(add_ext=True),
                                allow_pickle=True))
                    if self._config.info:
                        print("[Inception] loaded ensemble weights: {}".format(
                            self.get_npmgpu_weights_cache(add_ext=True)))
                elif os.path.isfile(self.get_mgpu_weights_cache()):
                    parallel.load_weights(self.get_mgpu_weights_cache(),
                                          by_name=True)
                    if self._config.info:
                        print("[Inception] loaded ensemble weights: {}".format(
                            self.get_mgpu_weights_cache()))

            if npfile and hasattr(self, 'get_npweights_cache'):
                single.set_weights(
                    np.load(self.get_npweights_cache(add_ext=True),
                            allow_pickle=True))
                if self._config.info:
                    print("[Inception] loaded ensemble weights: {}".format(
                        self.get_npweights_cache(add_ext=True)))
            elif os.path.isfile(self.get_weights_cache()):
                single.load_weights(self.get_weights_cache(), by_name=True)
            else:
                if self._config.info:
                    print(
                        "[Inception] Could not load ensemble weights (model {})"
                        .format(m))
                single = None
            s_models.append(single)
            p_models.append(parallel)

        #Drop members whose weights could not be loaded
        s_models = list(filter(lambda x: x is not None, s_models))
        s_inputs = [inp for s in s_models for inp in s.inputs]
        s_outputs = [out for s in s_models for out in s.outputs]
        p_models = list(filter(lambda x: x is not None, p_models))
        if len(p_models) > 0:
            p_inputs = [inp for p in p_models for inp in p.inputs]
            p_outputs = [out for p in p_models for out in p.outputs]
        else:
            p_inputs = None
            p_outputs = None

        #Build the ensemble output from individual models
        s_model, p_model = None, None
        ##Single GPU model
        x = Average()(s_outputs)
        s_model = Model(inputs=s_inputs, outputs=x)

        ##Parallel model
        if p_inputs is not None:
            x = Average()(p_outputs)
            p_model = Model(inputs=p_inputs, outputs=x)

        return s_model, p_model

    def _build(self, **kwargs):

        width, height, channels = self._check_input_shape()

        if 'data_size' in kwargs:
            self.data_size = kwargs['data_size']

        if 'training' in kwargs:
            training = kwargs['training']
        else:
            training = True

        if 'feature' in kwargs:
            feature = kwargs['feature']
        else:
            feature = False

        if 'preload_w' in kwargs:
            preload = kwargs['preload_w']
        else:
            preload = True

        if 'allocated_gpus' in kwargs and kwargs['allocated_gpus'] is not None:
            allocated_gpus = kwargs['allocated_gpus']
        else:
            allocated_gpus = self._config.gpu_count

        if backend.image_data_format() == 'channels_first':
            input_shape = (channels, height, width)
        else:
            input_shape = (height, width, channels)

        self.cache_m = CacheManager()

        model = self._build_architecture(input_shape, training, feature,
                                         preload)

        #Check if a previous training run saved a learning rate; if so, use it
        lr_cache = "{0}_learning_rate.txt".format(self.name)
        self.cache_m.registerFile(os.path.join(self._config.cache, lr_cache),
                                  lr_cache)
        l_rate = 0.00005
        if os.path.isfile(self.cache_m.fileLocation(
                lr_cache)) and not self._config.new_net:
            l_rate = float(self.cache_m.read(lr_cache))
            if self._config.info:
                print("Found previous learning rate: {0}".format(l_rate))

        #opt = optimizers.SGD(lr=l_rate, decay=1.5e-4, momentum=0.9, nesterov=True)
        opt = optimizers.Adam(lr=l_rate)
        #opt = optimizers.Adadelta(lr=l_rate)

        #Return parallel model if multiple GPUs are available
        parallel_model = None

        if allocated_gpus > 1:
            with tf.device('/cpu:0'):
                model.compile(loss='categorical_crossentropy',
                              optimizer=opt,
                              metrics=['accuracy'])
            parallel_model = multi_gpu_model(model, gpus=allocated_gpus)
            parallel_model.compile(
                loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )
        else:
            model.compile(
                loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )

        return (model, parallel_model)

    def _build_architecture(self,
                            input_shape,
                            training=None,
                            feature=False,
                            preload=True):
        from . import inception_resnet_v2

        kwargs = {
            'training': training,
            'feature': feature,
            'custom_top': False,
            'preload': preload,
            'batch_n': self._config.gpu_count <= 1
        }

        inp = Input(shape=input_shape)

        inception_body = inception_resnet_v2.InceptionResNetV2(
            include_top=False,
            weights='imagenet',
            input_tensor=inp,
            input_shape=input_shape,
            pooling='avg',
            classes=self._ds.nclasses,
            **kwargs)

        return inception_body
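build_ensemble merges the member networks by averaging their softmax outputs with Keras' Average layer. A toy sketch of that merge, using small illustrative functional models in place of the Inception members:

from keras.layers import Input, Dense, Average
from keras.models import Model

def make_member(n_features=8, n_classes=2):
    #Illustrative stand-in for one Inception member
    inp = Input(shape=(n_features, ))
    out = Dense(n_classes, activation='softmax')(inp)
    return Model(inputs=inp, outputs=out)

members = [make_member() for _ in range(3)]
s_inputs = [inp for m in members for inp in m.inputs]
s_outputs = [out for m in members for out in m.outputs]

#Ensemble output: the average of the member softmaxes; at prediction time
#the same batch must be fed to every input
x = Average()(s_outputs)
ensemble = Model(inputs=s_inputs, outputs=x)
ensemble.summary()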
Example 12
    def run(self):
        """
        Coordinates the AL process
        """
        from keras import backend as K
        import time
        from datetime import timedelta

        #Load CNN model and Datasource
        model = self.load_modules()
        self._rex = self._rex.format(model.name)
        #Define initial sets
        self.configure_sets()
        #AL components
        cache_m = CacheManager()
        predictor = Predictor(self._config, keepImg=True, build_ensemble=True)
        function = None

        if self._config.ac_function is not None:
            acq = importlib.import_module('AL', 'AcquisitionFunctions')
            function = getattr(acq, self._config.ac_function)
        else:
            print("You should specify an acquisition function")
            sys.exit(Exitcodes.RUNTIME_ERROR)

        stime = None
        etime = None
        end_train = False
        self._initializer(self._config.gpu_count, self._config.cpu_count)

        for r in range(self._config.acquisition_steps):
            if self._config.info:
                print("[EnsembleTrainer] Starting acquisition step {0}/{1}".
                      format(r + 1, self._config.acquisition_steps))
                stime = time.time()

            #Save current dataset and report partial result (requires multi load for reading)
            fid = 'al-metadata-{1}-r{0}.pik'.format(r, model.name)
            cache_m.registerFile(os.path.join(self._config.logdir, fid), fid)
            cache_m.dump(
                ((self.train_x, self.train_y), (self.val_x, self.val_y),
                 (self.test_x, self.test_y)), fid)

            self._print_stats((self.train_x, self.train_y),
                              (self.val_x, self.val_y))
            sw_thread = None
            for m in range(self._config.emodels):
                #Some models may take too long to save weights
                if sw_thread is not None:
                    if self._config.info:
                        print("[EnsembleTrainer] Waiting for model weights.",
                              end='')
                    while True:
                        pst = '.'
                        if sw_thread[-1].is_alive():
                            if self._config.info:
                                pst = "{}{}".format(pst, '.')
                                print(pst, end='')
                            sw_thread[-1].join(60.0)
                        else:
                            print('')
                            break

                if hasattr(model, 'register_ensemble'):
                    model.register_ensemble(m)
                else:
                    print(
                        "Model not ready for ensembling. Implement register_ensemble method"
                    )
                    raise AttributeError

                if self._config.info:
                    print(
                        "[EnsembleTrainer] Starting model {} training".format(
                            m))

                st = self.train_model(model, (self.train_x, self.train_y),
                                      (self.val_x, self.val_y),
                                      set_session=False,
                                      stats=False,
                                      summary=False,
                                      clear_sess=True,
                                      save_numpy=True)
                if sw_thread is None:
                    sw_thread = [st]
                else:
                    sw_thread.append(st)

            if r == (self._config.acquisition_steps - 1) or not self.acquire(
                    function, model, acquisition=r, sw_thread=sw_thread):
                if self._config.info:
                    print("[ALTrainer] No more acquisitions are in order")
                end_train = True

            #Set load_full to false so dropout is disabled
            predictor.run(self.test_x, self.test_y, load_full=False)

            #Attempt to free GPU memory
            K.clear_session()

            if self._config.info:
                etime = time.time()
                td = timedelta(seconds=(etime - stime))
                print("Acquisition step took: {0}".format(td))

            if end_train:
                return None
Example 13
def ensemble_varratios(pred_model, generator, data_size, **kwargs):
    """
    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference

    Function needs to extract the following configuration parameters:
    model <keras.Model>: model to use for predictions
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples
    emodels <int>: number of ensemble members
    cpu_count <int>: number of cpu cores (used to define number of generator workers)
    gpu_count <int>: number of gpus available
    verbose <int>: verbosity level
    pbar <boolean>: use progress bars
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' in kwargs:
        config = kwargs['config']
        emodels = config.emodels
        gpu_count = config.gpu_count
        cpu_count = config.cpu_count
        verbose = config.verbose
        pbar = config.progressbar
        query = config.acquire
        save_var = config.save_var
    else:
        return None

    if 'acquisition' in kwargs:
        r = kwargs['acquisition']

    if 'model' in kwargs:
        model = kwargs['model']
    else:
        print(
            "[ensemble_varratios] GenericModel is needed by ensemble_varratios. Set model kw argument"
        )
        return None

    if 'sw_thread' in kwargs:
        sw_thread = kwargs['sw_thread']
    else:
        sw_thread = None

    fidp = None
    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)
        if config.debug:
            fidp = 'al-probs-{1}-r{0}.pik'.format(r, config.ac_function)
            cache_m.registerFile(os.path.join(config.logdir, fidp), fidp)

    All_Dropout_Classes = np.zeros(shape=(data_size, 1))

    #If sw_thread was provided, we should check the availability of model weights
    if sw_thread is not None:
        for k in range(len(sw_thread)):
            if sw_thread[k].is_alive():
                print(
                    "Waiting for ensemble model {} weights to become available..."
                    .format(k))
                sw_thread[k].join()

    if pbar:
        l = tqdm(range(emodels),
                 desc="Ensemble member predictions",
                 position=0)
    else:
        if config.info:
            print("Starting Ensemble sampling...")
        l = range(emodels)

    #Keep probabilities for analysis
    all_probs = None
    if config.debug:
        all_probs = np.zeros(shape=(emodels, data_size, generator.classes))

    for d in l:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, emodels))

        model.register_ensemble(d)
        single, parallel = model.build(pre_load=False)

        if hasattr(model, 'get_npweights_cache'):
            spath = model.get_npweights_cache(add_ext=True)
            npfile = True
        else:
            spath = model.get_weights_cache()
            npfile = False

        if hasattr(model, 'get_npmgpu_weights_cache'):
            ppath = model.get_npmgpu_weights_cache(add_ext=True)
            npfile = True
        else:
            ppath = model.get_mgpu_weights_cache()
            npfile = False

        pred_model = _load_model_weights(config, single, spath, parallel,
                                         ppath, sw_thread, npfile)

        #Keep verbosity in 0 to gain speed
        proba = pred_model.predict_generator(generator,
                                             workers=5 * cpu_count,
                                             max_queue_size=100 * gpu_count,
                                             verbose=0)

        if config.debug:
            all_probs[d] = proba

        dropout_classes = proba.argmax(axis=-1)
        dropout_classes = np.array([dropout_classes]).T
        All_Dropout_Classes = np.append(All_Dropout_Classes,
                                        dropout_classes,
                                        axis=1)

    if verbose > 0:
        print("All dropout {0}:".format(All_Dropout_Classes.shape))
        for i in np.random.choice(All_Dropout_Classes.shape[0],
                                  100,
                                  replace=False):
            print("Predictions for image ({0}): {1}".format(
                i, All_Dropout_Classes[i]))

    Variation = np.zeros(shape=(data_size))

    for t in range(data_size):
        L = np.array([0])
        for d_iter in range(emodels):
            L = np.append(L, All_Dropout_Classes[t, d_iter + 1])
        Predicted_Class, Mode = mode(L[1:])
        v = np.array([1 - Mode / float(emodels)])
        Variation[t] = v

    if verbose > 1:
        print("Variation {0}:".format(data_size))
        for i in np.random.choice(data_size, 100, replace=False):
            print("Variation for image ({0}): {1}".format(i, Variation[i]))

    a_1d = Variation.flatten()
    x_pool_index = a_1d.argsort()[-query:][::-1]

    if config.debug:
        from .Common import debug_acquisition
        s_expected = generator.returnLabelsFromIndex(x_pool_index)
        #After transposition shape will be (classes, items, emodels)
        s_probs = all_probs[:emodels, x_pool_index].T
        debug_acquisition(s_expected, s_probs, generator.classes, cache_m,
                          config, fidp)

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's variation: {0}".format(a_1d[x_pool_index]))
        print("Maximum variation in pool: {0}".format(a_1d.max()))

    return x_pool_index
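The variation ratio of an item is 1 minus the fraction of ensemble members that voted for the modal class. A worked toy computation mirroring the loop above, using scipy.stats.mode:

import numpy as np
from scipy.stats import mode

emodels = 5
#Class votes of the 5 ensemble members for 4 items (one row per item)
votes = np.array([[0, 0, 0, 0, 0],   #unanimous -> variation 0.0
                  [0, 1, 0, 1, 0],   #3 of 5 agree -> variation 0.4
                  [1, 1, 2, 0, 1],   #3 of 5 agree -> variation 0.4
                  [0, 1, 2, 2, 1]])  #2 of 5 agree -> variation 0.6

Variation = np.zeros(votes.shape[0])
for t in range(votes.shape[0]):
    _, count = mode(votes[t])
    #np.ravel copes with scipy versions that return arrays here
    Variation[t] = 1 - np.ravel(count)[0] / float(emodels)

#Most uncertain items first, as in the acquisition above
print(Variation.argsort()[::-1])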
Example 14
def ensemble_bald(pred_model, generator, data_size, **kwargs):
    """
    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' in kwargs:
        config = kwargs['config']
        emodels = config.emodels
        gpu_count = config.gpu_count
        cpu_count = config.cpu_count
        verbose = config.verbose
        pbar = config.progressbar
        query = config.acquire
        save_var = config.save_var
    else:
        return None

    if 'acquisition' in kwargs:
        r = kwargs['acquisition']

    if 'sw_thread' in kwargs:
        sw_thread = kwargs['sw_thread']
    else:
        sw_thread = None

    if 'model' in kwargs:
        model = kwargs['model']
    else:
        print(
            "[ensemble_bald] GenericModel is needed by ensemble_bald. Set model kw argument"
        )
        return None

    #If sw_thread was provided, we should check the availability of model weights
    if sw_thread is not None:
        for k in range(len(sw_thread)):
            if sw_thread[k].is_alive():
                print(
                    "Waiting for ensemble model {} weights to become available..."
                    .format(k))
                sw_thread[k].join()

    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)

    All_Entropy_Dropout = np.zeros(shape=data_size)
    score_All = np.zeros(shape=(data_size, generator.classes))

    if pbar:
        l = tqdm(range(emodels),
                 desc="Ensemble member predictions",
                 position=0)
    else:
        if config.info:
            print("Starting ensemble sampling...")
        l = range(emodels)

    for d in l:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, emodels))

        model.register_ensemble(d)
        single, parallel = model.build(pre_load=False)

        if hasattr(model, 'get_npweights_cache'):
            spath = model.get_npweights_cache(add_ext=True)
            npfile = True
        else:
            spath = model.get_weights_cache()
            npfile = False

        if hasattr(model, 'get_npmgpu_weights_cache'):
            ppath = model.get_npmgpu_weights_cache(add_ext=True)
            npfile = True
        else:
            ppath = model.get_mgpu_weights_cache()
            npfile = False

        pred_model = _load_model_weights(config, single, spath, parallel,
                                         ppath, sw_thread, npfile)

        dropout_score = pred_model.predict_generator(generator,
                                                     workers=5 * cpu_count,
                                                     max_queue_size=100 *
                                                     gpu_count,
                                                     verbose=0)
        #computing G_X
        score_All = score_All + dropout_score

        #computing F_X
        dropout_score_log = np.log2(dropout_score)
        Entropy_Compute = -np.multiply(dropout_score, dropout_score_log)
        Entropy_Per_Dropout = np.sum(Entropy_Compute, axis=1)

        All_Entropy_Dropout = All_Entropy_Dropout + Entropy_Per_Dropout

    Avg_Pi = np.divide(score_All, emodels)
    Log_Avg_Pi = np.log2(Avg_Pi)
    Entropy_Avg_Pi = -np.multiply(Avg_Pi, Log_Avg_Pi)
    Entropy_Average_Pi = np.sum(Entropy_Avg_Pi, axis=1)

    G_X = Entropy_Average_Pi

    Average_Entropy = np.divide(All_Entropy_Dropout, emodels)

    F_X = Average_Entropy

    U_X = G_X - F_X

    # THIS FINDS THE MINIMUM INDEX
    # a_1d = U_X.flatten()
    # x_pool_index = a_1d.argsort()[-Queries:]

    a_1d = U_X.flatten()
    x_pool_index = a_1d.argsort()[-query:][::-1]

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's average entropy: {0}".format(
            a_1d[x_pool_index]))
        print("Maximum entropy in pool: {0}".format(a_1d.max()))

    return x_pool_index
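The BALD score is the entropy of the averaged prediction (G_X) minus the average per-member entropy (F_X): items where members are individually confident but disagree with each other score highest. A worked toy computation (values illustrative):

import numpy as np

#Predictions of 2 ensemble members for 2 items over 2 classes
member_probs = np.array([
    [[0.9, 0.1], [0.5, 0.5]],   #member 1
    [[0.1, 0.9], [0.5, 0.5]],   #member 2
])

Avg_Pi = member_probs.mean(axis=0)
G_X = -np.sum(Avg_Pi * np.log2(Avg_Pi), axis=1)    #entropy of the average
F_X = -np.sum(member_probs * np.log2(member_probs),
              axis=2).mean(axis=0)                 #average of the entropies
U_X = G_X - F_X

#Item 0: confident disagreement -> high score; item 1: shared uncertainty -> ~0
print(U_X)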
Example 15
def bayesian_bald(pred_model, generator, data_size, **kwargs):
    """
    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' in kwargs:
        config = kwargs['config']
        mc_dp = config.dropout_steps
        gpu_count = config.gpu_count
        cpu_count = config.cpu_count
        verbose = config.verbose
        pbar = config.progressbar
        query = config.acquire
        save_var = config.save_var
    else:
        return None

    if 'acquisition' in kwargs:
        r = kwargs['acquisition']

    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)

    All_Entropy_Dropout = np.zeros(shape=data_size)
    score_All = np.zeros(shape=(data_size, generator.classes))

    if pbar:
        l = tqdm(range(mc_dp), desc="MC Dropout", position=0)
    else:
        if config.info:
            print("Starting MC dropout sampling...")
        l = range(mc_dp)

    for d in l:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, mc_dp))

        dropout_score = pred_model.predict_generator(generator,
                                                     workers=5 * cpu_count,
                                                     max_queue_size=100 *
                                                     gpu_count,
                                                     verbose=0)
        #computing G_X
        score_All = score_All + dropout_score

        #computing F_X
        dropout_score_log = np.log2(dropout_score)
        Entropy_Compute = -np.multiply(dropout_score, dropout_score_log)
        Entropy_Per_Dropout = np.sum(Entropy_Compute, axis=1)

        All_Entropy_Dropout = All_Entropy_Dropout + Entropy_Per_Dropout

    Avg_Pi = np.divide(score_All, mc_dp)
    Log_Avg_Pi = np.log2(Avg_Pi)
    Entropy_Avg_Pi = -np.multiply(Avg_Pi, Log_Avg_Pi)
    Entropy_Average_Pi = np.sum(Entropy_Avg_Pi, axis=1)

    G_X = Entropy_Average_Pi

    Average_Entropy = np.divide(All_Entropy_Dropout, mc_dp)

    F_X = Average_Entropy

    U_X = G_X - F_X

    # THIS FINDS THE MINIMUM INDEX
    # a_1d = U_X.flatten()
    # x_pool_index = a_1d.argsort()[-Queries:]

    a_1d = U_X.flatten()
    x_pool_index = a_1d.argsort()[-query:][::-1]

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's average entropy: {0}".format(
            a_1d[x_pool_index]))
        print("Maximum entropy in pool: {0}".format(a_1d.max()))

    return x_pool_index
Example 16
class KNet(GenericModel):
    """
    Implements abstract methods from GenericModel.
    Model is the same as in: https://keras.io/examples/mnist_cnn/
    """
    def __init__(self, config, ds, name=None):
        super().__init__(config, ds, name=name)
        if name is None:
            self.name = "KerasNet"
        self._modelCache = "{0}-model.h5".format(self.name)
        self._weightsCache = "{0}-weights.h5".format(self.name)
        self._mgpu_weightsCache = "{0}-mgpu-weights.h5".format(self.name)

        self.cache_m = CacheManager()
        self.cache_m.registerFile(
            os.path.join(config.model_path, self._modelCache),
            self._modelCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._weightsCache),
            self._weightsCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._mgpu_weightsCache),
            self._mgpu_weightsCache)

        self.single = None
        self.parallel = None

    def get_model_cache(self):
        """
        Returns path to model cache
        """
        return self.cache_m.fileLocation(self._modelCache)

    def get_weights_cache(self):
        """
        Returns path to model cache
        """
        return self.cache_m.fileLocation(self._weightsCache)

    def get_mgpu_weights_cache(self):
        """
        Returns path to model cache
        """
        return self.cache_m.fileLocation(self._mgpu_weightsCache)

    def build(self, **kwargs):

        model, parallel_model = self._build(**kwargs)

        self.single = model
        self.parallel = parallel_model

        return (model, parallel_model)

    def _build(self, **kwargs):
        """
        @param pre_trained <boolean>: returned model should be pre-trained or not
        @param data_size <int>: size of the training dataset
        """
        width, height, channels = self._check_input_shape()

        if 'data_size' in kwargs:
            self.data_size = kwargs['data_size']

        if 'training' in kwargs:
            training = kwargs['training']
        else:
            training = True

        if 'feature' in kwargs:
            feature = kwargs['feature']
        else:
            feature = False

        if backend.image_data_format() == 'channels_first':
            input_shape = (channels, height, width)
        else:
            input_shape = (height, width, channels)

        self.cache_m = CacheManager()

        model = self._build_architecture(input_shape, training, feature)

        #Check if a previous training run saved a learning rate; if so, use it
        lr_cache = "{0}_learning_rate.txt".format(self.name)
        self.cache_m.registerFile(os.path.join(self._config.cache, lr_cache),
                                  lr_cache)
        l_rate = 0.0005
        if os.path.isfile(self.cache_m.fileLocation(
                lr_cache)) and not self._config.new_net:
            l_rate = float(self.cache_m.read(lr_cache))
            if self._config.info:
                print("Found previous learning rate: {0}".format(l_rate))

        #opt = optimizers.SGD(lr=l_rate, decay=1.5e-4, momentum=0.9, nesterov=True)
        #opt = optimizers.Adam(lr = l_rate)
        opt = optimizers.Adadelta()

        #Return parallel model if multiple GPUs are available
        parallel_model = None

        if self._config.gpu_count > 1:
            with tf.device('/cpu:0'):
                model.compile(loss='categorical_crossentropy',
                              optimizer=opt,
                              metrics=['accuracy'])

            parallel_model = multi_gpu_model(model,
                                             gpus=self._config.gpu_count)
            parallel_model.compile(
                loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )
        else:
            model.compile(
                loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )

        return (model, parallel_model)

    def _build_architecture(self, input_shape, training=None, feature=False):

        model = Sequential()
        model.add(
            Convolution2D(32,
                          kernel_size=(3, 3),
                          activation='relu',
                          input_shape=input_shape))
        model.add(Convolution2D(64, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(self._ds.nclasses, activation='softmax'))

        return model
Example 17
        'tcga.pik':
        os.path.join(config.cache, 'tcga.pik'),
        'metadata.pik':
        os.path.join(config.cache, '{0}-metadata.pik'.format(config.data)),
        'sampled_metadata.pik':
        os.path.join(config.cache,
                     '{0}-sampled_metadata.pik'.format(config.data)),
        'initial_train.pik':
        os.path.join(config.cache, '{0}-inittrain.pik'.format(config.data)),
        'split_ratio.pik':
        os.path.join(config.cache, '{0}-split_ratio.pik'.format(config.data)),
        'clusters.pik':
        os.path.join(config.cache, '{0}-clusters.pik'.format(config.data)),
        'data_dims.pik':
        os.path.join(config.cache, '{0}-data_dims.pik'.format(config.data)),
        'tiles.pik':
        os.path.join(config.predst, 'tiles.pik'),
        'test_pred.pik':
        os.path.join(config.logdir, 'test_pred.pik'),
        'cae_model.h5':
        os.path.join(config.model_path, 'cae_model.h5'),
        'vgg16_weights_notop.h5':
        os.path.join('PretrainedModels', 'vgg16_weights_notop.h5')
    }

    cache_m = CacheManager(locations=files)

    config.split = tuple(config.split)
    #Run main program
    main_exec(config)
Example 18
def bayesian_varratios(pred_model, generator, data_size, **kwargs):
    """
    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference

    Function needs to extract the following configuration parameters:
    model <keras.Model>: model to use for predictions
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples
    mc_dp <int>: number of dropout iterations
    cpu_count <int>: number of cpu cores (used to define number of generator workers)
    gpu_count <int>: number of gpus available
    verbose <int>: verbosity level
    pbar <boolean>: use progress bars
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' in kwargs:
        config = kwargs['config']
        mc_dp = config.dropout_steps
        gpu_count = config.gpu_count
        cpu_count = config.cpu_count
        verbose = config.verbose
        pbar = config.progressbar
        query = config.acquire
        save_var = config.save_var
    else:
        return None

    if 'acquisition' in kwargs:
        r = kwargs['acquisition']

    fidp = None
    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)
        if config.debug:
            fidp = 'al-probs-{1}-r{0}.pik'.format(r, config.ac_function)
            cache_m.registerFile(os.path.join(config.logdir, fidp), fidp)

    All_Dropout_Classes = np.zeros(shape=(data_size, 1))

    if pbar:
        l = tqdm(range(mc_dp), desc="MC Dropout", position=0)
    else:
        if config.info:
            print("Starting MC dropout sampling...")
        l = range(mc_dp)

    #Keep probabilities for analysis
    all_probs = None
    if config.debug:
        all_probs = np.zeros(shape=(mc_dp, data_size, generator.classes))

    for d in l:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, mc_dp))

        #Keep verbosity in 0 to gain speed
        proba = pred_model.predict_generator(generator,
                                             workers=5 * cpu_count,
                                             max_queue_size=100 * gpu_count,
                                             verbose=0)

        if config.debug:
            all_probs[d] = proba

        dropout_classes = proba.argmax(axis=-1)
        dropout_classes = np.array([dropout_classes]).T
        All_Dropout_Classes = np.append(All_Dropout_Classes,
                                        dropout_classes,
                                        axis=1)

    if verbose > 0:
        print("All dropout {0}:".format(All_Dropout_Classes.shape))
        for i in np.random.choice(All_Dropout_Classes.shape[0],
                                  100,
                                  replace=False):
            print("Predictions for image ({0}): {1}".format(
                i, All_Dropout_Classes[i]))

    Variation = np.zeros(shape=(data_size))

    for t in range(data_size):
        L = np.array([0])
        for d_iter in range(mc_dp):
            L = np.append(L, All_Dropout_Classes[t, d_iter + 1])
        Predicted_Class, Mode = mode(L[1:])
        v = np.array([1 - Mode / float(mc_dp)])
        Variation[t] = v

    if verbose > 1:
        print("Variation {0}:".format(data_size))
        for i in np.random.choice(data_size, 100, replace=False):
            print("Variation for image ({0}): {1}".format(i, Variation[i]))

    a_1d = Variation.flatten()
    x_pool_index = a_1d.argsort()[-query:][::-1]

    if config.debug:
        from .Common import debug_acquisition
        s_expected = generator.returnLabelsFromIndex(x_pool_index)
        #After transposition shape will be (classes,items,mc_dp)
        s_probs = all_probs[:mc_dp, x_pool_index].T
        debug_acquisition(s_expected, s_probs, generator.classes, cache_m,
                          config, fidp)

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's variation: {0}".format(a_1d[x_pool_index]))
        print("Maximum variation in pool: {0}".format(a_1d.max()))

    return x_pool_index
Example 19
def km_uncert(bayesian_model, generator, data_size, **kwargs):
    """
    Cluster data into K centroids and extract N samples from each cluster, based on maximum bayesian_varratios
    uncertainty.

    Function needs to extract the following configuration parameters:
    model <keras.Model>: model to use for predictions
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples
    mc_dp <int>: number of dropout iterations
    cpu_count <int>: number of cpu cores (used to define number of generator workers)
    gpu_count <int>: number of gpus available
    verbose <int>: verbosity level
    pbar <boolean>: use progress bars
    sw_threads <thread Object>: if a thread object is passed, wait for its conclusion before loading weights
    """
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    import importlib
    import copy
    import time
    from datetime import timedelta
    from Utils import CacheManager

    cache_m = CacheManager()

    if 'config' in kwargs:
        config = kwargs['config']
        gpu_count = config.gpu_count
        cpu_count = config.cpu_count
        verbose = config.verbose
        pbar = config.progressbar
        query = config.acquire
        clusters = config.clusters
    else:
        return None

    if 'acquisition' in kwargs:
        acq = kwargs['acquisition']
    else:
        acq = config.acquisition_steps

    if 'model' in kwargs:
        model = kwargs['model']
    else:
        print(
            "[km_uncert] GenericModel is needed by km_uncert. Set model kw argument"
        )
        return None

    ## UNCERTAINTY CALCULATION FIRST
    #Any uncertainty function could be used
    n_config = copy.copy(config)
    n_config.acquire = data_size
    kwargs['config'] = n_config
    un_function = getattr(importlib.import_module('AL'), config.un_function)
    un_indexes = un_function(bayesian_model, generator, data_size, **kwargs)

    #Models that take too long to save weights might not have finished
    if 'sw_thread' in kwargs:
        if config.ffeat is None and kwargs['sw_thread'].is_alive():
            if config.info:
                print(
                    "[km_uncert] Waiting for model weights to become available..."
                )
            kwargs['sw_thread'].join()
    elif config.info:
        print(
            "[km_uncert] Weights thread not available...trying to load weights"
        )

    if not os.path.isfile(model.get_weights_cache()) and not os.path.isfile(
            model.get_mgpu_weights_cache()):
        if config.info:
            print("[km_uncert] No trained model or weights file found")
        return None

    if config.recluster > 0 and acq > 0 and (acq % config.recluster) != 0:
        km, acquired = cache_m.load('clusters.pik')
        if config.info:
            print("[km_uncert] Loaded clusters from previous acquisition")
            #TODO: REMOVE
            print("Previous cluster size: {};\nAcquired: {}".format(
                km.labels_.shape, acquired.shape))
        km.labels_ = np.delete(km.labels_, acquired)
    else:
        #Run feature extraction and clustering
        if hasattr(model, 'build_extractor'):
            single_m, parallel_m = model.build_extractor(training=False,
                                                         feature=True,
                                                         parallel=False)
        else:
            if config.info:
                print(
                    "[km_uncert] Model is not prepared to produce features. No feature extractor"
                )
            return None

        #Model can be loaded from previous acquisition train or from a fixed final model
        if gpu_count > 1 and parallel_m is not None:
            pred_model = parallel_m
            if config.ffeat is not None and os.path.isfile(config.ffeat):
                pred_model.load_weights(config.ffeat, by_name=True)
                if config.info:
                    print("Model weights loaded from: {0}".format(
                        config.ffeat))
            else:
                pred_model.load_weights(model.get_mgpu_weights_cache(),
                                        by_name=True)
                if config.info:
                    print("Model weights loaded from: {0}".format(
                        model.get_mgpu_weights_cache()))
        else:
            pred_model = single_m
            if config.ffeat is not None and os.path.isfile(config.ffeat):
                pred_model.load_weights(config.ffeat, by_name=True)
                if config.info:
                    print("Model weights loaded from: {0}".format(
                        config.ffeat))
            else:
                pred_model.load_weights(model.get_weights_cache(),
                                        by_name=True)
                if config.info:
                    print("Model weights loaded from: {0}".format(
                        model.get_weights_cache()))

        #Extract features for all images in the pool
        if config.info:
            print("Starting feature extraction ({} batches)...".format(
                len(generator)))
        features = pred_model.predict_generator(generator,
                                                workers=4 * cpu_count,
                                                max_queue_size=100 * gpu_count,
                                                verbose=0)
        features = features.reshape(features.shape[0],
                                    np.prod(features.shape[1:]))

        if config.pca > 0:
            if config.info:
                print("Starting PCA decomposition...")

            pca = PCA(n_components=config.pca)
            features = pca.fit_transform(features)

        stime = None
        etime = None
        if config.verbose > 0:
            print("Done extraction...starting KMeans")
            stime = time.time()

        km = KMeans(n_clusters=clusters,
                    init='k-means++',
                    n_jobs=int(cpu_count / 2)).fit(features)

        if config.verbose > 0:
            etime = time.time()
            td = timedelta(seconds=(etime - stime))
            print("KMeans took {}".format(td))

    un_clusters = {k: [] for k in range(config.clusters)}

    #Distributes items in clusters in descending order of uncertainty
    for iid in un_indexes:
        un_clusters[km.labels_[iid]].append(iid)

    #Save clusters
    if config.save_var:
        fid = 'al-clustermetadata-{1}-r{0}.pik'.format(acq, model.name)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)
        cache_m.dump((generator.returnDataAsArray(), un_clusters, un_indexes),
                     fid)

    #If debug
    if config.debug:
        expected = generator.returnLabelsFromIndex()
        for k in range(len(un_clusters)):
            ind = np.asarray(un_clusters[k])
            print("Cluster {}, # of items: {}".format(k, ind.shape[0]))
            #Positions of the first (at most 30) cluster items in the uncertainty index array
            posa = np.array([
                np.where(un_indexes == ind[ii])[0][0]
                for ii in range(min(ind.shape[0], 30))
            ], dtype=np.int32)
            print(
                "Cluster {} first items positions in index array (at most 30): {}"
                .format(k, posa))
            #Check % of items of each class in cluster k
            c_labels = expected[ind]
            unique, count = np.unique(c_labels, return_counts=True)
            l_count = dict(zip(unique, count))
            if len(unique) > 2:
                print("Cluster {} items:".format(k))
                print("\n".join([
                    "label {0}: {1} items".format(key, l_count[key])
                    for key in unique
                ]))
            else:
                #Ensure both label counts exist before printing
                for lab in (0, 1):
                    l_count.setdefault(lab, 0)
                total = l_count[0] + l_count[1]
                print(
                    "Cluster {3} labels: {0} are 0; {1} are 1;\n - {2:.2f} are positives"
                    .format(l_count[0], l_count[1],
                            (l_count[1] / total) if total > 0 else 0.0, k))

    ac_count = 0
    acquired = []
    j = 0
    while ac_count < query:
        cln = (ac_count + j) % clusters
        q = un_clusters[cln]
        if len(q) > 0:
            acquired.append(q.pop(0))
            ac_count += 1
        else:
            if config.verbose > 0:
                print(
                    "[km_uncert] Cluster {} exhausted, will try to acquire image from cluster {}"
                    .format(cln, (cln + 1) % clusters))
            j += 1
            continue

    acquired = np.asarray(acquired)
    if config.recluster > 0:
        cache_m.dump((km, acquired), 'clusters.pik')

    return acquired
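
A minimal, self-contained sketch of the round-robin acquisition loop above, with hypothetical cluster contents (not tied to the framework). Note the loop assumes the clusters together hold at least `query` items; otherwise it would spin forever.

import numpy as np

#Hypothetical clusters: item indexes sorted by descending uncertainty
un_clusters = {0: [7, 2], 1: [5], 2: [9, 1, 4]}
clusters, query = 3, 4

ac_count, j, acquired = 0, 0, []
while ac_count < query:
    cln = (ac_count + j) % clusters
    q = un_clusters[cln]
    if q:
        acquired.append(q.pop(0))  #most uncertain item of this cluster
        ac_count += 1
    else:
        j += 1  #cluster exhausted, try the next one

print(np.asarray(acquired))  #-> [7 5 9 2]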
Example no. 20
class GenericDS(ABC):
    """
    Generic class for data feeders used to provide training points to Neural Nets.
    """
    def __init__(self, data_path, keepImg=False, config=None, name='Generic'):
        self.path = None
        if isinstance(data_path, str) and os.path.isdir(data_path):
            self.path = data_path
        else:
            raise ValueError(
                "[GenericDS] Path is not an existing directory ({0}).".format(
                    data_path))

        self.X = None
        self.Y = None
        self.name = name
        self.multi_dir = True
        self._cache = CacheManager()
        self._keep = keepImg
        self._cpu_count = config.cpu_count if config is not None else 1
        self._verbose = config.verbose if config is not None else 0
        self._pbar = config.progressbar if config is not None else False
        self._config = config

    @abstractmethod
    def _load_metadata_from_dir(self, d):
        pass

    @abstractmethod
    def change_root(self, imgv, path):
        """
        Check if SegImage paths in imgv are rooted at the same base dir as path. If not, return corrected paths (see check_paths).
        """
        pass

    def check_paths(self, imgv, path):

        for s in imgv:
            s.setPath(self.change_root(s.getPath(), path))

    def get_dataset_dimensions(self, X=None):
        """
        Returns the dimensions of the images in the dataset. Image dimensions may vary.
        WARNING: checking a big dataset in full would take forever, so for now only a sample of the images is checked.
        TODO: Reimplement this function to be fully parallel (threads, in this case).

        Return: SORTED list of tuples (# samples,width,height,channels)
        """

        cache_m = CacheManager()
        reload_data = False
        if cache_m.checkFileExistence('data_dims.pik'):
            try:
                dims, name = cache_m.load('data_dims.pik')
            except ValueError:
                name = ''
                reload_data = True
            if name != self.name:
                reload_data = True
        else:
            reload_data = True

        if reload_data:
            dims = set()
            if X is None and self.X is None:
                return None
            elif X is None:
                X = self.X

            samples = len(X)
            if self._config.info:
                print(
                    "Checking a sample of dataset images for different dimensions..."
                )

            #Inspect at most 2% of the images, capped at 5000
            upper_limit = min(int(0.02 * samples), 5000)
            for seg in random.sample(X, upper_limit):
                dims.add((samples, ) + seg.getImgDim())
            cache_m.dump((dims, self.name), 'data_dims.pik')

        return sorted(dims)

    def _run_multiprocess(self, data):
        """
        This method should not be called directly. It's intended
        only for multiprocess metadata loading.
        """
        X, Y = ([], [])
        for item in data:
            t_x, t_y = self._load_metadata_from_dir(item)
            X.extend(t_x)
            Y.extend(t_y)

        return (X, Y)

    def _split_data(self, split, X, Y):
        """
        Splits data into N sets, one per fraction in split. Returns a list
        [(x1,y1),(x2,y2),...,(xN,yN)] with the divided data.
        """
        if sum(split) == 1.0:
            it_count = 0
            split_data = []
            start_idx = 0
            samples = len(X)
            for frac in split:
                it_count = int(frac * samples)
                split_data.append((X[start_idx:start_idx + it_count],
                                   Y[start_idx:start_idx + it_count]))
                start_idx += it_count
            return split_data

        else:
            raise ValueError(
                "[GenericDatasource] Splitting fractions must sum to 1.0")

    def _run_dir(self, path):

        dlist = []
        files = os.listdir(path)
        X, Y = ([], [])

        if self.multi_dir:
            for f in files:
                item = os.path.join(path, f)
                if os.path.isdir(item):
                    dlist.append(item)

            mdata = multiprocess_run(self._run_multiprocess,
                                     tuple(),
                                     dlist,
                                     self._cpu_count,
                                     self._pbar,
                                     step_size=1,
                                     output_dim=2,
                                     txt_label='directories',
                                     verbose=self._verbose)

        else:
            mdata = self._load_metadata_from_dir(self.path)

        X.extend(mdata[0])  #samples
        Y.extend(mdata[1])  #labels

        X, Y = self._shuffle(X, Y)
        return X, Y

    def _shuffle(self, X, Y):
        #Shuffle samples and labels together, keeping the X-Y correspondence
        combined = list(zip(X, Y))
        random.shuffle(combined)
        X[:], Y[:] = zip(*combined)

        return X, Y

    def split_metadata(self, split, data=None):
        """
        Returns all metadata split into N sets, defined by the splitting tuple
        
        @param data <tuple>: (X,Y) if provided, split this sequence. Else, split full metadata
        """
        if data is None:
            return self._split_data(split, self.X, self.Y)
        elif len(data) == 2:
            return self._split_data(split, data[0], data[1])
        else:
            return None

    def load_metadata(self, metadata_file='metadata.pik'):
        """
        Iterates over data patches and creates an instance of a GenericImage subclass for each one
        Returns a tuple of lists (X,Y): X instances of GenericImage subclasses, Y labels;

        OBS: Dataset metadata is shuffled once here. Random sample generation is done during training.
        """

        X, Y = (None, None)
        reload_data = False
        reshuffle = False

        if self._cache.checkFileExistence('split_ratio.pik'):
            split = self._cache.load('split_ratio.pik')
            if self._config.split != split:
                #Dump old data
                reshuffle = True
                if self.X is not None or self.Y is not None:
                    del (self.X)
                    del (self.Y)
                    self.X = None
                    self.Y = None

                if self._config.info:
                    print(
                        "Previous split ratio {} is different from requested one {}. Metadata will be reshuffled."
                        .format(split, self._config.split))

        if self._cache.checkFileExistence(metadata_file) and not reload_data:
            try:
                X, Y, name = self._cache.load(metadata_file)
            except ValueError:
                name = ''
                reload_data = True
            if name != self.name:
                reload_data = True

            if not reload_data and not reshuffle and self._verbose > 0:
                print(
                    "[GenericDatasource] Loaded split data cache. Used previously defined splitting."
                )
        else:
            reload_data = True

        if reload_data:
            X, Y = self._run_dir(self.path)
        elif reshuffle:
            X, Y = self._shuffle(X, Y)

        if reload_data or reshuffle:
            self._cache.dump((X, Y, self.name), metadata_file)
            self._cache.dump(tuple(self._config.split), 'split_ratio.pik')

        self.X = X.copy()
        self.Y = Y.copy()
        return X, Y

    def load_data(self, split=None, keepImg=False, data=None):
        """
        Actually reads images and returns data ready for training
        Returns two tuples of NP arrays (X,Y): X data points, Y labels;

        @param split <tuple>: items are splitting fractions

        If a splitting ratio is provided, return a list of tuples of size at most 3:
        1 - Train;
        2 - Validation;
        3 - Test;
        
        @param keepImg <bool>: Keep image data in memory
        @param data <tuple>: metadata defining images to load. If not provided, full dataset is used.
        """

        if data is None and (self.X is None or self.Y is None):
            if self._verbose > 0:
                print("[GenericDatasource] Metadata not ready, loading...")
            self.load_metadata()

        #Which data to use?
        X, Y = None, None
        if data is None:
            X = self.X
            Y = self.Y
        else:
            X, Y = data

        if self._config.pred_size > 0:
            samples = self._config.pred_size
        else:
            samples = len(X)
        y = np.array(Y[:samples], dtype=np.int32)
        if self._config.tdim is not None and len(self._config.tdim) == 2:
            img_dim = tuple(self._config.tdim) + (3, )
        else:
            dataset_dim = self.get_dataset_dimensions(X)[0]
            img_dim = dataset_dim[1:]
        X_data = np.zeros(shape=(samples, ) + img_dim, dtype=np.float32)

        counter = 0
        futures = []

        executor = concurrent.futures.ThreadPoolExecutor(max_workers=7)
        for i in range(samples):
            futures.append(
                executor.submit(X[i].readImage, keepImg, img_dim,
                                self._verbose))

        if self._pbar:
            l = tqdm(desc="Reading images...", total=samples, position=0)
        elif self._config.info:
            print("Reading images...")

        #for future in concurrent.futures.as_completed(futures):
        for i in range(samples):
            X_data[i] = futures[i].result()
            if self._pbar:
                l.update(1)
            elif self._verbose > 0:
                print(".", end='')

        if self._pbar:
            l.close()
        elif self._verbose > 0:
            print('\n')

        if split is None:
            return (X_data, y)
        else:
            return self._split_data(split, X_data, y)
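
    # Note on load_data above: futures are kept in submission order, so
    # futures[i] always corresponds to X[i] and each result lands in the right
    # row of the preallocated X_data array (at the cost of not consuming
    # results in completion order).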

    def sample_metadata(self, k):
        """
        Produces a sample of the full metadata with k items. Returns a cached sample if one exists

        @param k <int>: total number of samples
        @param k <float>: fraction of the whole dataset

        Return:
        - tuple (X,Y): X and Y have k elements
        """

        reload_data = False
        s_x, s_y = (None, None)
        if self._cache.checkFileExistence('sampled_metadata.pik'):
            try:
                s_x, s_y, name = self._cache.load('sampled_metadata.pik')
            except ValueError:
                name = ''
                reload_data = True
            if name != self.name:
                reload_data = True

            #Check if we have the desired number of items
            if k <= 1.0:
                k = int(k * len(self.X))
            else:
                k = int(k)
            if not reload_data and k != len(s_x):
                if self._config.info:
                    print(
                        "Saved samples are different from requested ({} x {}). Resampling..."
                        .format(k, len(s_x)))
                reload_data = True

            if not reload_data and self._verbose > 0:
                print(
                    "[GenericDatasource] Loaded sampled metadata from cache. Using the previously generated sample."
                )
        else:
            reload_data = True

        if reload_data and (self.X is None or self.Y is None):
            if self._config.verbose > 1:
                print("[GenericDatasource] Run load_metadata first!")
            return None

        if reload_data:
            if k <= 1.0:
                k = int(k * len(self.X))
            else:
                k = int(k)

            samples = np.random.choice(range(len(self.X)), k, replace=False)

            s_x = [self.X[s] for s in samples]
            s_y = [self.Y[s] for s in samples]

        #Save last generated sample
        self._cache.dump((s_x, s_y, self.name), 'sampled_metadata.pik')
        return (s_x, s_y)
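
For reference, a minimal hypothetical concrete subclass showing the two abstract hooks GenericDS expects. The directory layout, the PImage item class and the label encoding are assumptions for illustration, not part of the framework; note that check_paths() feeds change_root() the result of getPath(), i.e. a path string.

import os

class FolderPerClassDS(GenericDS):
    """Hypothetical feeder: one subdirectory per class label (e.g. 'class_1')."""

    def _load_metadata_from_dir(self, d):
        #One item per image file; label parsed from the directory name
        X, Y = [], []
        label = int(os.path.basename(d).split('_')[-1])
        for f in os.listdir(d):
            X.append(PImage(os.path.join(d, f), keepImg=self._keep))
            Y.append(label)
        return X, Y

    def change_root(self, imgv, path):
        #Rebase a stored image path onto the current dataset root
        return os.path.join(path, os.path.basename(os.path.dirname(imgv)),
                            os.path.basename(imgv))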
Example no. 21
    def configure_sets(self):
        """
        Creates the initial sets: training (X,Y); example pool; validation set; test set

        All sets are kept as NP arrays
        """
        X,Y = self._ds.load_metadata()

        #Use a sample of the metadata if so instructed
        if self._config.sample != 1.0:
            X,Y = self._ds.sample_metadata(self._config.sample)
            self._ds.check_paths(X,self._config.predst)

        if self._config.balance:
            X,Y = self._balance_classes(X,Y)
            if self._config.info:
                print("[ALTrainer] Using a balanced initial dataset for AL ({} total elements).".format(len(X)))
        elif self._config.info:
            print("[ALTrainer] Using an UNBALANCED initial dataset for AL ({} total elements).".format(len(X)))
            
        #Test set is extracted from the last items and is not changed for the whole run
        t_idx = int(self._config.split[-1] * len(X))
        self.test_x = X[- t_idx:]
        self.test_y = Y[- t_idx:]

        self.pool_x = X[:-t_idx]
        self.pool_y = Y[:-t_idx]

        #Initial training set will be chosen at random from the pool
        cache_m = CacheManager()
        if self._config.load_train and not self._config.balance:
            train_idx = cache_m.load('initial_train.pik')
            if train_idx is not None and self._config.info:
                print("[ALTrainer] Using initial training set from {}. This is DANGEROUS. Use the metadata corresponding to the initial set.".format('initial_train.pik'))
            
        else:
            if not self._config.load_train and self._config.balance and self._config.info:
                print("[ALTrainer] Dataset balancing and initial train set loading not possible at the same time.")
                
            train_idx = np.random.choice(len(self.pool_x),self._config.init_train,replace=False)
            cache_m.dump(train_idx,'initial_train.pik')
            
        pool_ar_x = np.asarray(self.pool_x)
        pool_ar_y = np.asarray(self.pool_y)
        self.train_x = pool_ar_x[train_idx]
        self.train_y = pool_ar_y[train_idx]

        #Remove chosen elements from the pool
        self.pool_x = np.delete(pool_ar_x,train_idx)
        self.pool_y = np.delete(pool_ar_y,train_idx)
        del(pool_ar_x)
        del(pool_ar_y)
        
        #Initial validation set - keeps the same split ratio for train/val as defined in the configuration
        val_samples = int((self._config.init_train*self._config.split[1])/self._config.split[0])
        val_samples = max(val_samples,100)
        val_idx = np.random.choice(self.pool_x.shape[0],val_samples,replace=False)
        self.val_x = self.pool_x[val_idx]
        self.val_y = self.pool_y[val_idx]
        self.pool_x = np.delete(self.pool_x,val_idx)
        self.pool_y = np.delete(self.pool_y,val_idx)
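
A worked example of the set-size arithmetic above, assuming split=(0.8, 0.1, 0.1), 1000 metadata items and init_train=100 (all values hypothetical):

split = (0.8, 0.1, 0.1)
total, init_train = 1000, 100

t_idx = int(split[-1] * total)                          #100 test items, fixed for the whole run
pool = total - t_idx                                    #900 items left in the pool
val = max(int(init_train * split[1] / split[0]), 100)   #12.5 -> 12, raised to the 100 minimum
print(t_idx, init_train, val, pool - init_train - val)  #-> 100 100 100 700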
Example no. 22
class VGG16(GenericModel):
    """
    Implements abstract methods from GenericModel.
    Produces a VGG16 model as implemented by Keras; the final FC layers
    are substituted by Conv2D layers, as defined in:
    https://github.com/ALSM-PhD/quip_classification/blob/master/NNFramework_TF/sa_networks/vgg.py
    """
    def __init__(self, config, ds, name=None):
        super().__init__(config, ds, name=name)
        if name is None:
            self.name = "VGG16_A1"
        self._modelCache = "{0}-model.h5".format(self.name)
        self._weightsCache = "{0}-weights.h5".format(self.name)
        self._mgpu_weightsCache = "{0}-mgpu-weights.h5".format(self.name)
        self.cache_m = CacheManager()
        self.cache_m.registerFile(
            os.path.join(config.model_path, self._modelCache),
            self._modelCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._weightsCache),
            self._weightsCache)
        self.cache_m.registerFile(
            os.path.join(config.weights_path, self._mgpu_weightsCache),
            self._mgpu_weightsCache)

    def get_model_cache(self):
        """
        Returns path to model cache
        """
        return self.cache_m.fileLocation(self._modelCache)

    def get_weights_cache(self):
        """
        Returns path to weights cache
        """
        return self.cache_m.fileLocation(self._weightsCache)

    def get_mgpu_weights_cache(self):
        """
        Returns path to multi-GPU weights cache
        """
        return self.cache_m.fileLocation(self._mgpu_weightsCache)

    def build(self, **kwargs):
        """
        Returns a VGG16 model instance; final fully-connected layers are substituted by Conv2Ds
        
        @param pre_trained <boolean>: returned model should be pre-trained or not
        """
        width, height, channels = self._check_input_shape()

        if backend.image_data_format() == 'channels_first':
            input_shape = (channels, height, width)
        else:
            input_shape = (height, width, channels)

        if 'data_size' in kwargs:
            self.data_size = kwargs['data_size']

        self.cache_m = CacheManager()

        model = self._build_architecture(input_shape)

        #Check if previous training and LR is saved, if so, use it
        lr_cache = "{0}_learning_rate.txt".format(self.name)
        self.cache_m.registerFile(os.path.join(self._config.cache, lr_cache),
                                  lr_cache)
        l_rate = 0.0005
        if os.path.isfile(self.cache_m.fileLocation(
                lr_cache)) and not self._config.new_net:
            l_rate = float(self.cache_m.read(lr_cache))
            if self._config.info:
                print("Found previous learning rate: {0}".format(l_rate))

        sgd = optimizers.SGD(lr=l_rate,
                             decay=1.5e-4,
                             momentum=0.9,
                             nesterov=True)
        #adam = optimizers.Adam(lr = l_rate)

        #Return parallel model if multiple GPUs are available
        parallel_model = None

        if self._config.gpu_count > 1:
            with tf.device('/cpu:0'):
                model.compile(loss='categorical_crossentropy',
                              optimizer=sgd,
                              metrics=['accuracy'])

            parallel_model = multi_gpu_model(model,
                                             gpus=self._config.gpu_count)
            parallel_model.compile(
                loss='categorical_crossentropy',
                optimizer=sgd,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )
        else:
            model.compile(
                loss='categorical_crossentropy',
                optimizer=sgd,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )

        self.single = model
        self.parallel = parallel_model

        return (model, parallel_model)

    def _build_architecture(self, input_shape):
        original_vgg16 = vgg16.VGG16(
            weights=self.cache_m.fileLocation('vgg16_weights_notop.h5'),
            include_top=False,
            input_shape=input_shape)

        #Freeze initial layers, except for the last 3:
        #for layer in original_vgg16.layers[:-2]:
        #    layer.trainable = False

        model = Sequential()
        model.add(original_vgg16)
        model.add(
            Convolution2D(4096, (7, 7),
                          strides=1,
                          padding='valid',
                          kernel_initializer='he_normal'))
        model.add(Activation('relu'))
        model.add(Dropout(0.75))
        model.add(
            Convolution2D(4096, (1, 1),
                          strides=1,
                          padding='valid',
                          kernel_initializer='he_normal'))
        model.add(Activation('relu'))
        model.add(Dropout(0.75))
        model.add(
            Convolution2D(self._ds.nclasses, (1, 1),
                          strides=1,
                          padding='valid',
                          kernel_initializer='he_normal'))
        model.add(Flatten())
        model.add(Dense(self._ds.nclasses))
        model.add(Activation('softmax'))

        return model
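
A quick, framework-free check of why the 7x7 and 1x1 convolutions above act as fully-connected layers (assuming 224x224 inputs, so the VGG16 backbone emits a 7x7x512 feature map):

def conv_out(n, k, s=1):
    """Output spatial size of a 'valid' convolution along one axis."""
    return (n - k) // s + 1

h = 224
for _ in range(5):      #the five VGG16 max-pool stages halve the map
    h //= 2
print(h)                #-> 7: backbone output is (7, 7, 512)
print(conv_out(h, 7))   #-> 1: Conv2D(4096, (7,7)) == Dense(4096) on the flattened map
print(conv_out(1, 1))   #-> 1: the following 1x1 convs keep acting as dense layers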
Example no. 23
def main_exec(config):
    """
    Main execution line. Dispatch processes according to parameter groups.
    Multiple processes here prevent main process from consuming too much memory.
    """

    if not os.path.isdir(config.bdir):
        os.mkdir(config.bdir)

    if not os.path.isdir(config.weights_path):
        os.mkdir(config.weights_path)

    if not os.path.isdir(config.model_path):
        os.mkdir(config.model_path)

    if not os.path.isdir(config.cache):
        os.mkdir(config.cache)

    if not os.path.isdir(config.logdir):
        os.mkdir(config.logdir)

    if config.preprocess:
        if config.img_type is None:
            imgt = img_types
        else:
            imgt = config.img_type

        if config.multiprocess:
            proc = Process(target=Preprocess.preprocess_data,
                           args=(config, imgt))
            proc.start()
            proc.join()

            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            Preprocess.preprocess_data(config, imgt)

    if config.train:
        if not os.path.isdir(config.weights_path):
            os.mkdir(config.weights_path)
        if not os.path.isdir(config.model_path):
            os.mkdir(config.model_path)

        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=GenericTrainer.run_training,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()

            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            GenericTrainer.run_training(config, None)

    if config.al:
        if not os.path.isdir(config.weights_path):
            os.mkdir(config.weights_path)
        if not os.path.isdir(config.model_path):
            os.mkdir(config.model_path)

        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=ALTrainer.run_training,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()

            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            ts = importlib.import_module('Trainers', config.strategy)
            getattr(ts, config.strategy).run_training(config, None)

    if config.pred:
        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=Predictions.run_prediction,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()

            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            Predictions.run_prediction(config, None)

    if config.postproc:
        pass

    if config.runtest:
        if config.tmode == 0:
            pass
        elif config.tmode == 1:
            #Run train test
            TrainTest.run(config)
        elif config.tmode == 2:
            DatasourcesTest.run(config)
        elif config.tmode == 3:
            PredictionTest.run(config)
        elif config.tmode == 4:
            ActiveLearningTest.run(config)

    if not (config.preprocess or config.train or config.postproc or config.pred
            or config.runtest):
        print(
            "The problem begins with choice: preprocess, train, postprocess or predict"
        )
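
The four dispatch branches above repeat one pattern: run the stage in a spawned child process so its memory is reclaimed on exit, then propagate failures. A condensed sketch of that pattern (the dispatch helper itself is hypothetical; CacheManager and Exitcodes come from the framework):

import sys
import multiprocessing as mp

def dispatch(target, config, multiprocess):
    """Run a stage inline or in a spawned child, checking its exit code."""
    if not multiprocess:
        return target(config, None)
    ctx = mp.get_context('spawn')
    proc = ctx.Process(target=target,
                       args=(config, CacheManager().getLocations()))
    proc.start()
    proc.join()
    if proc.exitcode != Exitcodes.ALL_GOOD:
        print("System did not end well. Check logs or raise the verbosity level.")
        sys.exit(proc.exitcode)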
Example no. 25
    def _build(self, **kwargs):
        """
        @param pre_trained <boolean>: returned model should be pre-trained or not
        @param data_size <int>: size of the training dataset
        """
        width, height, channels = self._check_input_shape()

        if 'data_size' in kwargs:
            self.data_size = kwargs['data_size']

        training = kwargs.get('training', True)
        feature = kwargs.get('feature', False)

        if backend.image_data_format() == 'channels_first':
            input_shape = (channels, height, width)
        else:
            input_shape = (height, width, channels)

        self.cache_m = CacheManager()

        model = self._build_architecture(input_shape, training, feature)

        #Check if previous training and LR is saved, if so, use it
        lr_cache = "{0}_learning_rate.txt".format(self.name)
        self.cache_m.registerFile(os.path.join(self._config.cache, lr_cache),
                                  lr_cache)
        l_rate = 0.0005
        if os.path.isfile(self.cache_m.fileLocation(
                lr_cache)) and not self._config.new_net:
            l_rate = float(self.cache_m.read(lr_cache))
            if self._config.info:
                print("Found previous learning rate: {0}".format(l_rate))

        #opt = optimizers.SGD(lr=l_rate, decay=1.5e-4, momentum=0.9, nesterov=True)
        #opt = optimizers.Adam(lr = l_rate)
        opt = optimizers.Adadelta()

        #Return parallel model if multiple GPUs are available
        parallel_model = None

        if self._config.gpu_count > 1:
            with tf.device('/cpu:0'):
                model.compile(loss='categorical_crossentropy',
                              optimizer=opt,
                              metrics=['accuracy'])

            parallel_model = multi_gpu_model(model,
                                             gpus=self._config.gpu_count)
            parallel_model.compile(
                loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )
        else:
            model.compile(
                loss='categorical_crossentropy',
                optimizer=opt,
                metrics=['accuracy'],
                #options=p_opt,
                #run_metadata=p_mtd
            )

        return (model, parallel_model)
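
Hedged usage sketch: with training=False the architecture is built for inference and with feature=True it exposes a feature output, which is how the earlier km_uncert example obtains its extractor (the net variable and the surrounding setup are hypothetical):

single_m, parallel_m = net._build(training=False, feature=True, data_size=1000)
pred_model = parallel_m if parallel_m is not None else single_m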
Example no. 26
    def run_test(self, model, x_test=None, y_test=None, load_full=True):
        """
        This should be executed after a model has been trained
        """

        cache_m = CacheManager()
        split = None
        if os.path.isfile(cache_m.fileLocation('split_ratio.pik')):
            split = cache_m.load('split_ratio.pik')
        else:
            print(
                "[Predictor] A previously trained model and dataset should exist. No previously defined spliting found."
            )
            return Exitcodes.RUNTIME_ERROR

        #Priority is for given data as parameters. If None is given, try to load metadata as configured
        if x_test is None or y_test is None:
            if self._config.testdir is None:
                #Load sampled data if required by command line
                if self._config.sample < 1.0:
                    _, _, (x_test, y_test) = self._ds.split_metadata(
                        split=split,
                        data=self._ds.sample_metadata(self._config.sample))
                else:
                    _, _, (x_test, y_test) = self._ds.split_metadata(split)
            else:
                x_test, y_test = self._ds._run_dir(self._config.testdir)

        if self._config.verbose > 0:
            unique, count = np.unique(y_test, return_counts=True)
            l_count = dict(zip(unique, count))
            if len(unique) > 2:
                print("Test items:")
                print("\n".join([
                    "label {0}: {1} items".format(key, l_count[key])
                    for key in unique
                ]))
            else:
                #Ensure both label counts exist before printing
                for lab in (0, 1):
                    l_count.setdefault(lab, 0)
                print(
                    "Test labels: {0} are 0; {1} are 1;\n - {2:.2f} are positives"
                    .format(l_count[0], l_count[1],
                            (l_count[1] / (l_count[0] + l_count[1]))))
            print("Test set: {} items".format(len(y_test)))

        X, Y = self._ds.load_data(data=(x_test, y_test), keepImg=self._keep)
        if self._config.verbose > 1:
            print("Y original ({1}):\n{0}".format(Y, Y.shape))
        Y = to_categorical(Y, self._ds.nclasses)

        # session setup
        sess = K.get_session()
        ses_config = tf.ConfigProto(
            device_count={
                "CPU": self._config.cpu_count,
                "GPU": self._config.gpu_count
            },
            intra_op_parallelism_threads=self._config.cpu_count
            if self._config.gpu_count == 0 else self._config.gpu_count,
            inter_op_parallelism_threads=self._config.cpu_count
            if self._config.gpu_count == 0 else self._config.gpu_count,
            log_device_placement=self._verbose > 1)
        sess.config = ses_config
        K.set_session(sess)

        #During test phase multi-gpu mode is not used (maybe done later)
        if self._ensemble:
            #Weights should be loaded during ensemble build
            if hasattr(model, 'build_ensemble'):
                pred_model = model.build_ensemble(training=False, npfile=True)
            else:
                if self._config.info:
                    print(
                        '[Predictor] Model not prepared to build ensembles; implement it or choose another model'
                    )
                return None
        elif load_full and os.path.isfile(model.get_model_cache()):
            try:
                pred_model = load_model(model.get_model_cache())
                if self._config.info:
                    print("Model loaded from: {0}".format(
                        model.get_model_cache()))
            except ValueError:
                pred_model, _ = model.build(training=False, pre_load_w=False)
                pred_model.load_weights(model.get_weights_cache())
                if self._config.info:
                    print("Model weights loaded from: {0}".format(
                        model.get_weights_cache()))
        elif os.path.isfile(model.get_weights_cache()):
            pred_model, _ = model.build(training=False, pre_load_w=False)
            pred_model.load_weights(model.get_weights_cache())
            if self._config.info:
                print("Model weights loaded from: {0}".format(
                    model.get_weights_cache()))

        else:
            if self._config.info:
                print("No trained model or weights file found")
            return None

        bsize = self._config.batch_size
        stp = int(np.ceil(len(X) / bsize))  #number of batches, rounding up

        image_generator = ImageDataGenerator(
            samplewise_center=self._config.batch_norm,
            samplewise_std_normalization=self._config.batch_norm)

        if self._ensemble:
            if self._config.tdim is not None:
                fix_dim = self._config.tdim
            else:
                fix_dim = self._ds.get_dataset_dimensions()[0][
                    1:]  #Only smallest image dimensions matter here
            test_generator = SingleGenerator(
                dps=(X, Y),
                classes=self._ds.nclasses,
                dim=fix_dim,
                batch_size=self._config.batch_size,
                image_generator=image_generator,
                extra_aug=self._config.augment,
                shuffle=False,
                verbose=self._verbose,
                input_n=self._config.emodels)
        else:
            test_generator = image_generator.flow(x=X,
                                                  y=Y,
                                                  batch_size=bsize,
                                                  shuffle=False)

        if self._config.progressbar:
            l = tqdm(desc="Making predictions...", total=stp)

        Y_pred = np.zeros((len(X), self._ds.nclasses), dtype=np.float32)
        for i in range(stp):
            start_idx = i * bsize
            example = test_generator.next()
            Y_pred[start_idx:start_idx + bsize] = pred_model.predict_on_batch(
                example[0])
            if self._config.progressbar:
                l.update(1)
            elif self._config.info:
                print("Batch prediction ({0}/{1})".format(i, stp))
            if self._config.verbose > 1:
                if not np.array_equal(Y[start_idx:start_idx + bsize],
                                      example[1]):
                    print(
                        "Datasource label ({0}) and batch label ({1}) differ".
                        format(Y[start_idx:start_idx + bsize], example[1]))

        del (X)
        del (test_generator)

        if self._config.progressbar:
            l.close()

        y_pred = np.argmax(Y_pred, axis=1)
        expected = np.argmax(Y, axis=1)

        if self._config.verbose > 0:
            if self._config.verbose > 1:
                np.set_printoptions(threshold=np.inf)
                print("Predicted probs ({1}):\n{0}".format(
                    Y_pred, Y_pred.shape))
            #print("Y ({1}):\n{0}".format(Y,Y.shape))
            print("expected ({1}):\n{0}".format(expected, expected.shape))
            print("Predicted ({1}):\n{0}".format(y_pred, y_pred.shape))

        #Save predictions
        cache_m.dump((expected, Y_pred, self._ds.nclasses), 'test_pred.pik')

        #Output metrics
        print_prediction(self._config)
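
A minimal sketch of the batched prediction loop used above, isolated from the framework (the predict_fn callable and the toy data are hypothetical):

import numpy as np

def predict_in_batches(predict_fn, X, nclasses, bsize):
    """Fill a preallocated result array batch by batch; the last batch may be short."""
    steps = int(np.ceil(len(X) / bsize))
    Y_pred = np.zeros((len(X), nclasses), dtype=np.float32)
    for i in range(steps):
        s = i * bsize
        Y_pred[s:s + bsize] = predict_fn(X[s:s + bsize])
    return Y_pred

#Toy check with a constant "model": 5 items, batch size 2 -> 3 batches
out = predict_in_batches(lambda b: np.full((len(b), 2), 0.5), np.zeros((5, 3)), 2, 2)
print(out.shape)  #-> (5, 2)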