Exemple #1
0
def preprocess_data(config, img_types):
    """
    Entry point of the preprocessing stage.

    Validates the source/destination directories, builds (or restores from
    cache) the description of the source images and finally tiles or
    normalizes them, as requested by the configuration.

    config: configuration object; reads presrc, predst, verbose, tcga, tile,
        multiprocess and normalize.
    img_types: forwarded untouched to GenericData.ImageSource.

    Terminates the interpreter with Exitcodes.PATH_ERROR if config.presrc
    does not exist.
    """
    src_dir = config.presrc

    #Source directory is mandatory; destination is created on demand
    if not os.path.exists(src_dir):
        if config.verbose > 0:
            print("[Preprocess] No such directory: {0}".format(src_dir))
        sys.exit(Exitcodes.PATH_ERROR)
    if not os.path.exists(config.predst):
        os.makedirs(config.predst)

    #A previous scan of the source directory is reused whenever possible
    cache_m = CacheManager(verbose=config.verbose)
    datatree = None
    if config.tcga:
        datatree = cache_m.load('tcga.pik')
        if datatree is None:
            datatree = TCGAMerger.Merger(src_dir, config.verbose)
            cache_m.dump(datatree, 'tcga.pik')
    else:
        if cache_m.checkFileExistence('datatree.pik'):
            imglist, lablist, cached_src = cache_m.load('datatree.pik')
            #The cached scan is only valid for the same source directory
            if cached_src == src_dir:
                datatree = GenericData.ImageSource((imglist, lablist),
                                                   src_dir, img_types)

        if datatree is None:
            datatree = GenericData.ImageSource(None, src_dir, img_types)
            scan = (datatree.getData(), datatree.getLabelsList(), src_dir)
            cache_m.dump(scan, 'datatree.pik')

    #Normalization only happens when tiling was not requested
    if not config.tile:
        if config.normalize is not None:
            make_singleprocessnorm(datatree, config)
        return

    #Produce tiles from input images
    #TODO: implement parallel tiling, choose between multiprocess tiling (multiple images processed in parallel)
    #or single process (one image at a time, but work divided in threads)
    if config.multiprocess:
        multiprocess_run(make_singleprocesstiling, (config, ),
                         datatree,
                         step_size=20)
    else:
        make_singleprocesstiling(datatree, config)
    def get_dataset_dimensions(self, X=None):
        """
        Returns the dimensions of the images in the dataset. It's possible to have different image dimensions.
        WARNING: big datasets will take forever to run. For now, checks a sample of the images.
        TODO: Reimplement this function to be fully parallel (threads in case).

        The result is cached in 'data_dims.pik' together with the dataset name and is
        recomputed when the cache is missing, unreadable or belongs to another dataset.

        X: optional sequence of images (each one exposing getImgDim()); defaults to self.X.

        Return: SORTED list of tuples (# samples,width,height,channels), or None if
        there is no data to inspect.
        """

        cache_m = CacheManager()
        dims = None
        reload_data = True
        if cache_m.checkFileExistence('data_dims.pik'):
            try:
                dims, name = cache_m.load('data_dims.pik')
            except (ValueError, TypeError):
                #Cache content is not a (dims, name) pair (e.g. load returned None)
                reload_data = True
            else:
                #Only compare names when unpacking succeeded, otherwise name is unbound
                reload_data = dims is None or name != self.name

        if reload_data:
            dims = set()
            if X is None and self.X is None:
                return None
            elif X is None:
                X = self.X

            samples = len(X)
            if self._config is not None and self._config.info:
                print(
                    "Checking a sample of dataset images for different dimensions..."
                )

            #Inspect about 2% of the images, at most 5000. Small datasets would
            #otherwise sample zero images, so check at least one when data exists.
            s_number = min(int(0.02 * samples), 5000)
            s_number = min(samples, max(s_number, 1))
            for seg in random.sample(X, s_number):
                dims.add((samples, ) + seg.getImgDim())
            cache_m.dump((dims, self.name), 'data_dims.pik')

        return sorted(dims)
Exemple #3
0
    def _save_weights(self, model, single, parallel, clear_sess, save_numpy):
        """
        Stores the weights of the single tower model and, when defined, of the multi GPU model.

        model: project model wrapper that provides the cache paths.
        single: single tower network; parallel: multi GPU network or None.
        clear_sess: if set, the Keras session is cleared after saving.
        save_numpy: if set, weights are stored with np.save whenever the model
            exposes the corresponding numpy cache getter.

        The current split ratio is also dumped to 'split_ratio.pik'.
        Return: Exitcodes.ALL_GOOD
        """
        cache_m = CacheManager()
        if self._config.info:
            print("Saving weights, this could take a while...")

        #Single tower model
        numpy_single = save_numpy and hasattr(model, 'get_npweights_cache')
        if not numpy_single:
            single.save_weights(model.get_weights_cache())
            single.save(model.get_model_cache())
        else:
            np.save(model.get_npweights_cache(), single.get_weights())

        #Multi GPU model, only when it exists and has a cache path
        if parallel is not None and model.get_mgpu_weights_cache() is not None:
            numpy_parallel = save_numpy and hasattr(model,
                                                    'get_npmgpu_weights_cache')
            if not numpy_parallel:
                parallel.save_weights(model.get_mgpu_weights_cache())
            else:
                np.save(model.get_npmgpu_weights_cache(),
                        parallel.get_weights())

        cache_m.dump(tuple(self._config.split), 'split_ratio.pik')

        if clear_sess:
            K.clear_session()

        return Exitcodes.ALL_GOOD
Exemple #4
0
    def configure_sets(self):
        """
        Creates the initial sets: training (X,Y); example pool; validation set; test set

        Metadata comes from self._ds (optionally sampled and/or class balanced). The last
        items are reserved as the test set; the initial training set and the validation
        set are drawn at random, without replacement, from the remaining pool.

        Pool, training and validation sets are kept as NP arrays.
        """
        info = self._config.info
        X,Y = self._ds.load_metadata()

        #Use a sample of the metadata if so instructed
        if self._config.sample != 1.0:
            X,Y = self._ds.sample_metadata(self._config.sample)
            self._ds.check_paths(X,self._config.predst)

        if self._config.balance:
            X,Y = self._balance_classes(X,Y)
            if info:
                print("[ALTrainer] Using a balanced initial dataset for AL ({} total elements).".format(len(X)))
        elif info:
            print("[ALTrainer] Using an UNBALANCED initial dataset for AL ({} total elements).".format(len(X)))

        #Test set is extracted from the last items and is not changed for the whole run
        t_idx = int(self._config.split[-1] * len(X))
        if t_idx > 0:
            self.test_x = X[- t_idx:]
            self.test_y = Y[- t_idx:]
            self.pool_x = X[:-t_idx]
            self.pool_y = Y[:-t_idx]
        else:
            #X[-0:] is the whole sequence: an empty test set must not swallow the pool
            self.test_x = X[:0]
            self.test_y = Y[:0]
            self.pool_x = X[:]
            self.pool_y = Y[:]

        #Initial training set will be choosen at random from pool
        cache_m = CacheManager()
        train_idx = None
        if self._config.load_train and not self._config.balance:
            train_idx = cache_m.load('initial_train.pik')
            if train_idx is not None and info:
                print("[ALTrainer] Using initial training set from {}. This is DANGEROUS. Use the metadata correspondent to the initial set.".format('initial_train.pik'))
            elif train_idx is None and info:
                print("[ALTrainer] No cached initial training set found, selecting a new one at random.")
        elif self._config.load_train and self._config.balance and info:
            #Both options were requested: balancing wins and a new set is drawn
            print("[ALTrainer] Dataset balancing and initial train set loading not possible at the same time.")

        if train_idx is None:
            train_idx = np.random.choice(len(self.pool_x),self._config.init_train,replace=False)
            cache_m.dump(train_idx,'initial_train.pik')

        pool_ar_x = np.asarray(self.pool_x)
        pool_ar_y = np.asarray(self.pool_y)
        self.train_x = pool_ar_x[train_idx]
        self.train_y = pool_ar_y[train_idx]

        #Remove choosen elements from the pool
        self.pool_x = np.delete(pool_ar_x,train_idx)
        self.pool_y = np.delete(pool_ar_y,train_idx)
        del(pool_ar_x)
        del(pool_ar_y)

        #Initial validation set - keeps the same split ratio for train/val as defined in the configuration
        val_samples = int((self._config.init_train*self._config.split[1])/self._config.split[0])
        #At least 100 items, but never more than what is left in the pool
        #(sampling without replacement fails otherwise)
        val_samples = min(max(val_samples,100),self.pool_x.shape[0])
        val_idx = np.random.choice(self.pool_x.shape[0],val_samples,replace=False)
        self.val_x = self.pool_x[val_idx]
        self.val_y = self.pool_y[val_idx]
        self.pool_x = np.delete(self.pool_x,val_idx)
        self.pool_y = np.delete(self.pool_y,val_idx)
Exemple #5
0
    def run(self):
        """
        Coordinates the AL process.

        Loads the model, builds the initial sets and then, for every acquisition step:
        stores the current sets, trains the model, acquires new items and evaluates the
        test set. The loop ends after the last step or as soon as acquire() returns a
        falsy value.
        """
        from keras import backend as K
        import time
        from datetime import timedelta

        #Loaded CNN model and Datasource
        model = self.load_modules()
        self._rex = self._rex.format(model.name)
        #Define initial sets
        self.configure_sets()
        #AL components
        cache_m = CacheManager()
        predictor = Predictor(self._config,keepImg=True)

        #An acquisition function is mandatory
        if self._config.ac_function is None:
            print("You should specify an acquisition function")
            sys.exit(Exitcodes.RUNTIME_ERROR)
        al_module = importlib.import_module('AL','AcquisitionFunctions')
        function = getattr(al_module,self._config.ac_function)

        step_start = None
        for r in range(self._config.acquisition_steps):
            if self._config.info:
                print("[ALTrainer] Starting acquisition step {0}/{1}".format(r+1,self._config.acquisition_steps))
                step_start = time.time()

            #Save current dataset and report partial result (requires multi load for reading)
            fid = 'al-metadata-{1}-r{0}.pik'.format(r,model.name)
            cache_m.registerFile(os.path.join(self._config.logdir,fid),fid)
            current_sets = ((self.train_x,self.train_y),(self.val_x,self.val_y),(self.test_x,self.test_y))
            cache_m.dump(current_sets,fid)

            sw_thread = self.train_model(model,(self.train_x,self.train_y),(self.val_x,self.val_y))

            #Nothing is acquired after the last step
            is_last = r == (self._config.acquisition_steps - 1)
            finished = is_last or not self.acquire(function,model,acquisition=r,sw_thread=sw_thread)
            if finished and self._config.info:
                print("[ALTrainer] No more acquisitions are in order")

            #Some models may take too long to save weights
            if sw_thread is not None and sw_thread.is_alive():
                if self._config.info:
                    print("[ALTrainer] Waiting for model weights...")
                sw_thread.join()

            #Set load_full to false so dropout is disabled
            predictor.run(self.test_x,self.test_y,load_full=False)

            #Attempt to free GPU memory
            K.clear_session()

            if self._config.info:
                elapsed = timedelta(seconds=(time.time()-step_start))
                print("Acquisition step took: {0}".format(elapsed))

            if finished:
                return None
Exemple #6
0
    def run(self):
        """
        Coordinates the AL process for an ensemble of models.

        Loads the model, builds the initial sets and then, for every acquisition step:
        stores the current sets, trains config.emodels ensemble members one after the
        other, acquires new items and evaluates the test set. The loop ends after the
        last step or as soon as acquire() returns a falsy value.
        """
        from keras import backend as K
        import time
        from datetime import timedelta

        #Loaded CNN model and Datasource
        model = self.load_modules()
        self._rex = self._rex.format(model.name)
        #Define initial sets
        self.configure_sets()
        #AL components
        cache_m = CacheManager()
        predictor = Predictor(self._config, keepImg=True, build_ensemble=True)
        function = None

        #The acquisition function is looked up by name in the AL package
        if not self._config.ac_function is None:
            acq = importlib.import_module('AL', 'AcquisitionFunctions')
            function = getattr(acq, self._config.ac_function)
        else:
            print("You should specify an acquisition function")
            sys.exit(Exitcodes.RUNTIME_ERROR)

        stime = None
        etime = None
        end_train = False
        self._initializer(self._config.gpu_count, self._config.cpu_count)

        for r in range(self._config.acquisition_steps):
            if self._config.info:
                print("[EnsembleTrainer] Starting acquisition step {0}/{1}".
                      format(r + 1, self._config.acquisition_steps))
                stime = time.time()

            #Save current dataset and report partial result (requires multi load for reading)
            fid = 'al-metadata-{1}-r{0}.pik'.format(r, model.name)
            cache_m.registerFile(os.path.join(self._config.logdir, fid), fid)
            cache_m.dump(
                ((self.train_x, self.train_y), (self.val_x, self.val_y),
                 (self.test_x, self.test_y)), fid)

            self._print_stats((self.train_x, self.train_y),
                              (self.val_x, self.val_y))
            #Collects whatever train_model returns for each ensemble member
            #(used below as thread objects: is_alive/join)
            sw_thread = None
            for m in range(self._config.emodels):
                #Some models may take too long to save weights
                #Before training member m, wait for the previous member's thread
                if not sw_thread is None:
                    if self._config.info:
                        print("[EnsembleTrainer] Waiting for model weights.",
                              end='')
                    while True:
                        #NOTE(review): pst is reset on every pass, so each poll prints '..'
                        pst = '.'
                        if sw_thread[-1].is_alive():
                            if self._config.info:
                                pst = "{}{}".format(pst, '.')
                                print(pst, end='')
                            #Poll again after at most 60 seconds
                            sw_thread[-1].join(60.0)
                        else:
                            #NOTE(review): this newline is printed even when config.info is off
                            print('')
                            break

                if hasattr(model, 'register_ensemble'):
                    model.register_ensemble(m)
                else:
                    print(
                        "Model not ready for ensembling. Implement register_ensemble method"
                    )
                    raise AttributeError

                if self._config.info:
                    print(
                        "[EnsembleTrainer] Starting model {} training".format(
                            m))

                st = self.train_model(model, (self.train_x, self.train_y),
                                      (self.val_x, self.val_y),
                                      set_session=False,
                                      stats=False,
                                      summary=False,
                                      clear_sess=True,
                                      save_numpy=True)
                if sw_thread is None:
                    sw_thread = [st]
                else:
                    sw_thread.append(st)

            #Nothing is acquired after the last step; a falsy acquire() result also ends the run
            if r == (self._config.acquisition_steps - 1) or not self.acquire(
                    function, model, acquisition=r, sw_thread=sw_thread):
                if self._config.info:
                    print("[ALTrainer] No more acquisitions are in order")
                end_train = True

            #Set load_full to false so dropout is disabled
            predictor.run(self.test_x, self.test_y, load_full=False)

            #Attempt to free GPU memory
            K.clear_session()

            if self._config.info:
                etime = time.time()
                td = timedelta(seconds=(etime - stime))
                print("Acquisition step took: {0}".format(td))

            #The test set is still evaluated for the final step before leaving
            if end_train:
                return None
Exemple #7
0
def ensemble_varratios(pred_model, generator, data_size, **kwargs):
    """
    Variation ratios acquisition function computed over an ensemble of models.

    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference

    Every ensemble member predicts a class for each pool item; the uncertainty of an
    item is 1 - (votes of its most voted class / number of ensemble members).

    Positional arguments:
    pred_model: not read, it is replaced by each ensemble member in turn
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples

    Keyword arguments:
    config: (required) configuration; reads emodels, gpu_count, cpu_count, verbose,
        progressbar, acquire, save_var, debug, info, ac_function and logdir
    model: (required) GenericModel used to build and load each ensemble member
    acquisition <int>: acquisition step used to name output files
        (defaults to config.acquisition_steps)
    sw_thread: list of weight saving threads that must finish before weights are loaded

    Return: indexes of the config.acquire most uncertain items, in descending order
    of uncertainty; None if config or model were not given.
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' not in kwargs:
        return None
    config = kwargs['config']
    emodels = config.emodels
    gpu_count = config.gpu_count
    cpu_count = config.cpu_count
    verbose = config.verbose
    pbar = config.progressbar
    query = config.acquire
    save_var = config.save_var

    #Always bound: it is needed to name the files when save_var is set
    if 'acquisition' in kwargs:
        r = kwargs['acquisition']
    else:
        r = config.acquisition_steps

    if 'model' in kwargs:
        model = kwargs['model']
    else:
        print(
            "[ensemble_varratios] GenericModel is needed by ensemble_varratios. Set model kw argument"
        )
        return None

    sw_thread = kwargs.get('sw_thread')

    fidp = None
    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)
        if config.debug:
            fidp = 'al-probs-{1}-r{0}.pik'.format(r, config.ac_function)
            cache_m.registerFile(os.path.join(config.logdir, fidp), fidp)

    #First column is a placeholder; member predictions are appended as new columns
    All_Dropout_Classes = np.zeros(shape=(data_size, 1))

    #If sw_thread was provided, we should check the availability of model weights
    if sw_thread is not None:
        for k, thread in enumerate(sw_thread):
            if thread is not None and thread.is_alive():
                print(
                    "Waiting ensemble model {} weights' to become available..."
                    .format(k))
                thread.join()

    if pbar:
        members = tqdm(range(emodels),
                       desc="Ensemble member predictions",
                       position=0)
    else:
        if config.info:
            print("Starting Ensemble sampling...")
        members = range(emodels)

    #Keep probabilities for analysis
    all_probs = None
    if config.debug:
        all_probs = np.zeros(shape=(emodels, data_size, generator.classes))

    for d in members:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, emodels))

        model.register_ensemble(d)
        single, parallel = model.build(pre_load=False)

        if hasattr(model, 'get_npweights_cache'):
            spath = model.get_npweights_cache(add_ext=True)
            npfile = True
        else:
            spath = model.get_weights_cache()
            npfile = False

        #NOTE(review): npfile is decided by this second check alone, even if the
        #single tower path above is a numpy file - confirm against _load_model_weights
        if hasattr(model, 'get_npmgpu_weights_cache'):
            ppath = model.get_npmgpu_weights_cache(add_ext=True)
            npfile = True
        else:
            ppath = model.get_mgpu_weights_cache()
            npfile = False

        pred_model = _load_model_weights(config, single, spath, parallel,
                                         ppath, sw_thread, npfile)

        #Keep verbosity in 0 to gain speed
        proba = pred_model.predict_generator(generator,
                                             workers=5 * cpu_count,
                                             max_queue_size=100 * gpu_count,
                                             verbose=0)

        if config.debug:
            all_probs[d] = proba

        dropout_classes = proba.argmax(axis=-1)
        dropout_classes = np.array([dropout_classes]).T
        All_Dropout_Classes = np.append(All_Dropout_Classes,
                                        dropout_classes,
                                        axis=1)

    if verbose > 0:
        print("All dropout {0}:".format(All_Dropout_Classes.shape))
        #Sampling without replacement cannot ask for more rows than available
        n_rows = All_Dropout_Classes.shape[0]
        for i in np.random.choice(n_rows, min(100, n_rows), replace=False):
            print("Predictions for image ({0}): {1}".format(
                i, All_Dropout_Classes[i]))

    Variation = np.zeros(shape=(data_size))

    for t in range(data_size):
        #Skip the placeholder column and count the votes of the most voted class
        votes = All_Dropout_Classes[t, 1:emodels + 1]
        _, count = mode(votes)
        #count is a scalar or a one element array depending on the SciPy version
        Variation[t] = 1 - np.ravel(count)[0] / float(emodels)

    if verbose > 1:
        print("Variation {0}:".format(data_size))
        for i in np.random.choice(data_size, min(100, data_size),
                                  replace=False):
            print("Variation for image ({0}): {1}".format(i, Variation[i]))

    a_1d = Variation.flatten()
    #Descending order first, then cut: a query of 0 selects nothing
    #(slicing with [-0:] would select everything)
    x_pool_index = a_1d.argsort()[::-1][:query]

    if config.debug:
        from .Common import debug_acquisition
        s_expected = generator.returnLabelsFromIndex(x_pool_index)
        #After transposition shape will be (classes,items,emodels)
        s_probs = all_probs[:emodels, x_pool_index].T
        debug_acquisition(s_expected, s_probs, generator.classes, cache_m,
                          config, fidp)

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's variation: {0}".format(a_1d[x_pool_index]))
        print("Maximum variation in pool: {0}".format(a_1d.max()))

    return x_pool_index
Exemple #8
0
def ensemble_bald(pred_model, generator, data_size, **kwargs):
    """
    BALD acquisition function computed over an ensemble of models.

    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference

    The score of an item is the entropy of the average prediction of the ensemble
    minus the average entropy of the individual predictions.

    Positional arguments:
    pred_model: not read, it is replaced by each ensemble member in turn
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples

    Keyword arguments:
    config: (required) configuration; reads emodels, gpu_count, cpu_count, verbose,
        progressbar, acquire, save_var, info, ac_function and logdir
    model: (required) GenericModel used to build and load each ensemble member
    acquisition <int>: acquisition step used to name output files
        (defaults to config.acquisition_steps)
    sw_thread: list of weight saving threads that must finish before weights are loaded

    Return: indexes of the config.acquire items with highest score, in descending
    order; None if config or model were not given.
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    def _entropy(probs):
        #Row-wise entropy in bits. 0*log2(0) is taken as 0: without this, a zero
        #probability produces NaN and NaN scores are sorted as the largest ones
        safe = np.where(probs > 0, probs, 1.0)
        return -np.sum(np.multiply(probs, np.log2(safe)), axis=1)

    if 'config' not in kwargs:
        return None
    config = kwargs['config']
    emodels = config.emodels
    gpu_count = config.gpu_count
    cpu_count = config.cpu_count
    verbose = config.verbose
    pbar = config.progressbar
    query = config.acquire
    save_var = config.save_var

    #Always bound: it is needed to name the file when save_var is set
    if 'acquisition' in kwargs:
        r = kwargs['acquisition']
    else:
        r = config.acquisition_steps

    sw_thread = kwargs.get('sw_thread')

    if 'model' in kwargs:
        model = kwargs['model']
    else:
        print(
            "[ensemble_bald] GenericModel is needed by ensemble_bald. Set model kw argument"
        )
        return None

    #If sw_thread was provided, we should check the availability of model weights
    if sw_thread is not None:
        for k, thread in enumerate(sw_thread):
            if thread is not None and thread.is_alive():
                print(
                    "Waiting ensemble model {} weights' to become available..."
                    .format(k))
                thread.join()

    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)

    #Running sums over the ensemble members
    All_Entropy_Dropout = np.zeros(shape=data_size)
    score_All = np.zeros(shape=(data_size, generator.classes))

    if pbar:
        members = tqdm(range(emodels),
                       desc="Ensemble member predictions",
                       position=0)
    else:
        if config.info:
            print("Starting ensemble sampling...")
        members = range(emodels)

    for d in members:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, emodels))

        model.register_ensemble(d)
        single, parallel = model.build(pre_load=False)

        if hasattr(model, 'get_npweights_cache'):
            spath = model.get_npweights_cache(add_ext=True)
            npfile = True
        else:
            spath = model.get_weights_cache()
            npfile = False

        #NOTE(review): npfile is decided by this second check alone, even if the
        #single tower path above is a numpy file - confirm against _load_model_weights
        if hasattr(model, 'get_npmgpu_weights_cache'):
            ppath = model.get_npmgpu_weights_cache(add_ext=True)
            npfile = True
        else:
            ppath = model.get_mgpu_weights_cache()
            npfile = False

        pred_model = _load_model_weights(config, single, spath, parallel,
                                         ppath, sw_thread, npfile)

        dropout_score = pred_model.predict_generator(generator,
                                                     workers=5 * cpu_count,
                                                     max_queue_size=100 *
                                                     gpu_count,
                                                     verbose=0)
        #computing G_X
        score_All = score_All + dropout_score

        #computing F_X
        All_Entropy_Dropout = All_Entropy_Dropout + _entropy(dropout_score)

    #G_X: entropy of the average prediction
    Avg_Pi = np.divide(score_All, emodels)
    G_X = _entropy(Avg_Pi)

    #F_X: average of the entropies of each member
    F_X = np.divide(All_Entropy_Dropout, emodels)

    U_X = G_X - F_X

    a_1d = U_X.flatten()
    #Descending order first, then cut: a query of 0 selects nothing
    #(slicing with [-0:] would select everything)
    x_pool_index = a_1d.argsort()[::-1][:query]

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's average entropy: {0}".format(
            a_1d[x_pool_index]))
        print("Maximum entropy in pool: {0}".format(a_1d.max()))

    return x_pool_index
Exemple #9
0
def km_uncert(bayesian_model, generator, data_size, **kwargs):
    """
    Cluster in K centroids and extract N samples from each cluster, based on maximum
    uncertainty (as given by the function named in config.un_function).

    Items are first ranked by uncertainty, then distributed in KMeans clusters of their
    extracted features and finally acquired in round-robin from the clusters, most
    uncertain first.

    Positional arguments:
    bayesian_model: forwarded to the uncertainty function
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples

    Keyword arguments:
    config: (required) configuration; reads gpu_count, cpu_count, verbose, progressbar,
        acquire, clusters, un_function, ffeat, recluster, pca, save_var, debug, info,
        logdir and acquisition_steps
    model: (required) GenericModel that provides the weights and the feature extractor
    acquisition <int>: current acquisition step (defaults to config.acquisition_steps)
    sw_thread: thread (or list of threads) saving weights; they must finish before
        weights are loaded. May be None.

    Return: array with the acquired indexes (at most config.acquire items), or None if a
    required argument, the weights or the feature extractor are not available.
    """
    from sklearn.cluster import KMeans
    from sklearn.decomposition import PCA
    import importlib
    import copy
    import time
    from datetime import timedelta
    from Utils import CacheManager

    def _is_file(path):
        #Cache getters may return None (no multi GPU model), which os.path.isfile rejects
        return path is not None and os.path.isfile(path)

    cache_m = CacheManager()

    if 'config' in kwargs:
        config = kwargs['config']
        gpu_count = config.gpu_count
        cpu_count = config.cpu_count
        verbose = config.verbose
        pbar = config.progressbar
        query = config.acquire
        clusters = config.clusters
    else:
        return None

    if 'acquisition' in kwargs:
        acq = kwargs['acquisition']
    else:
        acq = config.acquisition_steps

    if 'model' in kwargs:
        model = kwargs['model']
    else:
        print(
            "[km_uncert] GenericModel is needed by km_uncert. Set model kw argument"
        )
        return None

    ## UNCERTAINTY CALCULATION FIRST
    #Any uncertainty function could be used
    #It runs on a config copy that asks for the whole pool, so every item gets ranked
    n_config = copy.copy(config)
    n_config.acquire = data_size
    kwargs['config'] = n_config
    un_function = getattr(importlib.import_module('AL'), config.un_function)
    un_indexes = un_function(bayesian_model, generator, data_size, **kwargs)
    if un_indexes is None:
        if config.info:
            print("[km_uncert] Uncertainty function returned no indexes")
        return None

    #Models that take to long to save weights might not have finished
    sw_thread = kwargs.get('sw_thread')
    if sw_thread is None:
        if config.info:
            print(
                "[km_uncert] Weights thread not available...trying to load weights"
            )
    elif config.ffeat is None:
        #Callers hand over either a single thread or a list of threads
        if isinstance(sw_thread, (list, tuple)):
            pending = sw_thread
        else:
            pending = [sw_thread]
        for thread in pending:
            if thread is not None and thread.is_alive():
                if config.info:
                    print(
                        "[km_uncert] Waiting for model weights to become available..."
                    )
                thread.join()

    if not _is_file(model.get_weights_cache()) and not _is_file(
            model.get_mgpu_weights_cache()):
        if config.info:
            print("[km_uncert] No trained model or weights file found")
        return None

    if config.recluster > 0 and acq > 0 and (acq % config.recluster) != 0:
        km, acquired = cache_m.load('clusters.pik')
        if config.info:
            print("[km_uncert] Loaded clusters from previous acquisition")
            #TODO: REMOVE
            print("Previous cluster size: {};\nAcquired: {}".format(
                km.labels_.shape, acquired.shape))
        #Items acquired in the previous step are no longer part of the pool
        km.labels_ = np.delete(km.labels_, acquired)
    else:
        #Run feature extraction and clustering
        if hasattr(model, 'build_extractor'):
            single_m, parallel_m = model.build_extractor(training=False,
                                                         feature=True,
                                                         parallel=False)
        else:
            if config.info:
                print(
                    "[km_uncert] Model is not prepared to produce features. No feature extractor"
                )
            return None

        #Model can be loaded from previous acquisition train or from a fixed final model
        use_parallel = gpu_count > 1 and parallel_m is not None
        pred_model = parallel_m if use_parallel else single_m
        if config.ffeat is not None and os.path.isfile(config.ffeat):
            weights_path = config.ffeat
        elif use_parallel:
            weights_path = model.get_mgpu_weights_cache()
        else:
            weights_path = model.get_weights_cache()
        pred_model.load_weights(weights_path, by_name=True)
        if config.info:
            print("Model weights loaded from: {0}".format(weights_path))

        #Extract features for all images in the pool
        if config.info:
            print("Starting feature extraction ({} batches)...".format(
                len(generator)))
        features = pred_model.predict_generator(generator,
                                                workers=4 * cpu_count,
                                                max_queue_size=100 * gpu_count,
                                                verbose=0)
        #One flat feature vector per item
        features = features.reshape(features.shape[0],
                                    np.prod(features.shape[1:]))

        if config.pca > 0:
            if config.info:
                print("Starting PCA decomposition...")

            pca = PCA(n_components=config.pca)
            features = pca.fit_transform(features)

        stime = None
        etime = None
        if config.verbose > 0:
            print("Done extraction...starting KMeans")
            stime = time.time()

        #NOTE(review): recent scikit-learn releases no longer accept n_jobs in
        #KMeans - confirm against the pinned version
        km = KMeans(n_clusters=clusters,
                    init='k-means++',
                    n_jobs=int(cpu_count / 2)).fit(features)

        if config.verbose > 0:
            etime = time.time()
            td = timedelta(seconds=(etime - stime))
            print("KMeans took {}".format(td))

    un_clusters = {k: [] for k in range(config.clusters)}

    #Distributes items in clusters in descending order of uncertainty
    for iid in un_indexes:
        un_clusters[km.labels_[iid]].append(iid)

    #Save clusters
    if config.save_var:
        fid = 'al-clustermetadata-{1}-r{0}.pik'.format(acq, model.name)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)
        cache_m.dump((generator.returnDataAsArray(), un_clusters, un_indexes),
                     fid)

    #If debug
    if config.debug:
        expected = generator.returnLabelsFromIndex()
        for k in range(len(un_clusters)):
            #Integer dtype keeps an empty cluster usable as an index array
            ind = np.asarray(un_clusters[k], dtype=np.intp)
            print("Cluster {}, # of items: {}".format(k, ind.shape[0]))
            #Rank (position in the uncertainty ordering) of the first cluster items
            posa = np.asarray(
                [np.where(un_indexes == item)[0][0] for item in ind[:30]])
            print(
                "Cluster {} first items positions in index array (at most 30): {}"
                .format(k, posa))
            #Check % of items of each class in cluster k
            c_labels = expected[ind]
            unique, count = np.unique(c_labels, return_counts=True)
            l_count = dict(zip(unique, count))
            if len(unique) > 2:
                print("Cluster {} items:".format(k))
                print("\n".join([
                    "label {0}: {1} items".format(key, l_count[key])
                    for key in unique
                ]))
            else:
                #A label may be absent from the cluster, which may even be empty
                zeros = l_count.get(0, 0)
                ones = l_count.get(1, 0)
                total = zeros + ones
                positives = (ones / total) if total > 0 else 0.0
                print(
                    "Cluster {3} labels: {0} are 0; {1} are 1;\n - {2:.2f} are positives"
                    .format(zeros, ones, positives, k))

    #Round-robin over the clusters, taking the most uncertain item left in each one.
    #The target is bounded by what is available, otherwise the loop would never end
    #once every cluster is exhausted.
    available = sum(len(items) for items in un_clusters.values())
    if available < query and verbose > 0:
        print(
            "[km_uncert] Only {} items available, {} requested".format(
                available, query))
    target = min(query, available)

    ac_count = 0
    acquired = []
    j = 0
    while ac_count < target:
        cln = (ac_count + j) % clusters
        q = un_clusters[cln]
        if len(q) > 0:
            acquired.append(q.pop(0))
            ac_count += 1
        else:
            if verbose > 0:
                print(
                    "[km_uncert] Cluster {} exausted, will try to acquire image from cluster {}"
                    .format(cln, (cln + 1) % clusters))
            #Shift the rotation so the next cluster is tried
            j += 1

    acquired = np.asarray(acquired)
    if config.recluster > 0:
        cache_m.dump((km, acquired), 'clusters.pik')

    return acquired
class GenericDS(ABC):
    """
    Generic class for data feeders used to provide training points to Neural Nets.
    """
    def __init__(self, data_path, keepImg=False, config=None, name='Generic'):
        self.path = None
        if isinstance(data_path, str) and os.path.isdir(data_path):
            self.path = data_path
        else:
            raise ValueError(
                "[GenericImage] Path does not correspond to a file ({0}).".
                format(data_path))

        self.X = None
        self.Y = None
        self.name = name
        self.multi_dir = True
        self._cache = CacheManager()
        self._keep = keepImg
        self._cpu_count = config.cpu_count if not config is None else 1
        self._verbose = config.verbose if not config is None else 0
        self._pbar = config.progressbar if not config is None else False
        self._config = config

    @abstractmethod
    def _load_metadata_from_dir(self, d):
        pass

    @abstractmethod
    def change_root(self, imgv, path):
        """
        Check if SegImage instances in imgv are placed in the same base dir as path. If not, change paths.
        """
        pass

    def check_paths(self, imgv, path):

        for s in imgv:
            s.setPath(self.change_root(s.getPath(), path))

    def get_dataset_dimensions(self, X=None):
        """
        Returns the dimensions of the images in the dataset. It's possible to have different image dimensions.
        WARNING: big datasets will take forever to run. For now, checks a sample of the images
        (2% of the items, at least one and at most 5000).
        TODO: Reimplement this function to be fully parallel (threads in case).

        The result is cached in 'data_dims.pik' together with self.name. A cache
        saved under a different name, or one that can not be unpacked, is rebuilt.

        @param X <list>: items exposing getImgDim(); if None, self.X is used

        Return: SORTED list of tuples (# samples,width,height,channels), each one
        being the dataset size followed by the value of getImgDim(). None is
        returned when neither X nor self.X is available.
        """

        cache_m = CacheManager()
        dims = None
        reload_data = True
        if cache_m.checkFileExistence('data_dims.pik'):
            try:
                dims, name = cache_m.load('data_dims.pik')
                reload_data = name != self.name
            except ValueError:
                #Cache content is not a (dims,name) pair: rebuild it
                reload_data = True

        if reload_data:
            dims = set()
            if X is None and self.X is None:
                return None
            elif X is None:
                X = self.X

            samples = len(X)
            if self._config is not None and self._config.info:
                print(
                    "Checking a sample of dataset images for different dimensions..."
                )

            #2% of the items, capped at 5000. At least one item is checked so
            #that small datasets do not produce an empty result
            s_number = max(1, int(0.02 * samples))
            upper_limit = min(s_number, 5000, samples)
            for seg in random.sample(X, upper_limit):
                dims.add((samples, ) + seg.getImgDim())
            cache_m.dump((dims, self.name), 'data_dims.pik')

        return sorted(dims)

    def _run_multiprocess(self, data):
        """
        This method should not be called directly. It's intended
        only for multiprocess metadata loading.
        """
        X, Y = ([], [])
        for item in data:
            t_x, t_y = self._load_metadata_from_dir(item)
            X.extend(t_x)
            Y.extend(t_y)

        return (X, Y)

    def _split_data(self, split, X, Y):
        """
        Split data in at most N sets. Returns a tuple (set1,set2,set3,setN) with the divided
        data
        """
        if sum(split) == 1.0:
            it_count = 0
            split_data = []
            start_idx = 0
            samples = len(X)
            for frac in split:
                it_count = int(frac * samples)
                split_data.append((X[start_idx:start_idx + it_count],
                                   Y[start_idx:start_idx + it_count]))
                start_idx += it_count
            return split_data

        else:
            raise ValueError(
                "[GenericDatasource] Spliting values have to equal 1.0")

    def _run_dir(self, path):

        dlist = []
        files = os.listdir(path)
        X, Y = ([], [])

        if self.multi_dir:
            for f in files:
                item = os.path.join(path, f)
                if os.path.isdir(item):
                    dlist.append(item)

            mdata = multiprocess_run(self._run_multiprocess,
                                     tuple(),
                                     dlist,
                                     self._cpu_count,
                                     self._pbar,
                                     step_size=1,
                                     output_dim=2,
                                     txt_label='directories',
                                     verbose=self._verbose)

        else:
            mdata = self._load_metadata_from_dir(self.path)

        X.extend(mdata[0])  #samples
        Y.extend(mdata[1])  #labels

        X, Y = self._shuffle(X, Y)
        return X, Y

    def _shuffle(self, X, Y):
        #Shuffle samples and labels maintaining relative order
        combined = list(zip(X, Y))
        random.shuffle(combined)
        X[:], Y[:] = zip(*combined)

        return X, Y

    def split_metadata(self, split, data=None):
        """
        Returns all metadata split into N sets, defined by the spliting tuples
        
        @param data <tuple>: (X,Y) if provided, split this sequence. Else, split full metadata
        """
        if data is None:
            return self._split_data(split, self.X, self.Y)
        elif len(data) == 2:
            return self._split_data(split, data[0], data[1])
        else:
            return None

    def load_metadata(self, metadata_file='metadata.pik'):
        """
        Iterates over data patches and creates an instance of a GenericImage subclass for each one
        Returns a tuples of lists (X,Y): X instances of GenericImage subclasses, Y labels;

        OBS: Dataset metadata is shuffled once here. Random sample generation is done during training.
        """

        X, Y = (None, None)
        reload_data = False
        reshuffle = False

        if self._cache.checkFileExistence('split_ratio.pik'):
            split = self._cache.load('split_ratio.pik')
            if self._config.split != split:
                #Dump old data
                reshuffle = True
                if not self.X is None or not self.Y is None:
                    del (self.X)
                    del (self.Y)
                    self.X = None
                    self.Y = None

                if self._config.info:
                    print(
                        "Previous split ratio {} is different from requested one {}. Metadata will be reshuffled."
                        .format(split, self._config.split))

        if self._cache.checkFileExistence(metadata_file) and not reload_data:
            try:
                X, Y, name = self._cache.load(metadata_file)
            except ValueError:
                name = ''
                reload_data = True
            if name != self.name:
                reload_data = True

            if not reload_data and not reshuffle and self._verbose > 0:
                print(
                    "[GenericDatasource] Loaded split data cache. Used previously defined splitting."
                )
        else:
            reload_data = True

        if reshuffle:
            X, Y = self._shuffle(X, Y)

        if reload_data:
            X, Y = self._run_dir(self.path)

        if reload_data or reshuffle:
            self._cache.dump((X, Y, self.name), metadata_file)
            self._cache.dump(tuple(self._config.split), 'split_ratio.pik')

        self.X = X.copy()
        self.Y = Y.copy()
        return X, Y

    def load_data(self, split=None, keepImg=False, data=None):
        """
        Actually reads images and returns data ready for training
        Returns two tuples of NP arrays (X,Y): X data points, Y labels;

        @param split <tuple>: items are spliting fractions

        If a spliting ratio is provided, return a list of tuples of size at most 3:
        1 - Train;
        2 - Validation;
        3 - Test;
        
        @param keepImg <bool>: Keep image data in memory
        @param data <tuple>: metadata defining images to load. If not provided, full dataset is used.
        """

        if data is None and (self.X is None or self.Y is None):
            if self._verbose > 0:
                print("[GenericDatasource] Metadata not ready, loading...")
            self.load_metadata()

        #Which data to use?
        X, Y = None, None
        if data is None:
            X = self.X
            Y = self.Y
        else:
            X, Y = data

        if self._config.pred_size > 0:
            samples = self._config.pred_size
        else:
            samples = len(X)
        y = np.array(Y[:samples], dtype=np.int32)
        if not self._config.tdim is None and len(self._config.tdim) == 2:
            img_dim = tuple(self._config.tdim) + (3, )
        else:
            dataset_dim = self.get_dataset_dimensions(X)[0]
            img_dim = dataset_dim[1:]
        X_data = np.zeros(shape=(samples, ) + img_dim, dtype=np.float32)

        counter = 0
        futures = []

        executor = concurrent.futures.ThreadPoolExecutor(max_workers=7)
        for i in range(samples):
            futures.append(
                executor.submit(X[i].readImage, keepImg, img_dim,
                                self._verbose))

        if self._pbar:
            l = tqdm(desc="Reading images...", total=samples, position=0)
        elif self._config.info:
            print("Reading images...")

        #for future in concurrent.futures.as_completed(futures):
        for i in range(samples):
            X_data[i] = futures[i].result()
            if self._pbar:
                l.update(1)
            elif self._verbose > 0:
                print(".", end='')

        if self._pbar:
            l.close()
        elif self._verbose > 0:
            print('\n')

        if split is None:
            return (X_data, y)
        else:
            return self._split_data(split, X_data, y)

    def sample_metadata(self, k):
        """
        Produces a sample of the full metadata with k items. Returns a cached sample if one exists

        @param k <int>: total of samples
        @param k <float>: percentile of the whole dataset

        Return:
        - tuple (X,Y): X an Y have k elements
        """

        reload_data = False
        s_x, s_y = (None, None)
        if self._cache.checkFileExistence('sampled_metadata.pik'):
            try:
                s_x, s_y, name = self._cache.load('sampled_metadata.pik')
            except ValueError:
                name = ''
                reload_data = True
            if name != self.name:
                reload_data = True

            #Check if we have the desired number of items
            if k <= 1.0:
                k = int(k * len(self.X))
            else:
                k = int(k)
            if k != len(s_x):
                if self._config.info:
                    print(
                        "Saved samples are different from requested ({} x {}). Resampling..."
                        .format(k, len(s_x)))
                reload_data = True

            if not reload_data and self._verbose > 0:
                print(
                    "[GenericDatasource] Loaded split sampled data cache. Used previously defined splitting."
                )
        else:
            reload_data = True

        if reload_data and (self.X is None or self.Y is None):
            if self._config.verbose > 1:
                print("[GenericDatasource] Run load_metadata first!")
            return None

        if reload_data:
            if k <= 1.0:
                k = int(k * len(self.X))
            else:
                k = int(k)

            samples = np.random.choice(range(len(self.X)), k, replace=False)

            s_x = [self.X[s] for s in samples]
            s_y = [self.Y[s] for s in samples]

        #Save last generated sample
        self._cache.dump((s_x, s_y, self.name), 'sampled_metadata.pik')
        return (s_x, s_y)
Exemple #11
0
def bayesian_varratios(pred_model, generator, data_size, **kwargs):
    """
    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference

    The model is run mc_dp times over the generator; for every item the
    variation ratio is 1 - (number of runs that predicted the most voted
    class) / mc_dp. The indexes of the `query` items with the highest ratio
    are returned, highest first.

    Function needs to extract the following configuration parameters:
    model <keras.Model>: model to use for predictions
    generator <keras.Sequence>: data generator for predictions
    data_size <int>: number of data samples
    mc_dp <int>: number of dropout iterations
    cpu_count <int>: number of cpu cores (used to define number of generator workers)
    gpu_count <int>: number of gpus available
    verbose <int>: verbosity level
    pbar <boolean>: user progress bars

    Keyword arguments:
    config: configuration object providing the values above; without it the
        function returns None
    acquisition <int>: acquisition round, used only to name the saved files
        (defaults to 0)
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' not in kwargs:
        return None

    config = kwargs['config']
    mc_dp = config.dropout_steps
    gpu_count = config.gpu_count
    cpu_count = config.cpu_count
    verbose = config.verbose
    pbar = config.progressbar
    query = config.acquire
    save_var = config.save_var

    #Defaulted so that saving results does not fail on an unbound name when
    #the caller gives no 'acquisition'
    r = kwargs.get('acquisition', 0)

    fidp = None
    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)
        if config.debug:
            fidp = 'al-probs-{1}-r{0}.pik'.format(r, config.ac_function)
            cache_m.registerFile(os.path.join(config.logdir, fidp), fidp)

    #Column 0 is a zero placeholder; column d+1 holds the classes predicted in
    #dropout step d. Allocated once instead of growing with np.append
    All_Dropout_Classes = np.zeros(shape=(data_size, mc_dp + 1))

    if pbar:
        l = tqdm(range(mc_dp), desc="MC Dropout", position=0)
    else:
        if config.info:
            print("Starting MC dropout sampling...")
        l = range(mc_dp)

    #Keep probabilities for analysis
    all_probs = None
    if config.debug:
        all_probs = np.zeros(shape=(mc_dp, data_size, generator.classes))

    for d in l:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, mc_dp))

        #Keep verbosity in 0 to gain speed
        proba = pred_model.predict_generator(generator,
                                             workers=5 * cpu_count,
                                             max_queue_size=100 * gpu_count,
                                             verbose=0)

        if config.debug:
            all_probs[d] = proba

        All_Dropout_Classes[:, d + 1] = proba.argmax(axis=-1)

    if verbose > 0:
        print("All dropout {0}:".format(All_Dropout_Classes.shape))
        #Sampling without replacement can not draw more items than the pool has
        shown = min(100, All_Dropout_Classes.shape[0])
        for i in np.random.choice(All_Dropout_Classes.shape[0],
                                  shown,
                                  replace=False):
            print("Predictions for image ({0}): {1}".format(
                i, All_Dropout_Classes[i]))

    #For every item, count the runs that agreed on its most voted class
    votes = All_Dropout_Classes[:, 1:]
    mode_count = np.zeros(shape=(data_size))
    for c in np.unique(votes):
        np.maximum(mode_count, (votes == c).sum(axis=1), out=mode_count)
    Variation = 1 - mode_count / float(mc_dp)

    if verbose > 1:
        print("Variation {0}:".format(data_size))
        for i in np.random.choice(data_size,
                                  min(100, data_size),
                                  replace=False):
            print("Variation for image ({0}): {1}".format(i, Variation[i]))

    a_1d = Variation.flatten()
    #Highest variation first
    x_pool_index = a_1d.argsort()[-query:][::-1]

    if config.debug:
        from .Common import debug_acquisition
        s_expected = generator.returnLabelsFromIndex(x_pool_index)
        #After transposition shape will be (classes,items,mc_dp)
        s_probs = all_probs[:mc_dp, x_pool_index].T
        debug_acquisition(s_expected, s_probs, generator.classes, cache_m,
                          config, fidp)

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's variation: {0}".format(a_1d[x_pool_index]))
        print("Maximum variation in pool: {0}".format(a_1d.max()))

    return x_pool_index
Exemple #12
0
def bayesian_bald(pred_model, generator, data_size, **kwargs):
    """
    Calculation as defined in paper:
    Bayesian convolutional neural networks with Bernoulli approximate variational inference

    The model is run mc_dp times over the generator. For every item the score is
    U_X = G_X - F_X, where G_X is the entropy of the class probabilities
    averaged over the runs and F_X is the average of the entropies of each run.
    The indexes of the `query` items with the highest score are returned,
    highest first.

    pred_model: model exposing predict_generator
    generator: data generator for predictions; generator.classes gives the number of classes
    data_size <int>: number of data samples

    Keyword arguments:
    config: configuration object (dropout_steps, gpu_count, cpu_count, verbose,
        progressbar, acquire, save_var, ...); without it the function returns None
    acquisition <int>: acquisition round, used only to name the saved file
        (defaults to 0)
    """
    from Utils import CacheManager
    cache_m = CacheManager()

    if 'config' not in kwargs:
        return None

    config = kwargs['config']
    mc_dp = config.dropout_steps
    gpu_count = config.gpu_count
    cpu_count = config.cpu_count
    verbose = config.verbose
    pbar = config.progressbar
    query = config.acquire
    save_var = config.save_var

    #Defaulted so that saving results does not fail on an unbound name when
    #the caller gives no 'acquisition'
    r = kwargs.get('acquisition', 0)

    if save_var:
        fid = 'al-uncertainty-{1}-r{0}.pik'.format(r, config.ac_function)
        cache_m.registerFile(os.path.join(config.logdir, fid), fid)

    def entropy_bits(probs):
        #Entropy in bits along the class axis. A probability of exactly 0
        #contributes 0 (the limit of p*log2(p)) instead of NaN; NaN scores
        #would be sorted last by argsort and always be acquired
        with np.errstate(divide='ignore', invalid='ignore'):
            terms = np.where(probs > 0, probs * np.log2(probs), 0.0)
        return -np.sum(terms, axis=1)

    All_Entropy_Dropout = np.zeros(shape=data_size)
    score_All = np.zeros(shape=(data_size, generator.classes))

    if pbar:
        l = tqdm(range(mc_dp), desc="MC Dropout", position=0)
    else:
        if config.info:
            print("Starting MC dropout sampling...")
        l = range(mc_dp)

    for d in l:
        if not pbar and config.info:
            print("Step {0}/{1}".format(d + 1, mc_dp))

        dropout_score = pred_model.predict_generator(generator,
                                                     workers=5 * cpu_count,
                                                     max_queue_size=100 *
                                                     gpu_count,
                                                     verbose=0)
        #computing G_X
        score_All = score_All + dropout_score

        #computing F_X
        All_Entropy_Dropout = All_Entropy_Dropout + entropy_bits(dropout_score)

    #G_X: entropy of the probabilities averaged over all dropout steps
    Avg_Pi = np.divide(score_All, mc_dp)
    G_X = entropy_bits(Avg_Pi)

    #F_X: average of the per-step entropies
    F_X = np.divide(All_Entropy_Dropout, mc_dp)

    U_X = G_X - F_X

    #Highest score first
    a_1d = U_X.flatten()
    x_pool_index = a_1d.argsort()[-query:][::-1]

    if save_var:
        cache_m.dump((x_pool_index, a_1d), fid)

    if verbose > 0:
        #print("Selected item indexes: {0}".format(x_pool_index))
        print("Selected item's average entropy: {0}".format(
            a_1d[x_pool_index]))
        print("Maximum entropy in pool: {0}".format(a_1d.max()))

    return x_pool_index
Exemple #13
0
    def run_test(self, model, x_test=None, y_test=None, load_full=True):
        """
        This should be executed after a model has been trained

        Selects the test set, reads its images, rebuilds the prediction model
        from the saved model or weights, predicts batch by batch, saves
        (expected, predicted probabilities, number of classes) in
        'test_pred.pik' and prints the metrics.

        @param model: model wrapper exposing get_model_cache, get_weights_cache,
            build and, for ensembles, build_ensemble
        @param x_test, y_test: test metadata and labels. If either is None, the
            test set comes from config.testdir when set, otherwise from the
            last set of the saved split (sampled first if config.sample < 1.0)
        @param load_full <bool>: try the full saved model before the weights file

        Return: Exitcodes.RUNTIME_ERROR if no split ratio was saved; None otherwise
        (including when no model or weights could be loaded)
        """

        cache_m = CacheManager()
        split = None
        if os.path.isfile(cache_m.fileLocation('split_ratio.pik')):
            split = cache_m.load('split_ratio.pik')
        else:
            print(
                "[Predictor] A previously trained model and dataset should exist. No previously defined spliting found."
            )
            return Exitcodes.RUNTIME_ERROR

        #Priority is for given data as parameters. If None is given, try to load metadata as configured
        if x_test is None or y_test is None:
            if self._config.testdir is None:
                #Load sampled data if required by command line
                if self._config.sample < 1.0:
                    _, _, (x_test, y_test) = self._ds.split_metadata(
                        split=split,
                        data=self._ds.sample_metadata(self._config.sample))
                else:
                    _, _, (x_test, y_test) = self._ds.split_metadata(split)
            else:
                x_test, y_test = self._ds._run_dir(self._config.testdir)

        if self._config.verbose > 0:
            unique, count = np.unique(y_test, return_counts=True)
            l_count = dict(zip(unique, count))
            if len(unique) > 2:
                print("Test items:")
                print("\n".join([
                    "label {0}: {1} items".format(key, l_count[key])
                    for key in unique
                ]))
            else:
                #Either class may be absent from the test set
                zeros = l_count.get(0, 0)
                ones = l_count.get(1, 0)
                total = zeros + ones
                print(
                    "Test labels: {0} are 0; {1} are 1;\n - {2:.2f} are positives"
                    .format(zeros, ones, (ones / total) if total else 0.0))
            print("Test set: {} items".format(len(y_test)))

        X, Y = self._ds.load_data(data=(x_test, y_test), keepImg=self._keep)
        if self._config.verbose > 1:
            print("Y original ({1}):\n{0}".format(Y, Y.shape))
        Y = to_categorical(Y, self._ds.nclasses)

        # session setup
        sess = K.get_session()
        ses_config = tf.ConfigProto(
            device_count={
                "CPU": self._config.cpu_count,
                "GPU": self._config.gpu_count
            },
            intra_op_parallelism_threads=self._config.cpu_count
            if self._config.gpu_count == 0 else self._config.gpu_count,
            inter_op_parallelism_threads=self._config.cpu_count
            if self._config.gpu_count == 0 else self._config.gpu_count,
            log_device_placement=True if self._verbose > 1 else False)
        sess.config = ses_config
        K.set_session(sess)

        #During test phase multi-gpu mode is not used (maybe done latter)
        if self._ensemble:
            #Weights should be loaded during ensemble build
            if hasattr(model, 'build_ensemble'):
                pred_model = model.build_ensemble(training=False, npfile=True)
            else:
                if self._config.info:
                    print(
                        '[Predictor] Model not prepared to build ensembles, implement or choose other model'
                    )
                return None
        elif load_full and os.path.isfile(model.get_model_cache()):
            try:
                pred_model = load_model(model.get_model_cache())
                if self._config.info:
                    print("Model loaded from: {0}".format(
                        model.get_model_cache()))
            except ValueError:
                #Full model could not be loaded: rebuild it and load only the weights
                pred_model, _ = model.build(training=False, pre_load_w=False)
                pred_model.load_weights(model.get_weights_cache())
                if self._config.info:
                    print("Model weights loaded from: {0}".format(
                        model.get_weights_cache()))
        elif os.path.isfile(model.get_weights_cache()):
            pred_model, _ = model.build(training=False, pre_load_w=False)
            pred_model.load_weights(model.get_weights_cache())
            if self._config.info:
                print("Model weights loaded from: {0}".format(
                    model.get_weights_cache()))

        else:
            if self._config.info:
                print("No trained model or weights file found")
            return None

        bsize = self._config.batch_size
        #Number of batches needed to cover X. A real ceiling is used because
        #round(n + 0.5) rounds halves to the even integer and asks for one
        #batch too many whenever len(X)/bsize is an odd whole number
        stp = int(np.ceil(len(X) / bsize))

        image_generator = ImageDataGenerator(
            samplewise_center=self._config.batch_norm,
            samplewise_std_normalization=self._config.batch_norm)

        if self._ensemble:
            if self._config.tdim is not None:
                fix_dim = self._config.tdim
            else:
                fix_dim = self._ds.get_dataset_dimensions()[0][
                    1:]  #Only smallest image dimensions matter here
            test_generator = SingleGenerator(
                dps=(X, Y),
                classes=self._ds.nclasses,
                dim=fix_dim,
                batch_size=self._config.batch_size,
                image_generator=image_generator,
                extra_aug=self._config.augment,
                shuffle=False,
                verbose=self._verbose,
                input_n=self._config.emodels)
        else:
            test_generator = image_generator.flow(x=X,
                                                  y=Y,
                                                  batch_size=bsize,
                                                  shuffle=False)

        if self._config.progressbar:
            l = tqdm(desc="Making predictions...", total=stp)

        Y_pred = np.zeros((len(X), self._ds.nclasses), dtype=np.float32)
        for i in range(stp):
            start_idx = i * bsize
            example = test_generator.next()
            Y_pred[start_idx:start_idx + bsize] = pred_model.predict_on_batch(
                example[0])
            if self._config.progressbar:
                l.update(1)
            elif self._config.info:
                print("Batch prediction ({0}/{1})".format(i, stp))
            if self._config.verbose > 1:
                #Sanity check: generator labels should follow the datasource order
                if not np.array_equal(Y[start_idx:start_idx + bsize],
                                      example[1]):
                    print(
                        "Datasource label ({0}) and batch label ({1}) differ".
                        format(Y[start_idx:start_idx + bsize], example[1]))

        del (X)
        del (test_generator)

        if self._config.progressbar:
            l.close()

        y_pred = np.argmax(Y_pred, axis=1)
        expected = np.argmax(Y, axis=1)

        if self._config.verbose > 0:
            if self._config.verbose > 1:
                np.set_printoptions(threshold=np.inf)
                print("Predicted probs ({1}):\n{0}".format(
                    Y_pred, Y_pred.shape))
            #print("Y ({1}):\n{0}".format(Y,Y.shape))
            print("expected ({1}):\n{0}".format(expected, expected.shape))
            print("Predicted ({1}):\n{0}".format(y_pred, y_pred.shape))

        #Save predictions
        cache_m.dump((expected, Y_pred, self._ds.nclasses), 'test_pred.pik')

        #Output metrics
        print_prediction(self._config)