Example #1
 def run_N(self,
           nb_execution=10,
           loop=100,
           grphq=False,
           pas=10,
           duration_gif=0.5):
     """
     Exécute N itération de l'algorithme des k-means, et conserve les centres qui produisent le moins d'erreur.
     Chaque itération est produite à partir de centres initiaux aléatoires, donc les résultats sont différents à chaque fois.
     Retourne cette erreur minimale.
     Les paramètres d'entrée sont les même que pour run, avec l'ajout de :
         nb_execution : entier désignant le nombre de calcul de k-means à faire.
     """
     f = partial(self.__k_run, loop=loop, grphq=grphq, pas=pas)
     pool = Pool(self.cpu)
     memory = list(pool.uimap(f, range(nb_execution)))
     pool.close()
     pool.join()
     ind = np.argmin(np.array([m[0] for m in memory]))
     means = memory[ind][1]
     self.means = means
     self.calc_grp()
     if grphq: self.grphq.create_gif(duration=duration_gif)
     del pool
     return memory[ind][0]
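The run-N-times-and-keep-the-best pattern above is easy to isolate. Below is a minimal, self-contained sketch of the same idea (one_run, best_of_n and the quadratic "error" are illustrative stand-ins, not part of the method above):

from functools import partial
import random

import numpy as np
from pathos.pools import ProcessPool as Pool

def one_run(seed, loop=100):
    # toy stand-in for a single k-means run: returns (error, centers)
    rng = random.Random(seed)
    center = rng.uniform(-10, 10)
    return center ** 2, center   # pretend a center near 0 is better

def best_of_n(nb_execution=10, loop=100, ncpus=4):
    f = partial(one_run, loop=loop)
    pool = Pool(ncpus)
    # uimap returns results in completion order, which is fine here:
    # each (error, centers) pair stays together, and we only want the argmin
    memory = list(pool.uimap(f, range(nb_execution)))
    pool.close()
    pool.join()
    ind = np.argmin(np.array([m[0] for m in memory]))
    return memory[ind]           # (smallest error, its centers)

if __name__ == '__main__':
    print(best_of_n())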
Example #2
def test_pathos_pp_callable () :
    """Test parallel processnig with pathos: ParallelPool  
    """
    logger = getLogger("ostap.test_pathos_pp_callable")         
    if not pathos :
        logger.error ( "pathos is not available" )
        return
    
    logger.info ('Test job submission with %s' %  pathos ) 
    
    if DILL_PY3_issue : 
        logger.warning ("test is disabled (DILL/ROOT/PY3 issue)" )
        return

    from pathos.helpers import cpu_count
    ncpus = cpu_count  ()
    
    from pathos.pools import ParallelPool as Pool 

    pool = Pool ( ncpus )   
    logger.info ( "Pool is %s" %  ( type ( pool ).__name__ ) )

    pool.restart ( True ) 


    mh   = MakeHisto() 
    jobs = pool.uimap ( mh.process , list ( enumerate ( inputs ) ) )
    
    result = None 
    for h in progress_bar ( jobs , max_value = len ( inputs ) ) :
        if not result  : result = h
        else           : result.Add ( h )

    pool.close ()
    pool.join  ()
    pool.clear ()
    
    logger.info ( "Histogram is %s" % result.dump ( 80 , 10 )  )
    logger.info ( "Entries  %s/%s" % ( result.GetEntries() , sum ( inputs ) ) ) 
    
    with wait ( 1 ) , use_canvas ( 'test_pathos_pp_callable' ) : 
        result.draw (   ) 

    return result 
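The ParallelPool lifecycle used above (restart, uimap over a bound method, then close/join/clear) can be reproduced in a few lines. A sketch under the assumption that the code lives in a script rather than an interactive session, since ParallelPool ships code by source extraction; Squarer is an illustrative stand-in for MakeHisto:

from pathos.pools import ParallelPool

class Squarer(object):
    # stand-in for MakeHisto: the bound method below is what gets shipped to workers
    def process(self, pair):
        i, n = pair
        return n * n

if __name__ == '__main__':
    inputs = list(range(10))
    pool = ParallelPool(4)
    pool.restart(True)              # make sure the (possibly cached) pool is alive
    sq = Squarer()
    jobs = pool.uimap(sq.process, list(enumerate(inputs)))
    total = sum(jobs)               # uimap yields results as they complete
    pool.close()
    pool.join()
    pool.clear()                    # drop the pool from pathos' internal cache
    print(total)                    # 285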
Example #3
def test_pathos_mp_function () :
    """Test parallel processnig with pathos: ProcessPool
    """
    logger = getLogger("ostap.test_pathos_mp_function")
    if not pathos :
        logger.error ( "pathos is not available" )
        return 
    
    logger.info ('Test job submission with %s' %  pathos ) 
    
    if DILL_PY3_issue : 
        logger.warning ("test is disabled (DILL/ROOT/PY3 issue)" )
        return
    
    from pathos.helpers import cpu_count
    ncpus = cpu_count  ()
    
    from pathos.pools import ProcessPool as Pool

    pool = Pool ( ncpus )
    logger.info ( "Pool is %s" % ( type ( pool ).__name__ ) )

    with pool_context   ( pool ) : 
        
        jobs = pool.uimap ( make_histo ,  zip ( count() , inputs ) )
        
        result = None 
        for h in progress_bar ( jobs , max_value = len ( inputs ) ) :
            if not result  : result = h
            else           : result.Add ( h )
                
    logger.info ( "Histogram is %s" % result.dump ( 80 , 10 )  )
    logger.info ( "Entries  %s/%s" % ( result.GetEntries() , sum ( inputs ) ) ) 
    
    with wait ( 1 ) , use_canvas ( 'test_pathos_mp_function' ) : 
        result.draw (   ) 

    return result 
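The pool_context helper is ostap's, not pathos'. A rough stand-alone equivalent of the bookkeeping it appears to perform (close, join and clear on exit; this is an assumption, not ostap's actual implementation) looks like this, with make_value as a toy stand-in for make_histo:

from contextlib import contextmanager
from itertools import count

from pathos.pools import ProcessPool as Pool

@contextmanager
def pool_context(pool):
    # assumed behavior: hand out the pool, then shut it down cleanly on exit
    try:
        yield pool
    finally:
        pool.close()
        pool.join()
        pool.clear()

def make_value(arg):
    # toy stand-in for the test's make_histo: just return the value
    i, n = arg
    return n

if __name__ == '__main__':
    inputs = [3, 1, 4, 1, 5]
    with pool_context(Pool(2)) as pool:
        # consume the uimap iterator before the pool is shut down
        jobs = pool.uimap(make_value, zip(count(), inputs))
        result = sum(jobs)
    print(result, sum(inputs))      # both 14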
Example #4
class ReMap(ReIterBase):
    def __init__(self,
                 fn,
                 iterable_input,
                 proc_type=None,
                 n_proc=1,
                 per_proc_buffer=1,
                 ordered=True,
                 name='reMap',
                 verbose=True):
        """
        This is a map function that can be iterated over more than once. Returns an iterator.

        Parameters
        ----------
        fn
            function applied to each item of the input
        iterable_input
            iterable input

        proc_type
            if 'sub' then uses a pathos ProcessPool to map the function
            if 'thread' then uses the standard multiprocessing ThreadPool
            else uses the regular built-in map

        n_proc
            number of workers in a pool (ignored if no pool)

        per_proc_buffer
            since a pool's map consumes its whole input, items are buffered and the
            buffer is flushed (mapped and yielded) after this many tasks per process/thread

        ordered
            use an ordered map by default; uses `imap_unordered`/`uimap` otherwise

        name
            name to use for logging messages

        verbose
            passed through to ReIterBase to control logging
        """
        name += '' if proc_type not in ('sub', 'proc', 'subprocess', 'th',
                                        'thread') else ' ' + proc_type
        super().__init__(iterable_input=iterable_input,
                         name=name,
                         verbose=verbose)
        self.fn = fn
        self.proc_type = proc_type
        self.per_proc_buffer = per_proc_buffer
        self.n_proc = n_proc
        self.ordered = ordered

    def _iter(self):
        if self.proc_type in ('thread', 'th') and self.n_proc > 0:
            with ThreadPool(self.n_proc) as p:
                # workaround to limit consumption of the input iterator (adapted from Stack Overflow)
                buff = []
                for itm in self.iterable_input:
                    buff.append(itm)
                    if len(buff) >= self.per_proc_buffer * self.n_proc:

                        if self.ordered:
                            for itm in p.imap(self.fn, buff):
                                yield itm
                        else:
                            for itm in p.imap_unordered(self.fn, buff):
                                yield itm
                        buff = []

                # feed the remaining buffer after input is exhausted
                if self.ordered:
                    for itm in p.imap(self.fn, buff):
                        yield itm
                else:
                    for itm in p.imap_unordered(self.fn, buff):
                        yield itm

        elif self.proc_type in ('sub', 'proc',
                                'subprocess') and self.n_proc > 0:
            try:
                log.info("Trying to terminate previous pool")
                # pathos caches pool instances, so the previous pool must be terminated and cleared first
                self.pool.terminate()
                self.pool.clear()
                log.info("Yay! Cleared previous process pool")
            except AttributeError:
                log.warning("Is this the first time creating a pool...")

            self.pool = ProcessPool(nodes=self.n_proc)

            # workaround to limit consumption of the input iterator (adapted from Stack Overflow)
            buff = []
            for itm in self.iterable_input:
                buff.append(itm)
                if len(buff) >= self.per_proc_buffer * self.n_proc:
                    if self.ordered:
                        for itm in self.pool.imap(self.fn, buff):
                            yield itm
                    else:
                        for itm in self.pool.uimap(self.fn, buff):
                            yield itm
                    buff = []

            # feed the remaining buffer after input is exhausted
            if self.ordered:
                for itm in self.pool.imap(self.fn, buff):
                    yield itm
            else:
                for itm in self.pool.uimap(self.fn, buff):
                    yield itm

        else:
            for itm in map(self.fn, self.iterable_input):
                yield itm
Example #5
 def run_global_automated(self,
                          grphq=False,
                          duration_gif=0.5,
                          pas=1,
                          B=10,
                          loop=100):
     """
     Implémentation modifiée de run_global où le choix du nombre de cluster est déterminé par des statistiques calculés au fur et à mesure.
     Les paramètres sont : grphq, duration_gif, pas, B, loop et correspondent aux définitions évoqués dans run_global.
     Paramètre de sortie : instance idéal de Kmeans.
     """
     # the pool is created and immediately closed; it is restarted before each
     # batch of jobs inside the loop below (pathos pools are reusable)
     pool = Pool(self.cpu)
     pool.close()
     pool.join()
     mini, maxi = np.min(self.data.data, axis=0), np.max(self.data.data,
                                                         axis=0)
     shape = self.data.data.shape
     i = 1
     self.set_nb_cluster(i)
     self.choose_means_initiate()
     self.calc_grp()
     self.choose_means()
     self.calc_grp()
     means = self.means
     if grphq:
         self.grphq.plot_graph(self.data.data, self.grp,
                               self.means.reshape((1, -1)), 1)
     self.print_meta_data()
     gap, var = self.gap_stat_mono(self.error, i, mini, maxi, shape, pool,
                                   B)
     cond = True
     km_cpy = self.copy(erase_dir=False)
     print("Fin de l'étape {}".format(i))
     while cond:
         i += 1
         self.set_nb_cluster(i)
         pool.restart()
         f = partial(self.__multi_j, loop=loop, means=means)
         s = pool.uimap(f, range(0, self.L, pas))
         pool.close()
         pool.join()
         s = np.array(list(s))
         arg = np.argmin(s[:, 1])
         j = int(s[arg, 0])
         means_cpy = np.vstack((means, self.data.data[j]))
         self.means = means_cpy
         k = 0
         backup = (None, None, -1, -1)
         self.calc_grp()
         while (self.cond_conv(backup)) and (k < loop):
             k += 1
             backup = self.backup_metadata()
             self.choose_means()
             if ((self.choose_means != self.choose_means_moy_true)
                     and (self.choose_means != self.choose_means_med_true)):
                 self.calc_grp()
             self.migration = np.count_nonzero(
                 (self.grp[:, 1] - backup[1][:, 1]))
             self.same_means = np.array_equal(self.means, backup[0])
         means = self.means
         gap_f, var_f = self.gap_stat_mono(self.error, i, mini, maxi, shape,
                                           pool, B)
         diff = gap - (gap_f - var_f)
         print("Gap statistical (étape {}) : {}".format(i - 1, diff))
         if grphq:
             self.grphq.plot_graph(self.data.data, self.grp, self.means, i)
         self.print_meta_data()
         print("Fin de l'étape {}".format(i))
         if diff >= 0:
             break
         else:
             gap = gap_f
             km_cpy = self.copy(erase_dir=False)
     if grphq: self.grphq.create_gif(duration=duration_gif)
     # note: rebinding self only changes the local name; callers get the
     # optimal instance through the return value below
     self = km_cpy.copy(erase_dir=False)
     self.calc_grp()
     print("The optimal number of clusters is: {}".format(self.nb_cluster))
     del pool
     return self
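The stopping test diff = gap - (gap_f - var_f) >= 0 is the gap-statistic rule of Tibshirani et al.: keep the smallest k such that Gap(k) >= Gap(k+1) - s_{k+1}. A minimal sketch of the rule in isolation, assuming the gaps and s lists have already been computed (e.g. by gap_stat_mono):

def optimal_k(gaps, s):
    # gaps[i] = Gap(i+1) and s[i] = s_(i+1) for cluster counts 1..len(gaps)
    # return the smallest k with Gap(k) >= Gap(k+1) - s_(k+1)
    for i in range(len(gaps) - 1):
        if gaps[i] >= gaps[i + 1] - s[i + 1]:
            return i + 1
    return len(gaps)

# e.g. optimal_k([0.20, 0.50, 0.48], [0.02, 0.03, 0.03]) returns 2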
Example #6
 def run_global(self,
                loop=100,
                grphq=False,
                duration_gif=0.5,
                pas=1,
                choose_nb_graph=False,
                B=10):
     """
     Implémente l'algorithme des global k-means qui calcule incrémentalement 
     la configuration optimale des groupes pour un nombre de clusters donnée.
     L'algotrithme procède comme suit :
         0) On définit le nombre cluster à 1 et on calcule le centre de la matrice de données.
         1) On incrément le nombre de cluster.
         On définit comme centre les centres de l'étape précédente.
         On définit successivement chaque individu de la matrice de données comme dernier centre, on exécute l'algorithme du
             k-means avec chaque lot de centres et on garde le lot de centre qui minimise l'erreur.
         i+1) On réitère l'étape précédente jusqu'à obtenir le bon nombre de groupe.
     !!! Très gourmand en ressources.
     
     Paramètres d'entrée :
         loop : entier définissant le nombre d'itérations au sein des calcule de k-means avant arrêt du calcul, défaut = 100
         grphq : boolean indiquant si les graphes doivent être affichés et enregistrés.
         duration_gif : réel qui caractérise la durée de chaque image dans la production du gif final, inutile si grphq = False
         pas : entier qui détermine l'écart entre chaque individu à tester pour le choix des individus comme centre.
         choose_nb_graph : boolean, affiche un lot de statistiques qui permettent de déterminer le nombre idéal de clusters.
         B : entier qui qui entre en jeu dans le calcul des statistiques évoquées précédemment.
     Paramètre de sortie :
         err : erreur de classification pour le nombre de cluster choisi.
     """
     # the pool is created and immediately closed; it is restarted before each
     # batch of jobs inside the loop below (pathos pools are reusable)
     pool = Pool(self.cpu)
     pool.close()
     pool.join()
     err = []
     n = self.nb_cluster
     self.set_nb_cluster(1)
     self.choose_means_initiate()
     self.calc_grp()
     self.choose_means()
     self.calc_grp()
     means = self.means
     err.append([1, self.error, self.clustering_error_rel(), self.var])
     if grphq:
         self.grphq.plot_graph(self.data.data, self.grp,
                               self.means.reshape((1, -1)), 1)
     self.print_meta_data()
     print("Fin de l'étape {}".format(1))
     for i in range(2, n + 1):
         self.set_nb_cluster(i)
         pool.restart()
         f = partial(self.__multi_j, loop=loop, means=means)
         s = pool.uimap(f, range(0, self.L, pas))
         pool.close()
         pool.join()
         s = np.array(list(s))
         arg = np.argmin(s[:, 1])
         j = int(s[arg, 0])
         means_cpy = np.vstack((means, self.data.data[j]))
         self.means = means_cpy
         k = 0
         backup = (None, None, -1, -1)
         self.calc_grp()
         while (self.cond_conv(backup)) and (k < loop):
             k += 1
             backup = self.backup_metadata()
             self.choose_means()
             if ((self.choose_means != self.choose_means_moy_true)
                     and (self.choose_means != self.choose_means_med_true)):
                 self.calc_grp()
             self.migration = np.count_nonzero(
                 (self.grp[:, 1] - backup[1][:, 1]))
             self.same_means = np.array_equal(self.means, backup[0])
         means = self.means
         if grphq:
             self.grphq.plot_graph(self.data.data, self.grp, self.means, i)
         self.print_meta_data()
         err.append([i, self.error, self.clustering_error_rel(), self.var])
         print("Fin de l'étape {}".format(i))
     err = np.array(err)
     err = err[np.argsort(err[:, 0]), :].T
     if grphq: self.grphq.create_gif(duration=duration_gif)
     if choose_nb_graph:
         self.grphq.plot_crb_err_cluster(self.gap_stat(err, B))
     del pool
     return err
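The inner step of global k-means, trying every pas-th data point as the extra center and scoring the candidates in parallel, can also be sketched on its own. score_candidate and the random data below are toy stand-ins for __multi_j and self.data.data:

from functools import partial

import numpy as np
from pathos.pools import ProcessPool as Pool

def score_candidate(j, data, means):
    # toy stand-in for __multi_j: distortion after adding data[j] as a center
    centers = np.vstack((means, data[j]))
    d = np.linalg.norm(data[:, None, :] - centers[None, :, :], axis=2)
    return j, float(np.sum(d.min(axis=1) ** 2))

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    data = rng.normal(size=(200, 2))
    means = data[:1]                  # centers carried over from the previous step
    pool = Pool(2)
    f = partial(score_candidate, data=data, means=means)
    s = np.array(list(pool.uimap(f, range(0, len(data), 5))))
    pool.close()
    pool.join()
    best_j = int(s[np.argmin(s[:, 1]), 0])
    print("best candidate index:", best_j)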