Ejemplo n.º 1
0
                    sync_every=args.sync_every,
                    worker_optimizer=args.worker_optimizer,
                    worker_optimizer_params=args.worker_optimizer_params)
    # When args.restore is set, hand it to algo.load() before the manager is built.
    if args.restore:
        algo.load(args.restore)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    # All settings other than comm/data/algo/model_builder and the two file
    # lists are taken directly from the parsed `args` namespace.
    manager = MPIManager(comm=comm,
                         data=data,
                         algo=algo,
                         model_builder=model_builder,
                         num_epochs=args.epochs,
                         train_list=train_list,
                         val_list=val_list,
                         num_masters=args.masters,
                         num_processes=args.processes,
                         synchronous=args.synchronous,
                         verbose=args.verbose,
                         monitor=args.monitor,
                         early_stopping=args.early_stopping,
                         target_metric=args.target_metric,
                         thread_validation=args.thread_validation,
                         checkpoint=args.checkpoint,
                         checkpoint_interval=args.checkpoint_interval)

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        # Log the algorithm configuration at debug level (lazy %-formatting).
        logging.debug('Training configuration: %s', algo.get_config())

        # t_0 is the timestamp taken immediately before train() is called.
        t_0 = time()
        histories = manager.process.train()
Ejemplo n.º 2
0
        algo = Algo(args.optimizer,
                    loss=args.loss,
                    validate_every=validate_every,
                    sync_every=args.sync_every,
                    worker_optimizer=args.worker_optimizer)
    # When args.restore is set, hand it to algo.load() before the manager is built.
    if args.restore:
        algo.load(args.restore)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager(comm=comm,
                         data=data,
                         algo=algo,
                         model_builder=model_builder,
                         num_epochs=args.epochs,
                         train_list=train_list,
                         val_list=val_list,
                         num_masters=args.masters,
                         synchronous=args.synchronous,
                         verbose=args.verbose,
                         monitor=args.monitor,
                         early_stopping=args.early_stopping,
                         target_metric=args.target_metric)

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        print(algo)

        # Time the call to train(); delta_t is the elapsed time as measured by time().
        t_0 = time()
        histories = manager.process.train()
        delta_t = time() - t_0
        # free_comms() is called only after train() has returned.
        manager.free_comms()
Ejemplo n.º 3
0
    def _execute_MPI(self,
                    comm=None,
                    archiveTraining=True,
                    archiveValidation=True,
                    verbose=1):
        """Run distributed training of this trial through mpi_learn.

        Resolves the training and validation inputs to file paths, builds the
        ``H5Data``/``Algo``/``ModelFromJson``/``MPIManager`` objects and, on
        rank 0 only, records the dataset sizes and launches training.

        Args:
            comm: MPI communicator to use. When ``None`` a duplicate of
                ``MPI.COMM_WORLD`` is created.
            archiveTraining: Not read by this method.
            archiveValidation: Not read by this method.
            verbose: Forwarded to ``self._generateCallbacks``.

        Raises:
            ValueError: If ``train_procedure`` or ``val_procedure`` is not a
                list, or if a custom object cannot be imported.
            IOError: If a training/validation entry is neither a
                ``DataProcedure`` nor the path of an existing file.
        """
        from mpi4py import MPI
        from mpi_learn.mpi.manager import MPIManager, get_device
        from mpi_learn.train.algo import Algo
        from mpi_learn.train.data import H5Data
        from mpi_learn.train.model import ModelFromJson

        if comm is None:
            comm = MPI.COMM_WORLD.Dup()

        if not isinstance(self.train_procedure, list):
            raise ValueError("Trial attribute train_procedure: expected list of DataProcedures or paths but got type %r" % type(self.train_procedure))
        if not isinstance(self.val_procedure, list):
            raise ValueError("Trial attribute val_procedure: expected list of DataProcedures or paths but got type %r" % type(self.val_procedure))

        train = [DataProcedure.from_json(self.archive_dir, x) if isinstance(x, DataProcedure) else str(x)
                 for x in self.train_procedure]
        val = [DataProcedure.from_json(self.archive_dir, x) if isinstance(x, DataProcedure) else str(x)
               for x in self.val_procedure]

        batchAssertArchived(train)
        batchAssertArchived(val)

        def resolve_path(entry):
            """Return the HDF5 file path for a DataProcedure or an existing file path."""
            if isinstance(entry, DataProcedure):
                # Use the entry being resolved; the previous code referenced an
                # unrelated name (`dp`) that is not defined inside this helper.
                return entry.get_path() + "archive.h5"
            if os.path.isfile(entry):
                return entry
            raise IOError("Cannot find %r" % entry)

        train_list = [resolve_path(entry) for entry in train]
        # Each validation entry is resolved individually; the previous code
        # iterated over `val` but passed a stale variable from the training loop.
        val_list = [resolve_path(entry) for entry in val]

        # There is an issue when multiple processes import Keras simultaneously --
        # the file .keras/keras.json is sometimes not read correctly.
        # As a workaround, just try several times to import keras.
        # Note: importing keras imports theano --
        # impossible to change GPU choice after this.
        for try_num in range(10):
            try:
                from keras.models import model_from_json
                import keras.callbacks as cbks
                break
            except ValueError:
                print("Unable to import keras. Trying again: %d" % try_num)
                sleep(0.1)

        # Map each custom object name to the attribute of the same name in its module.
        custom_objects = {}
        for name, module in self.custom_objects.items():
            try:
                custom_objects[name] = getattr(importlib.import_module(module), name)
            except Exception:
                # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit
                # are not converted into a ValueError.
                raise ValueError("Custom Object %r does not exist in %r. \
                    For best results Custom Objects should be importable and not locally defined." % (str(name), str(module)))

        # We initialize the Data object with the training data list
        # so that we can use it to count the number of training examples
        data = H5Data(batch_size=self.batch_size,
                      features_name=self.features_name, labels_name=self.labels_name)
        data.set_file_names(train_list)
        num_train = data.count_data()

        # Floor division keeps this an integer count on both Python 2 and 3.
        validate_every = num_train // self.batch_size

        if self.easgd:
            algo = Algo(None, loss=self.loss, validate_every=validate_every,
                        mode='easgd', elastic_lr=1.0, sync_every=self.sync_every,
                        worker_optimizer='sgd',
                        elastic_force=0.9/(comm.Get_size()-1))
        else:
            algo = Algo(self.master_optimizer, loss=self.loss, validate_every=validate_every,
                        sync_every=self.sync_every, worker_optimizer=self.optimizer)

        model_builder = ModelFromJson(comm, json_str=self.model, custom_objects=custom_objects)

        callbacks = self._generateCallbacks(verbose=verbose)

        # Creating the MPIManager object causes all needed worker and master nodes to be created
        manager = MPIManager(comm=comm, data=data,
                             num_epochs=self.epochs if hasattr(self, 'epochs') else self.nb_epoch,
                             algo=algo, model_builder=model_builder,
                             train_list=train_list, val_list=val_list, num_masters=self.masters,
                             synchronous=self.synchronous, callbacks=callbacks, custom_objects=custom_objects)

        # Only rank 0 updates the trial record and launches training.
        if comm.Get_rank() == 0:
            record = self.read_record()
            if "num_train" not in record:
                self.to_record({"num_train": num_train})
            if "num_val" not in record:
                # Built the same way as the training Data object above
                # (keyword arguments, then set_file_names) so both agree on
                # how H5Data is constructed.
                val_data = H5Data(batch_size=self.batch_size,
                                  features_name=self.features_name, labels_name=self.labels_name)
                val_data.set_file_names(val_list)
                self.to_record({"num_val": val_data.count_data()})

            print(custom_objects)

            print(algo)

            t_0 = time()
            histories = manager.process.train()
            delta_t = time() - t_0
            manager.free_comms()
            print("Training finished in %.3f seconds" % delta_t)
            print(histories)
Ejemplo n.º 4
0
    else:
        algo = Algo(args.optimizer, loss=args.loss, validate_every=validate_every,
                sync_every=args.sync_every, worker_optimizer=args.worker_optimizer) 

    # Most Keras callbacks are supported
    callbacks = []
    # Checkpoint file name is "<model_name>_<trial_name>_mpi_learn_result.h5";
    # the callback is configured to monitor 'val_loss'.
    callbacks.append( cbks.ModelCheckpoint( '_'.join([
        model_name,args.trial_name,"mpi_learn_result.h5"]), 
        monitor='val_loss', verbose=1 ) )
    # Early stopping is added only when args.early_stopping is not None;
    # its value is used as the callback's `patience`.
    if args.early_stopping is not None:
        callbacks.append( cbks.EarlyStopping( patience=args.early_stopping,
            verbose=1 ) )

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager( comm=comm, data=data, algo=algo, model_builder=model_builder,
            num_epochs=args.epochs, train_list=train_list, val_list=val_list, 
            num_masters=args.masters, synchronous=args.synchronous, 
            callbacks=callbacks, verbose=args.verbose )

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        print (algo)

        # Time the call to train(); delta_t is the elapsed time as measured by time().
        t_0 = time()
        histories = manager.process.train() 
        delta_t = time() - t_0
        # free_comms() is called only after train() has returned.
        manager.free_comms()
        print ("Training finished in {0:.3f} seconds".format(delta_t))

        # Make output dictionary
        out_dict = { "args":vars(args),
                     "history":histories,