sync_every=args.sync_every, worker_optimizer=args.worker_optimizer, worker_optimizer_params=args.worker_optimizer_params) if args.restore: algo.load(args.restore) # Creating the MPIManager object causes all needed worker and master nodes to be created manager = MPIManager(comm=comm, data=data, algo=algo, model_builder=model_builder, num_epochs=args.epochs, train_list=train_list, val_list=val_list, num_masters=args.masters, num_processes=args.processes, synchronous=args.synchronous, verbose=args.verbose, monitor=args.monitor, early_stopping=args.early_stopping, target_metric=args.target_metric, thread_validation=args.thread_validation, checkpoint=args.checkpoint, checkpoint_interval=args.checkpoint_interval) # Process 0 launches the training procedure if comm.Get_rank() == 0: logging.debug('Training configuration: %s', algo.get_config()) t_0 = time() histories = manager.process.train()
# Build the training algorithm description from the command-line options.
algo = Algo(
    args.optimizer,
    loss=args.loss,
    validate_every=validate_every,
    sync_every=args.sync_every,
    worker_optimizer=args.worker_optimizer,
)
if args.restore:
    # Resume from a previously saved algorithm state.
    algo.load(args.restore)

# Constructing the MPIManager is what brings up every worker and master node,
# so all of its options are gathered first and handed over in one call.
manager_options = dict(
    comm=comm,
    data=data,
    algo=algo,
    model_builder=model_builder,
    num_epochs=args.epochs,
    train_list=train_list,
    val_list=val_list,
    num_masters=args.masters,
    synchronous=args.synchronous,
    verbose=args.verbose,
    monitor=args.monitor,
    early_stopping=args.early_stopping,
    target_metric=args.target_metric,
)
manager = MPIManager(**manager_options)

# Only rank 0 kicks off training; it also times the run and releases the
# communicators afterwards.
if comm.Get_rank() == 0:
    print(algo)
    t_0 = time()
    histories = manager.process.train()
    delta_t = time() - t_0
    manager.free_comms()
def _execute_MPI(self,
                 comm=None,
                 archiveTraining=True,
                 archiveValidation=True,
                 verbose=1):
    """Train this trial's model across MPI processes using mpi_learn.

    Resolves the trial's training/validation procedures to HDF5 file paths,
    builds the mpi_learn ``Algo``, ``H5Data``, ``ModelFromJson`` and
    ``MPIManager`` objects, and on rank 0 records the dataset sizes and
    launches training.

    Args:
        comm: MPI communicator to use. When ``None`` a duplicate of
            ``MPI.COMM_WORLD`` is created.
        archiveTraining: Not used by this method body; kept for interface
            compatibility.
        archiveValidation: Not used by this method body; kept for interface
            compatibility.
        verbose: Forwarded to ``self._generateCallbacks``.

    Raises:
        ValueError: If ``train_procedure`` or ``val_procedure`` is not a
            list, or if a custom object cannot be imported from its module.
        IOError: If a procedure entry given as a path does not exist.
    """
    from mpi4py import MPI
    from mpi_learn.mpi.manager import MPIManager, get_device
    from mpi_learn.train.algo import Algo
    from mpi_learn.train.data import H5Data
    from mpi_learn.train.model import ModelFromJson

    if comm is None:
        comm = MPI.COMM_WORLD.Dup()

    if not isinstance(self.train_procedure, list):
        raise ValueError("Trial attribute train_procedure: expected list of DataProcedures or paths but got type %r" % type(self.train_procedure))
    if not isinstance(self.val_procedure, list):
        raise ValueError("Trial attribute val_procedure: expected list of DataProcedures or paths but got type %r" % type(self.val_procedure))

    train = [DataProcedure.from_json(self.archive_dir, x)
             if isinstance(x, DataProcedure) else str(x)
             for x in self.train_procedure]
    val = [DataProcedure.from_json(self.archive_dir, x)
           if isinstance(x, DataProcedure) else str(x)
           for x in self.val_procedure]

    batchAssertArchived(train)
    batchAssertArchived(val)

    def assertStr(entry):
        """Return the HDF5 path for a DataProcedure or an existing file path."""
        if isinstance(entry, DataProcedure):
            # Fixed: the original read an undefined name `dp` here.
            return entry.get_path() + "archive.h5"
        elif os.path.isfile(entry):
            return entry
        else:
            raise IOError("Cannot find %r" % entry)

    train_list = [assertStr(entry) for entry in train]
    # Fixed: the original iterated `dp` but passed `x`, so the validation
    # entries were never the ones being resolved.
    val_list = [assertStr(entry) for entry in val]

    # There is an issue when multiple processes import Keras simultaneously --
    # the file .keras/keras.json is sometimes not read correctly.
    # As a workaround, just try several times to import keras.
    # Note: importing keras imports theano --
    # impossible to change GPU choice after this.
    for try_num in range(10):
        try:
            from keras.models import model_from_json
            import keras.callbacks as cbks
            break
        except ValueError:
            print("Unable to import keras. Trying again: %d" % try_num)
            sleep(0.1)

    # Resolve every custom object name to the actual object in its module.
    custom_objects = {}
    for name, module in self.custom_objects.items():
        try:
            custom_objects[name] = getattr(importlib.import_module(module), name)
        except Exception:
            # `Exception` rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not converted into a ValueError.
            raise ValueError(
                "Custom Object %r does not exist in %r. "
                "For best results Custom Objects should be importable and not locally defined."
                % (str(name), str(module)))

    # We initialize the Data object with the training data list
    # so that we can use it to count the number of training examples
    data = H5Data(batch_size=self.batch_size,
                  features_name=self.features_name,
                  labels_name=self.labels_name)
    data.set_file_names(train_list)
    num_train = data.count_data()

    # Floor division keeps this an integer count on Python 2 and Python 3.
    validate_every = num_train // self.batch_size

    if self.easgd:
        # NOTE(review): elastic_force divides by (size - 1); this assumes the
        # communicator holds more than one process -- confirm with callers.
        algo = Algo(None,
                    loss=self.loss,
                    validate_every=validate_every,
                    mode='easgd',
                    elastic_lr=1.0,
                    sync_every=self.sync_every,
                    worker_optimizer='sgd',
                    elastic_force=0.9 / (comm.Get_size() - 1))
    else:
        algo = Algo(self.master_optimizer,
                    loss=self.loss,
                    validate_every=validate_every,
                    sync_every=self.sync_every,
                    worker_optimizer=self.optimizer)

    model_builder = ModelFromJson(comm,
                                  json_str=self.model,
                                  custom_objects=custom_objects)

    callbacks = self._generateCallbacks(verbose=verbose)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager(comm=comm,
                         data=data,
                         num_epochs=self.epochs if hasattr(self, 'epochs') else self.nb_epoch,
                         algo=algo,
                         model_builder=model_builder,
                         train_list=train_list,
                         val_list=val_list,
                         num_masters=self.masters,
                         synchronous=self.synchronous,
                         callbacks=callbacks,
                         custom_objects=custom_objects)

    # Process 0 records dataset sizes and launches the training procedure.
    if comm.Get_rank() == 0:
        record = self.read_record()
        if "num_train" not in record:
            self.to_record({"num_train": num_train})
        if "num_val" not in record:
            # Built the same way as the training Data object above; the
            # original passed val_list positionally, which is inconsistent
            # with the keyword-only construction used for `data`.
            val_data = H5Data(batch_size=self.batch_size,
                              features_name=self.features_name,
                              labels_name=self.labels_name)
            val_data.set_file_names(val_list)
            self.to_record({"num_val": val_data.count_data()})

        print(custom_objects)
        print(algo)

        t_0 = time()
        histories = manager.process.train()
        delta_t = time() - t_0
        manager.free_comms()
        print("Training finished in %.3f seconds" % delta_t)
        print(histories)
else: algo = Algo(args.optimizer, loss=args.loss, validate_every=validate_every, sync_every=args.sync_every, worker_optimizer=args.worker_optimizer) # Most Keras callbacks are supported callbacks = [] callbacks.append( cbks.ModelCheckpoint( '_'.join([ model_name,args.trial_name,"mpi_learn_result.h5"]), monitor='val_loss', verbose=1 ) ) if args.early_stopping is not None: callbacks.append( cbks.EarlyStopping( patience=args.early_stopping, verbose=1 ) ) # Creating the MPIManager object causes all needed worker and master nodes to be created manager = MPIManager( comm=comm, data=data, algo=algo, model_builder=model_builder, num_epochs=args.epochs, train_list=train_list, val_list=val_list, num_masters=args.masters, synchronous=args.synchronous, callbacks=callbacks, verbose=args.verbose ) # Process 0 launches the training procedure if comm.Get_rank() == 0: print (algo) t_0 = time() histories = manager.process.train() delta_t = time() - t_0 manager.free_comms() print ("Training finished in {0:.3f} seconds".format(delta_t)) # Make output dictionary out_dict = { "args":vars(args), "history":histories,