logging.debug("Using device {}".format(model_builder.device)) else: model_builder = ModelFromJson(comm, args.model, weights=model_weights) logging.debug("using device {}".format(device)) os.environ[ 'THEANO_FLAGS'] = "profile=%s,device=%s,floatX=float32" % ( args.profile, device.replace('gpu', 'cuda')) # GPU ops need to be executed synchronously in order for profiling to make sense if args.profile: os.environ['CUDA_LAUNCH_BLOCKING'] = '1' data = H5Data(batch_size=args.batch, cache=args.caching_dir, preloading=args.data_preload, features_name=args.features_name, labels_name=args.labels_name) # We initialize the Data object with the training data list # so that we can use it to count the number of training examples data.set_file_names(train_list) validate_every = int(data.count_data() / args.batch) # Some input arguments may be ignored depending on chosen algorithm if args.mode == 'easgd': algo = Algo(None, loss=args.loss, validate_every=validate_every, mode='easgd', sync_every=args.sync_every, worker_optimizer=args.worker_optimizer,
def _execute_MPI(self,
                 comm=None,
                 # masters=1,
                 # easgd=False,
                 archiveTraining=True,
                 archiveValidation=True,
                 verbose=1):
    from mpi4py import MPI
    from mpi_learn.mpi.manager import MPIManager, get_device
    from mpi_learn.train.algo import Algo
    from mpi_learn.train.data import H5Data
    from mpi_learn.train.model import ModelFromJson

    # return prep_func
    # print(self.custom_objects)
    # print(custom_objects)
    # print(Lorentz, Slice)
    # raise ValueError()
    load_weights = True
    # synchronous = False
    # sync_every = 1
    # MPIoptimizer = "rmsprop"
    # batch_size = 100

    if comm is None:
        comm = MPI.COMM_WORLD.Dup()

    # if(not isinstance(self.train_procedure,list)): self.train_procedure = [self.train_procedure]
    # if(not isinstance(self.val_procedure,list)): self.val_procedure = [self.val_procedure]
    if not isinstance(self.train_procedure, list):
        raise ValueError("Trial attribute train_procedure: expected list of DataProcedures or paths "
                         "but got type %r" % type(self.train_procedure))
    if not isinstance(self.val_procedure, list):
        raise ValueError("Trial attribute val_procedure: expected list of DataProcedures or paths "
                         "but got type %r" % type(self.val_procedure))

    train = [DataProcedure.from_json(self.archive_dir, x) if isinstance(x, DataProcedure) else str(x)
             for x in self.train_procedure]
    val = [DataProcedure.from_json(self.archive_dir, x) if isinstance(x, DataProcedure) else str(x)
           for x in self.val_procedure]
    # if(not isinstance(train, list) or not False in [isinstance(x,DataProcedure) or isinstance(x,string_types) for x in train]):
    #     raise ValueError("Train procedure must be list of DataProcedures")
    # if(not isinstance(val, list) or not False in [isinstance(x, DataProcedure) or isinstance(x, string_types) for x in val]):
    #     raise ValueError("Validation procedure must be list of DataProcedures")
    batchAssertArchived(train)
    batchAssertArchived(val)

    def assertStr(x):
        # Resolve each entry to the path of an HDF5 file on disk.
        if isinstance(x, DataProcedure):
            return x.get_path() + "archive.h5"
        elif os.path.isfile(x):
            return x
        else:
            raise IOError("Cannot find %r" % x)

    train_list = [assertStr(x) for x in train]
    val_list = [assertStr(x) for x in val]
    # print("Train List:", train_list)
    # print("Val List:", val_list)

    # There is an issue when multiple processes import Keras simultaneously --
    # the file .keras/keras.json is sometimes not read correctly.
    # As a workaround, just try several times to import keras.
    # Note: importing keras imports theano --
    # impossible to change GPU choice after this.
    for try_num in range(10):
        try:
            from keras.models import model_from_json
            import keras.callbacks as cbks
            break
        except ValueError:
            print("Unable to import keras. Trying again: %d" % try_num)
            sleep(0.1)

    custom_objects = {}
    for name, module in self.custom_objects.items():
        try:
            # my_module = importlib.import_module('os.path')
            custom_objects[name] = getattr(importlib.import_module(module), name)
            # exec("from " + module + " import " + name)
        except:
            raise ValueError("Custom Object %r does not exist in %r. "
                             "For best results Custom Objects should be importable and not locally defined."
                             % (str(name), str(module)))

    # We initialize the Data object with the training data list
    # so that we can use it to count the number of training examples
    data = H5Data(batch_size=self.batch_size,
                  features_name=self.features_name,
                  labels_name=self.labels_name)
    data.set_file_names(train_list)
    num_train = data.count_data()
    # if comm.Get_rank() == 0:
    validate_every = num_train / self.batch_size

    if self.easgd:
        # raise NotImplementedError("Not implemented")
        algo = Algo(None,
                    loss=self.loss,
                    validate_every=validate_every,
                    mode='easgd',
                    elastic_lr=1.0,
                    sync_every=self.sync_every,
                    worker_optimizer='sgd',
                    elastic_force=0.9 / (comm.Get_size() - 1))
    else:
        algo = Algo(self.master_optimizer,
                    loss=self.loss,
                    validate_every=validate_every,
                    sync_every=self.sync_every,
                    worker_optimizer=self.optimizer)

    # model = self.compile(custom_objects=custom_objects)
    # model_arch = model.to_json()
    # print(self.get_path()+"trial.json")
    model_builder = ModelFromJson(comm, json_str=self.model, custom_objects=custom_objects)
    callbacks = self._generateCallbacks(verbose=verbose)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager(comm=comm,
                         data=data,
                         num_epochs=self.epochs if hasattr(self, 'epochs') else self.nb_epoch,
                         algo=algo,
                         model_builder=model_builder,
                         train_list=train_list,
                         val_list=val_list,
                         num_masters=self.masters,
                         synchronous=self.synchronous,
                         callbacks=callbacks,
                         custom_objects=custom_objects)

    # Process 0 defines the model and propagates it to the workers.
    if comm.Get_rank() == 0:
        record = self.read_record()
        if "num_train" not in record:
            self.to_record({"num_train": num_train})
        if "num_val" not in record:
            val_data = H5Data(val_list,
                              batch_size=self.batch_size,
                              features_name=self.features_name,
                              labels_name=self.labels_name)
            self.to_record({"num_val": val_data.count_data()})

        print(custom_objects)
        print(algo)
        # weights = model.get_weights()
        # manager.process.set_model_info( model_arch, algo, weights )
        t_0 = time()
        histories = manager.process.train()
        delta_t = time() - t_0
        manager.free_comms()
        print("Training finished in %.3f seconds" % delta_t)
        print(histories)
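# --------------------------------------------------------------------------
# Hedged usage sketch (not part of the original source): how a method like
# _execute_MPI above is typically driven. Every MPI rank constructs the same
# trial object and calls the method; rank 0 acts as the master, writes the
# trial record, and receives the histories from manager.process.train().
# The `Trial` class and `Trial.from_json` loader named below are hypothetical
# stand-ins for whatever object actually owns _execute_MPI in this repository.
#
#   # run_trial.py -- launch with: mpirun -np 5 python run_trial.py
#   from mpi4py import MPI
#
#   trial = Trial.from_json(archive_dir, "trial.json")   # hypothetical loader
#   trial._execute_MPI(comm=MPI.COMM_WORLD.Dup())        # every rank calls this
# --------------------------------------------------------------------------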
    backend = 'tensorflow'
    model_builder = ModelFromJsonTF(comm, args.model_json, device_name=device,
                                    weights=args.model_weights)
    print("Process {0} using device {1}".format(comm.Get_rank(), model_builder.device))
else:
    backend = 'theano'
    model_builder = ModelFromJson(comm, args.model_json, weights=args.model_weights)
    print("Process {0} using device {1}".format(comm.Get_rank(), device))
    os.environ['THEANO_FLAGS'] = "profile=%s,device=%s,floatX=float32" % (args.profile, device)

# GPU ops need to be executed synchronously in order for profiling to make sense
if args.profile:
    os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

os.environ['KERAS_BACKEND'] = backend
import_keras()
import keras.callbacks as cbks

data = H5Data(batch_size=args.batch,
              features_name=args.features_name,
              labels_name=args.labels_name)
# We initialize the Data object with the training data list
# so that we can use it to count the number of training examples
data.set_file_names(train_list)
validate_every = data.count_data() / args.batch

# Some input arguments may be ignored depending on chosen algorithm
if args.easgd:
    algo = Algo(None,
                loss=args.loss,
                validate_every=validate_every,
                mode='easgd',
                sync_every=args.sync_every,
                worker_optimizer=args.worker_optimizer,
                elastic_force=args.elastic_force / (comm.Get_size() - 1),
                elastic_lr=args.elastic_lr,
                elastic_momentum=args.elastic_momentum)
else:
    algo = Algo(args.optimizer,
                loss=args.loss,
                validate_every=validate_every,
    tag += 'rmsprop_'

if not fresh:
    try:
        gm.generator.load_weights('simple_generator.h5')
        gm.discriminator.load_weights('simple_discriminator.h5')
    except:
        print("fresh weights")
else:
    tag += 'fresh_'
print(tag, "is the option")

files = list(filter(None, open('train_3d_energy.list').read().split('\n')))
data = H5Data(batch_size=100,
              cache=options.cache,
              preloading=0,
              features_name='X',
              labels_name='y')
data.set_file_names(files)

"""
if options.inmem:
    import os
    relocated = []
    os.system('mkdir /dev/shm/vlimant/')
    for fn in files:
        relocate = '/dev/shm/vlimant/' + fn.split('/')[-1]
        if not os.path.isfile(relocate):
            print("copying %s to %s" % (fn, relocate))
            if os.system('cp %s %s' % (fn, relocate)) == 0:
                relocated.append(relocate)
    files = relocated
param_ranges = [
    (0.0, 1.0),   # dropout
    (1, 6),       # kernel_size
    (1., 10.),    # lr exponent
]

# MPI process 0 coordinates the Bayesian optimization procedure
if block_num == 0:
    model_fn = lambda x, y, z: mpi.test_cnn(x, y, np.exp(-z))
    opt_coordinator = coordinator.Coordinator(comm_world, num_blocks,
                                              param_ranges, model_fn)
    opt_coordinator.run(num_iterations=30)
else:
    data = H5Data(batch_size=args.batch,
                  features_name='Images',
                  labels_name='Labels')
    data.set_file_names(train_list)
    validate_every = data.count_data() / args.batch
    algo = Algo(args.optimizer,
                loss=args.loss,
                validate_every=validate_every,
                sync_every=args.sync_every)

    os.environ['KERAS_BACKEND'] = backend
    import_keras()
    import keras.callbacks as cbks

    callbacks = []
    if args.early_stopping is not None:
        callbacks.append(cbks.EarlyStopping(patience=args.early_stopping, verbose=1))

    block = process_block.ProcessBlock(comm_world,