Exemple #1
0
            logging.debug("Using device {}".format(model_builder.device))
        else:
            model_builder = ModelFromJson(comm,
                                          args.model,
                                          weights=model_weights)
            logging.debug("using device {}".format(device))
            os.environ[
                'THEANO_FLAGS'] = "profile=%s,device=%s,floatX=float32" % (
                    args.profile, device.replace('gpu', 'cuda'))
            # GPU ops need to be executed synchronously in order for profiling to make sense
        if args.profile:
            os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

    data = H5Data(batch_size=args.batch,
                  cache=args.caching_dir,
                  preloading=args.data_preload,
                  features_name=args.features_name,
                  labels_name=args.labels_name)
    # We initialize the Data object with the training data list
    # so that we can use it to count the number of training examples
    data.set_file_names(train_list)
    validate_every = int(data.count_data() / args.batch)

    # Some input arguments may be ignored depending on chosen algorithm
    if args.mode == 'easgd':
        algo = Algo(None,
                    loss=args.loss,
                    validate_every=validate_every,
                    mode='easgd',
                    sync_every=args.sync_every,
                    worker_optimizer=args.worker_optimizer,
Exemple #2
0
    def _execute_MPI(self,
                    comm=None,
                    archiveTraining=True,
                    archiveValidation=True,
                    verbose=1):
        """Train this trial's model across MPI processes using mpi_learn.

        The method resolves the training/validation procedures to HDF5 file
        paths, builds the mpi_learn ``H5Data``, ``Algo`` and ``ModelFromJson``
        objects, and hands them to an ``MPIManager``. On MPI rank 0 it also
        records the number of training/validation examples (if not already
        recorded), runs ``manager.process.train()`` and prints the timing and
        the returned histories.

        Args:
            comm: MPI communicator to use. When ``None`` a duplicate of
                ``MPI.COMM_WORLD`` is created.
            archiveTraining: Accepted for interface compatibility; not read
                anywhere in this method body.
            archiveValidation: Accepted for interface compatibility; not read
                anywhere in this method body.
            verbose: Forwarded to ``self._generateCallbacks``.

        Raises:
            ValueError: If ``train_procedure`` or ``val_procedure`` is not a
                list, or if a custom object cannot be imported from its module.
            IOError: If a training/validation entry is neither a
                ``DataProcedure`` nor an existing file path.
        """
        from mpi4py import MPI
        from mpi_learn.mpi.manager import MPIManager, get_device
        from mpi_learn.train.algo import Algo
        from mpi_learn.train.data import H5Data
        from mpi_learn.train.model import ModelFromJson

        if comm is None:
            comm = MPI.COMM_WORLD.Dup()

        if not isinstance(self.train_procedure, list):
            raise ValueError("Trial attribute train_procedure: expected list of DataProcedures or paths but got type %r" % type(self.train_procedure))
        if not isinstance(self.val_procedure, list):
            raise ValueError("Trial attribute val_procedure: expected list of DataProcedures or paths but got type %r" % type(self.val_procedure))

        train = [DataProcedure.from_json(self.archive_dir, x) if isinstance(x, DataProcedure) else str(x)
                 for x in self.train_procedure]
        val = [DataProcedure.from_json(self.archive_dir, x) if isinstance(x, DataProcedure) else str(x)
               for x in self.val_procedure]

        batchAssertArchived(train)
        batchAssertArchived(val)

        def assertStr(x):
            # Map one entry to an HDF5 file path: a DataProcedure resolves to
            # the "archive.h5" under its own path, a string must already be
            # an existing file.
            if isinstance(x, DataProcedure):
                return x.get_path() + "archive.h5"
            elif os.path.isfile(x):
                return x
            else:
                raise IOError("Cannot find %r" % x)

        train_list = [assertStr(x) for x in train]
        # Each validation entry must be resolved from its own loop variable;
        # reusing a name left over from the training comprehension would map
        # every validation entry to the wrong path.
        val_list = [assertStr(x) for x in val]

        # There is an issue when multiple processes import Keras simultaneously --
        # the file .keras/keras.json is sometimes not read correctly.
        # As a workaround, just try several times to import keras.
        # Note: importing keras imports theano --
        # impossible to change GPU choice after this.
        for try_num in range(10):
            try:
                from keras.models import model_from_json
                import keras.callbacks as cbks
                break
            except ValueError:
                print("Unable to import keras. Trying again: %d" % try_num)
                sleep(0.1)

        # Resolve each custom object name to the actual attribute of the
        # module it is declared to live in.
        custom_objects = {}
        for name, module in self.custom_objects.items():
            try:
                custom_objects[name] = getattr(importlib.import_module(module), name)
            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt/SystemExit
                # still propagate instead of being reported as a ValueError.
                raise ValueError("Custom Object %r does not exist in %r. \
                    For best results Custom Objects should be importable and not locally defined." % (str(name), str(module)))

        # We initialize the Data object with the training data list
        # so that we can use it to count the number of training examples
        data = H5Data(batch_size=self.batch_size,
                      features_name=self.features_name,
                      labels_name=self.labels_name)
        data.set_file_names(train_list)
        num_train = data.count_data()

        # Floor division keeps validate_every an integer regardless of whether
        # true division is in effect.
        validate_every = num_train // self.batch_size

        if self.easgd:
            # NOTE(review): divides by (communicator size - 1); a single-process
            # communicator would raise ZeroDivisionError here -- confirm callers
            # always launch more than one process in easgd mode.
            algo = Algo(None, loss=self.loss, validate_every=validate_every,
                        mode='easgd', elastic_lr=1.0, sync_every=self.sync_every,
                        worker_optimizer='sgd',
                        elastic_force=0.9/(comm.Get_size()-1))
        else:
            algo = Algo(self.master_optimizer, loss=self.loss, validate_every=validate_every,
                        sync_every=self.sync_every, worker_optimizer=self.optimizer)

        model_builder = ModelFromJson(comm, json_str=self.model, custom_objects=custom_objects)

        callbacks = self._generateCallbacks(verbose=verbose)

        num_epochs = self.epochs if hasattr(self, 'epochs') else self.nb_epoch

        # Creating the MPIManager object causes all needed worker and master nodes to be created
        manager = MPIManager(comm=comm, data=data, num_epochs=num_epochs,
                             algo=algo, model_builder=model_builder,
                             train_list=train_list, val_list=val_list, num_masters=self.masters,
                             synchronous=self.synchronous, callbacks=callbacks, custom_objects=custom_objects)

        # Process 0 defines the model and propagates it to the workers.
        if comm.Get_rank() == 0:
            record = self.read_record()
            if "num_train" not in record:
                self.to_record({"num_train": num_train})
            if "num_val" not in record:
                # Built the same way as the training Data object above:
                # keyword-only construction, then set_file_names.
                val_data = H5Data(batch_size=self.batch_size,
                                  features_name=self.features_name,
                                  labels_name=self.labels_name)
                val_data.set_file_names(val_list)
                self.to_record({"num_val": val_data.count_data()})

            print(custom_objects)
            print(algo)

            t_0 = time()
            histories = manager.process.train()
            delta_t = time() - t_0
            manager.free_comms()
            print("Training finished in %.3f seconds" % delta_t)
            print(histories)
Exemple #3
0
        backend = 'tensorflow'
        model_builder = ModelFromJsonTF( comm, args.model_json, device_name=device , weights=args.model_weights)
        print ("Process {0} using device {1}".format(comm.Get_rank(), model_builder.device))
    else:
        backend = 'theano'
        model_builder = ModelFromJson( comm, args.model_json ,weights=args.model_weights)
        print ("Process {0} using device {1}".format(comm.Get_rank(),device))
        os.environ['THEANO_FLAGS'] = "profile=%s,device=%s,floatX=float32" % (args.profile,device)
        # GPU ops need to be executed synchronously in order for profiling to make sense
        if args.profile:
            os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
    os.environ['KERAS_BACKEND'] = backend
    import_keras()
    import keras.callbacks as cbks

    data = H5Data( batch_size=args.batch, 
            features_name=args.features_name, labels_name=args.labels_name )
    # We initialize the Data object with the training data list
    # so that we can use it to count the number of training examples
    data.set_file_names( train_list )
    validate_every = data.count_data()/args.batch 

    # Some input arguments may be ignored depending on chosen algorithm
    if args.easgd:
        algo = Algo(None, loss=args.loss, validate_every=validate_every,
                mode='easgd', sync_every=args.sync_every,
                worker_optimizer=args.worker_optimizer,
                elastic_force=args.elastic_force/(comm.Get_size()-1),
                elastic_lr=args.elastic_lr, 
                elastic_momentum=args.elastic_momentum) 
    else:
        algo = Algo(args.optimizer, loss=args.loss, validate_every=validate_every,
        tag+='rmsprop_'

    if not fresh:
        try:
            gm.generator.load_weights('simple_generator.h5')
            gm.discriminator.load_weights('simple_discriminator.h5')
        except:
            print ("fresh weights")
    else:
        tag+='fresh_'

print (tag,"is the option")

files = list(filter(None,open('train_3d_energy.list').read().split('\n')))
data = H5Data( batch_size = 100,
               cache = options.cache,
               preloading=0,
               features_name='X', labels_name='y')
data.set_file_names(files)
"""

if options.inmem:
    import os
    relocated = []
    os.system('mkdir /dev/shm/vlimant/')
    for fn in files:
        relocate = '/dev/shm/vlimant/'+fn.split('/')[-1]
        if not os.path.isfile( relocate ):
            print ("copying %s to %s"%( fn , relocate))
            if os.system('cp %s %s'%( fn ,relocate))==0:
                relocated.append( relocate )
    files = relocated
Exemple #5
0
    param_ranges = [
        (0.0, 1.0),  # dropout
        (1, 6),  # kernel_size
        (1., 10.),  # lr exponent
    ]

    # MPI process 0 coordinates the Bayesian optimization procedure
    if block_num == 0:
        model_fn = lambda x, y, z: mpi.test_cnn(x, y, np.exp(-z))
        opt_coordinator = coordinator.Coordinator(comm_world, num_blocks,
                                                  param_ranges, model_fn)
        opt_coordinator.run(num_iterations=30)
    else:
        data = H5Data(batch_size=args.batch,
                      features_name='Images',
                      labels_name='Labels')
        data.set_file_names(train_list)
        validate_every = data.count_data() / args.batch
        algo = Algo(args.optimizer,
                    loss=args.loss,
                    validate_every=validate_every,
                    sync_every=args.sync_every)
        os.environ['KERAS_BACKEND'] = backend
        import_keras()
        import keras.callbacks as cbks
        callbacks = []
        if args.early_stopping is not None:
            callbacks.append(
                cbks.EarlyStopping(patience=args.early_stopping, verbose=1))
        block = process_block.ProcessBlock(comm_world,