Beispiel #1
0

def make_features_labels(m_module, args):
    """Choose the feature and label dataset names for a training run.

    A name provided by the model module (through its ``get_features`` /
    ``get_labels`` accessor) takes precedence. When the module is ``None`` or
    lacks the accessor, the matching command-line value is used instead.

    Args:
        m_module: imported model module, or ``None``.
        args: parsed arguments exposing ``features_name`` and ``labels_name``.

    Returns:
        A ``(features_name, labels_name)`` tuple.
    """
    def _resolve(accessor, fallback_attr):
        # The fallback attribute is only read when it is actually needed.
        if m_module is None or not hasattr(m_module, accessor):
            return getattr(args, fallback_attr)
        return getattr(m_module, accessor)()

    chosen_features = _resolve("get_features", "features_name")
    chosen_labels = _resolve("get_labels", "labels_name")
    return (chosen_features, chosen_labels)


if __name__ == '__main__':
    # Parse the command line and configure logging with the requested level
    # for both the optional log file and the output stream.
    parser = make_train_parser()
    args = parser.parse_args()
    initialize_logger(
        filename=args.log_file,
        file_level=args.log_level,
        stream_level=args.log_level,
    )

    # A model name that mentions 'torch' overrides the requested backend.
    a_backend = args.backend
    if 'torch' in args.model:
        a_backend = 'torch'

    # Only a '.py' model path is imported as a module; the path is turned
    # into a dotted module name first. Any other model leaves m_module unset.
    if '.py' in args.model:
        m_module = __import__(
            args.model.replace('.py', '').replace('/', '.'),
            fromlist=[None],
        )
    else:
        m_module = None

    features_name, labels_name = make_features_labels(m_module, args)
    train_list, val_list = make_train_val_lists(m_module, args)
    comm = MPI.COMM_WORLD.Dup()

    if args.timeline:
        Timeline.enable()

    use_tf = (a_backend == 'keras')
Beispiel #2
0
def main():
    """Run distributed GAN training over MPI from the command line.

    Parses the command-line options, reads the training and validation file
    lists, configures the Keras/TensorFlow session for the device assigned to
    this rank, then builds the data loader, the training algorithm and the
    ``MPIManager``.  Rank 0 drives the training and writes the trial history
    to ``<model>_<trial-name>_history.json``.

    Raises:
        NotImplementedError: if ``--tf`` was not given.  Only the TensorFlow
            path defines the Keras backend and the model builder, so without
            the flag the driver used to die later with a ``NameError``.
    """
    from TrainingDriver import add_loader_options
    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose',help='display metrics for each training batch',action='store_true')
    parser.add_argument('--profile',help='profile theano code',action='store_true')
    parser.add_argument('--monitor',help='Monitor cpu and gpu utilization', action='store_true')
    parser.add_argument('--tf', help='use tensorflow backend', action='store_true')

    # model arguments
    parser.add_argument('model_json', help='JSON file containing model architecture')
    parser.add_argument('--trial-name', help='descriptive name for trial',
            default='train', dest='trial_name')

    # training data arguments
    parser.add_argument('train_data', help='text file listing data inputs for training')
    parser.add_argument('val_data', help='text file listing data inputs for validation')
    parser.add_argument('--features-name', help='name of HDF5 dataset with input features',
            default='features', dest='features_name')
    parser.add_argument('--labels-name', help='name of HDF5 dataset with output labels',
            default='labels', dest='labels_name')
    parser.add_argument('--batch', help='batch size', default=100, type=int)
    add_loader_options(parser)

    # configuration of network topology
    parser.add_argument('--masters', help='number of master processes', default=1, type=int)
    parser.add_argument('--n-processes', dest='processes', help='number of processes per worker', default=1, type=int)
    parser.add_argument('--max-gpus', dest='max_gpus', help='max GPUs to use',
            type=int, default=-1)
    parser.add_argument('--master-gpu',help='master process should get a gpu',
            action='store_true', dest='master_gpu')
    parser.add_argument('--synchronous',help='run in synchronous mode',action='store_true')

    # configuration of training process
    parser.add_argument('--epochs', help='number of training epochs', default=1, type=int)
    parser.add_argument('--optimizer',help='optimizer for master to use',default='adam')
    parser.add_argument('--loss',help='loss function',default='binary_crossentropy')
    parser.add_argument('--early-stopping', default=None,
                        dest='early_stopping', help='Configuration for early stopping')
    parser.add_argument('--target-metric', default=None,
                        dest='target_metric', help='Passing configuration for a target metric')
    parser.add_argument('--worker-optimizer',help='optimizer for workers to use',
            dest='worker_optimizer', default='sgd')
    parser.add_argument('--worker-optimizer-params',help='worker optimizer parameters (string representation of a dict)',
            dest='worker_optimizer_params', default='{}')
    parser.add_argument('--sync-every', help='how often to sync weights with master',
            default=1, type=int, dest='sync_every')
    parser.add_argument('--mode',help='Mode of operation.'
                        'One of "downpour" (Downpour), "easgd" (Elastic Averaging SGD) or "gem" (Gradient Energy Matching)',default='downpour',choices=['downpour','easgd','gem'])

    parser.add_argument('--elastic-force',help='beta parameter for EASGD',type=float,default=0.9)
    parser.add_argument('--elastic-lr',help='worker SGD learning rate for EASGD',
            type=float, default=1.0, dest='elastic_lr')
    parser.add_argument('--elastic-momentum',help='worker SGD momentum for EASGD',
            type=float, default=0, dest='elastic_momentum')
    parser.add_argument('--restore', help='pass a file to retore the variables from', default=None)
    parser.add_argument('--log-file', default=None, dest='log_file', help='log file to write, in additon to output stream')
    parser.add_argument('--log-level', default='info', dest='log_level', help='log level (debug, info, warn, error)')

    parser.add_argument('--checkpoint', help='Base name of the checkpointing file. If omitted no checkpointing will be done', default=None)
    parser.add_argument('--checkpoint-interval', help='Number of epochs between checkpoints', default=5, type=int, dest='checkpoint_interval')

    args = parser.parse_args()
    model_name = os.path.basename(args.model_json).replace('.json','')
    initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level)

    # Each list file holds one input path per line.
    with open(args.train_data) as train_list_file:
        train_list = [ s.strip() for s in train_list_file.readlines() ]
    with open(args.val_data) as val_list_file:
        val_list = [ s.strip() for s in val_list_file.readlines() ]

    comm = MPI.COMM_WORLD.Dup()

    use_tf = args.tf
    use_torch = not use_tf
    if use_torch:
        # The Keras backend name and the model builder exist only for the
        # TensorFlow path; stop here with a clear message instead of hitting
        # an undefined name further down.
        raise NotImplementedError(
            'this driver only supports the TensorFlow backend; re-run with --tf')

    from TrainingDriver import make_model_weight, make_algo, make_loader
    model_weights = make_model_weight(args, use_torch)

    device = get_device( comm, args.masters, gpu_limit=args.max_gpus,
                gpu_for_master=args.master_gpu)

    backend = 'tensorflow'
    # The optimizer name is given a 'tf' suffix unless it already has one.
    if not args.optimizer.endswith("tf"):
        args.optimizer = args.optimizer + 'tf'
    # Expose only the assigned GPU (last character of the device string) to
    # this process, or no GPU at all when the device is not a GPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
    logging.info('set to device %s %s'%(os.environ['CUDA_VISIBLE_DEVICES'], socket.gethostname()))
    os.environ['KERAS_BACKEND'] = backend

    logging.info(backend)

    import_keras()
    import keras.backend as K
    gpu_options = K.tf.GPUOptions(
        per_process_gpu_memory_fraction=0.0,
        allow_growth=True)
    K.set_session(K.tf.Session(config=K.tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=gpu_options)))

    from nnlo.train.GanModel import GANModelBuilder
    model_builder = GANModelBuilder( comm , tf= True, weights=model_weights)

    data = make_loader(args, args.features_name, args.labels_name, train_list)
    # Validation interval: the data count divided by the batch size.
    algo = make_algo( args, use_tf, comm, validate_every=int(data.count_data()/args.batch ))

    if args.restore:
        algo.load(args.restore)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager( comm=comm, data=data, algo=algo, model_builder=model_builder,
                          num_epochs=args.epochs, train_list=train_list, val_list=val_list,
                          num_masters=args.masters, num_processes=args.processes,
                          synchronous=args.synchronous,
                          verbose=args.verbose , monitor=args.monitor,
                          early_stopping=args.early_stopping,target_metric=args.target_metric ,
                          checkpoint=args.checkpoint, checkpoint_interval=args.checkpoint_interval)

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        logging.info(algo)

        t_0 = time()
        manager.process.train()
        delta_t = time() - t_0
        manager.free_comms()
        logging.info("Training finished in {0:.3f} seconds".format(delta_t))

        json_name = '_'.join([model_name,args.trial_name,"history.json"])
        manager.process.record_details(json_name,
                                       meta={"args":vars(args)})
        logging.info("Wrote trial information to {0}".format(json_name))

    comm.Barrier()
    logging.info("Terminating")
Beispiel #3
0
def main():
    """Train a model across MPI processes from the command line.

    Parses the options from ``make_train_parser``, resolves the built-in
    example model (if ``--model`` names one), picks the torch or TensorFlow
    model builder, creates the ``MPIManager`` and lets rank 0 run the
    training.  Rank 0 writes the trial history to
    ``<output>/<model>_<trial-name>_history.json``; when ``--timeline`` is
    set, the timeline is collected into the matching ``..._timeline.json``.
    """
    parser = make_train_parser()
    args = parser.parse_args()
    initialize_logger(filename=args.log_file,
                      file_level=args.log_level,
                      stream_level=args.log_level)

    # A model name that mentions 'torch' overrides the requested backend.
    a_backend = args.backend
    if 'torch' in args.model:
        a_backend = 'torch'

    # Built-in example models: name -> (module to import, source path handed
    # to the model builder).
    builtin_models = {
        'mnist': ('nnlo.models.model_mnist_tf',
                  'models/model_mnist_tf.py'),
        'mnist_torch': ('nnlo.models.model_mnist_torch',
                        'models/model_mnist_torch.py'),
        'cifar10': ('nnlo.models.model_cifar10_tf',
                    'models/model_cifar10_tf.py'),
    }
    m_module, model_source = None, None
    builtin = builtin_models.get(args.model)
    if builtin is not None:
        module_name, source_path = builtin
        try:
            m_module = importlib.import_module(module_name)
        except Exception as e:
            # Best effort: the failure is logged and the run continues with
            # neither a model module nor a model source.
            logging.critical(e)
        else:
            model_source = source_path

    (features_name, labels_name) = make_features_labels(m_module, args)
    (train_list, val_list) = make_train_val_lists(m_module, args)
    comm = MPI.COMM_WORLD.Dup()

    if args.timeline:
        Timeline.enable()

    # Anything other than the 'keras' backend is treated as torch.
    use_tf = a_backend == 'keras'
    use_torch = not use_tf

    model_weights = make_model_weight(args, use_torch)

    # Restrict CUDA visibility to the device assigned to this rank (or hide
    # all GPUs).  This happens before TensorFlow is imported below.
    device = get_device(comm,
                        args.n_masters,
                        gpu_limit=args.max_gpus,
                        gpu_for_master=args.master_gpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
    logging.debug('set to device %s', os.environ['CUDA_VISIBLE_DEVICES'])

    if use_torch:
        logging.debug("Using pytorch")
        model_builder = ModelPytorch(comm,
                                     source=model_source,
                                     weights=model_weights,
                                     gpus=1 if 'gpu' in device else 0)
    else:
        logging.debug("Using TensorFlow")
        os.environ['KERAS_BACKEND'] = 'tensorflow'

        import tensorflow as tf
        import_keras()
        # Enable on-demand GPU memory growth on every visible GPU.  The loop
        # variable is deliberately not called ``device`` so the device string
        # obtained above is not overwritten.
        for gpu in tf.config.experimental.list_physical_devices('GPU'):
            tf.config.experimental.set_memory_growth(gpu, True)

        model_builder = ModelTensorFlow(comm,
                                        source=model_source,
                                        weights=model_weights)

    data = make_loader(args, features_name, labels_name, train_list)

    # Some input arguments may be ignored depending on chosen algorithm
    algo = make_algo(args,
                     use_tf,
                     comm,
                     validate_every=int(data.count_data() / args.batch))

    if args.restore:
        algo.load(args.restore)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager(comm=comm,
                         data=data,
                         algo=algo,
                         model_builder=model_builder,
                         num_epochs=args.epochs,
                         train_list=train_list,
                         val_list=val_list,
                         num_masters=args.n_masters,
                         num_processes=args.n_processes,
                         synchronous=args.synchronous,
                         verbose=args.verbose,
                         monitor=args.monitor,
                         early_stopping=args.early_stopping,
                         target_metric=args.target_metric,
                         thread_validation=args.thread_validation,
                         checkpoint=args.checkpoint,
                         checkpoint_interval=args.checkpoint_interval)

    # Name the output files after the model module when there is one,
    # otherwise after the --model value with any '.json' removed.
    if m_module:
        model_name = m_module.get_name()
    else:
        model_name = os.path.basename(args.model).replace('.json', '')

    json_name = args.output + '/' + '_'.join(
        [model_name, args.trial_name, "history.json"])
    tl_json_name = args.output + '/' + '_'.join(
        [model_name, args.trial_name, "timeline.json"])

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        logging.debug('Training configuration: %s', algo.get_config())

        t_0 = time()
        manager.process.train()
        delta_t = time() - t_0
        logging.info("Training finished in {0:.3f} seconds".format(delta_t))

        manager.process.record_details(json_name, meta={"args": vars(args)})
        logging.info("Wrote trial information to {0}".format(json_name))
        manager.close()

    comm.barrier()
    logging.info("Terminating")
    if args.timeline:
        Timeline.collect(clean=True, file_name=tl_json_name)
Beispiel #4
0
def main():
    """Run the hyper-parameter optimization driver over MPI.

    Parses the options from ``make_opt_parser``, splits the world
    communicator into blocks of ``args.block_size`` ranks, selects a model
    provider and the train/validation file lists (from ``--model`` or one of
    the named examples), and then either coordinates the optimization
    (block 0) or runs a ``ProcessBlock`` that trains candidate models (all
    other blocks).
    """
    # Imported before first use: a function-level import makes ``socket`` a
    # local name for the whole function, so referencing it ahead of the
    # import statement raises UnboundLocalError.
    import socket

    logging.info("Process is on {}".format(socket.gethostname()))
    parser = make_opt_parser()
    args = parser.parse_args()
    check_sanity(args)
    initialize_logger(filename=args.log_file, file_level=args.log_level, stream_level=args.log_level)

    host = os.environ.get('HOST',os.environ.get('HOSTNAME',socket.gethostname()))

    test = args.example
    model_source = args.model
    # A model name that mentions 'torch' overrides the requested backend;
    # anything other than the 'keras' backend is treated as torch.
    a_backend = args.backend
    if args.model and 'torch' in args.model:
        a_backend = 'torch'
    use_tf = a_backend == 'keras'
    use_torch = not use_tf

    ## starting the configuration of the processes
    logging.info("Initializing...")
    comm_world = MPI.COMM_WORLD.Dup()
    ## consistency check to make sure everything is appropriate
    # One rank is set aside (presumably the coordinator); the rest are
    # divided into blocks of args.block_size.
    num_blocks, left_over = divmod( (comm_world.Get_size()-1), args.block_size)
    if left_over:
        logging.warning("The last block is going to be made of {} nodes, make inconsistent block size {}".format( left_over,
                                                                                                         args.block_size))
        num_blocks += 1 ## to account for the last block
        if left_over<2:
            logging.warning("The last block is going to be too small for mpi_learn, with no workers")
        # NOTE(review): Abort() is reached for any non-zero remainder, which
        # makes the num_blocks adjustment above moot -- confirm whether it
        # was meant to apply only to the `left_over < 2` case.
        MPI.COMM_WORLD.Abort()

    block_num = get_block_num(comm_world, args.block_size)
    device = get_device(comm_world, num_blocks,
                        gpu_limit=args.max_gpus)
    logging.info("Process {} using device {}".format(comm_world.Get_rank(), device))

    # Expose only the assigned GPU (last character of the device string) to
    # this process, or no GPU at all when the device is not a GPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
    logging.info('set to device %s',os.environ['CUDA_VISIBLE_DEVICES'])

    if use_tf:
        import keras.backend as K
        gpu_options=K.tf.GPUOptions(
            per_process_gpu_memory_fraction=0.0,
            allow_growth = True,)
        K.set_session( K.tf.Session( config=K.tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=True,
            gpu_options=gpu_options
        ) ) )

    # NOTE(review): when no --model is given and args.example matches none of
    # the names below, model_provider and the file lists stay undefined --
    # presumably check_sanity() rules that out; verify.
    if model_source is not None:
        ## provide the model details here
        module = __import__(args.model.replace('.py','').replace('/', '.'), fromlist=[None])
        if use_tf:
            model_provider = BuilderFromFunction( model_fn = module.get_model )
        else:
            model_provider = TorchBuilderFromFunction( model_fn = module.get_model)

        (train_list, val_list) = make_train_val_lists(module, args)
        (features_name, labels_name) = make_features_labels(module, args)
    elif test == 'topclass':
        ### topclass example
        # NOTE(review): this branch and the mnist one read args.torch while
        # the rest of the function uses use_torch -- confirm the parser
        # still defines --torch.
        if not args.torch:
            model_provider = BuilderFromFunction( model_fn = models.make_topclass_model )
        else:
            model_provider = TorchBuilderFromFunction( model_fn = models.make_topclass_torch_model)

        if 'daint' in host:
            train_list = glob.glob('/scratch/snx3000/vlimant/data/LCDJets_Remake/train/*.h5')
            val_list = glob.glob('/scratch/snx3000/vlimant/data/LCDJets_Remake/val/*.h5')
        elif 'titan' in host:
            train_list = glob.glob('/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20/train/*.h5')
            val_list = glob.glob('/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20/val/*.h5')
        else:
            train_list = glob.glob('/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20/train/0*.h5')
            val_list = glob.glob('/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20/val/0*.h5')
        features_name='Images'
        labels_name='Labels'
    elif test == 'mnist':
        ### mnist example
        if args.torch:
            model_provider = TorchBuilderFromFunction( model_fn = models.make_mnist_torch_model)
        else:
            model_provider = BuilderFromFunction( model_fn = models.make_mnist_model)

        if 'daint' in host:
            all_list = glob.glob('/scratch/snx3000/vlimant/data/mnist/*.h5')
        elif 'titan' in host:
            all_list = glob.glob('/ccs/proj/csc291/DATA/mnist/*.h5')
        else:
            all_list = glob.glob('/bigdata/shared/mnist/*.h5')
        # The first 70% of the files are used for training, the rest for validation.
        l = int( len(all_list)*0.70)
        train_list = all_list[:l]
        val_list = all_list[l:]
        features_name='features'
        labels_name='labels'
    elif test == 'cifar10':
        ### cifar10 example
        model_provider = BuilderFromFunction( model_fn = models.make_cifar10_model )

        if 'daint' in host:
            all_list = []
        elif 'titan' in host:
            all_list = glob.glob('/ccs/proj/csc291/DATA/cifar10/*.h5')
        else:
            all_list = glob.glob('/bigdata/shared/cifar10/*.h5')
        # The first 70% of the files are used for training, the rest for validation.
        l = int( len(all_list)*0.70)
        train_list = all_list[:l]
        val_list = all_list[l:]
        features_name='features'
        labels_name='labels'
    elif test == 'gan':
        from nnlo.train.GanModel import GANBuilder
        ### the gan example
        model_provider = GANBuilder( parameters = [ Integer(50,400, name='latent_size' ),
                                                    Real(0.0, 1.0, name='discr_drop_out'),
                                                    Categorical([1, 2, 5, 6, 8], name='gen_weight'),
                                                    Categorical([0.1, 0.2, 1, 2, 10], name='aux_weight'),
                                                    Categorical([0.1, 0.2, 1, 2, 10], name='ecal_weight'),
                                                ]
        )
        ## only this mode functions
        args.mode = 'easgd'
        args.worker_optimizer = 'rmsprop'
        if 'daint' in host:
            all_list = glob.glob('/scratch/snx3000/vlimant/data/3DGAN/*.h5')
        elif 'titan' in host:
            all_list = glob.glob('/ccs/proj/csc291/DATA/3DGAN/*.h5')
        else:
            all_list = glob.glob('/data/shared/3DGAN/*.h5')

        # As many training files as there are MPI ranks in the world; the
        # last file is used for validation.
        N= MPI.COMM_WORLD.Get_size()
        train_list = all_list[:N]
        val_list = all_list[-1:]
        features_name='X'
        labels_name='y'

    if use_torch and 'gpu' in device:
        model_provider.gpus=1

    comm_block = comm_world.Split(block_num)
    logging.debug("Process {} sees {} blocks, has block number {}, and rank {} in that block".format(comm_world.Get_rank(),
                                                                                              num_blocks,
                                                                                              block_num,
                                                                                              comm_block.Get_rank()
                                                                                            ))
    if args.n_processes>1:
        t_b_processes = []
        if block_num !=0:
            _,_, b_processes = get_groups(comm_block, args.n_masters, args.n_processes)
            ## collect all block=>world rank translation
            r2r = (comm_block.Get_rank() , comm_world.Get_rank())
            all_r2r = comm_block.allgather( r2r )
            translate = dict( all_r2r ) #key is the rank in block, value is rank in world
            # Rewrite every process group from block ranks to world ranks.
            t_b_processes = [[translate[p] for p in pr] for pr in b_processes]

        #need to collect all the processes lists
        all_t_b_processes = comm_world.allgather( t_b_processes )
        w_processes = set()
        for gb in all_t_b_processes:
            if gb:
                # Tuples are hashable, so identical groups collapse in the set.
                w_processes.update( map(tuple, gb) )
        if block_num == 0:
            logging.info("all collect processes {}".format(w_processes))
            ## now you have the ranks that needs to be initialized in rings.

    # MPI process 0 coordinates the Bayesian optimization procedure
    if block_num == 0:
        opt_coordinator = Coordinator(comm_world, num_blocks,
                                      model_provider.parameters,
                                      (args.hyper_opt=='genetic'),args.population,
                                      checkpointing =  args.checkpoint,
                                      label = args.trial_name
        )
        if args.opt_restore:
            opt_coordinator.load()
        if args.target_objective:
            opt_coordinator.target_fom = args.target_objective
        opt_coordinator.run(num_iterations=args.num_iterations)
        opt_coordinator.record_details()
    else:
        logging.debug("Process {} on block {}, rank {}, create a process block".format( comm_world.Get_rank(),
                                                                                 block_num,
                                                                                 comm_block.Get_rank()))
        data = make_loader(args, features_name, labels_name, train_list)

        from TrainingDriver import make_algo
        algo = make_algo( args, use_tf, comm_block , validate_every=int(data.count_data()/args.batch ))

        block = ProcessBlock(comm_world, comm_block, algo, data, device,
                             model_provider,
                             args.epochs, train_list, val_list,
                             folds = args.n_fold,
                             num_masters = args.n_masters,
                             num_process = args.n_processes,
                             verbose=args.verbose,
                             early_stopping=args.early_stopping,
                             target_metric=args.target_metric,
                             monitor=args.monitor,
                             label = args.trial_name,
                             restore = args.opt_restore,
                             checkpoint=args.checkpoint,
                             checkpoint_interval=args.checkpoint_interval)
        block.run()