def _process_common(args, mesh, soln, cfg):
    # Prefork to allow us to exec processes after MPI is initialised
    if hasattr(os, 'fork'):
        from pytools.prefork import enable_prefork

        enable_prefork()

    # Import but do not initialise MPI
    from mpi4py import MPI

    # Manually initialise MPI
    MPI.Init()

    # Ensure MPI is suitably cleaned up
    register_finalize_handler()

    # Create a backend
    backend = get_backend(args.backend, cfg)

    # Get the mapping from physical ranks to MPI ranks
    rallocs = get_rank_allocation(mesh, cfg)

    # Construct the solver
    solver = get_solver(backend, rallocs, mesh, soln, cfg)

    # If we are running interactively then create a progress bar
    if args.progress and MPI.COMM_WORLD.rank == 0:
        pb = ProgressBar(solver.tstart, solver.tcurr, solver.tend)

        # Register a callback to update the bar after each step
        callb = lambda intg: pb.advance_to(intg.tcurr)
        solver.completed_step_handlers.append(callb)

    # Execute!
    solver.run()
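The comment "Import but do not initialise MPI" only holds if mpi4py's automatic initialisation has been disabled before the import; otherwise `from mpi4py import MPI` itself calls MPI_Init and the manual `MPI.Init()` above would raise an error. A minimal sketch of the required preamble (an assumption about the surrounding program, not part of the snippet itself):

import mpi4py
mpi4py.rc.initialize = False   # defer MPI_Init to an explicit MPI.Init()

from mpi4py import MPI         # importing no longer initialises MPI
MPI.Init()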
def __init__(self):
    if not MPI.Is_initialized():
        print("Manual MPI_Init performed.")
        MPI.Init()

    self.comm = MPI.COMM_WORLD
    self.rank = self.comm.Get_rank()
    self.size = self.comm.Get_size()
def __main__():
    # (assumes `import os` and `import pycuda.driver as cuda` at module level)
    if hasattr(os, 'fork'):
        from pytools.prefork import enable_prefork

        enable_prefork()

    # Define the MPI communication world
    from mpi4py import MPI
    MPI.Init()

    # Select the CUDA device based on the local rank
    print("Local rank", get_local_rank())
    os.environ.pop('CUDA_DEVICE', None)
    devid = get_local_rank()
    os.environ['CUDA_DEVICE'] = str(devid)  # CUDA device number (used by pycuda.autoinit)

    #from pycuda.autoinit import context
    #import pycuda.autoinit
    cuda.init()
    cudadevice = cuda.Device(devid)
    cudacontext = cudadevice.make_context()

    import atexit
    atexit.register(cudacontext.pop)

    # Run the main process
    main()

    # Finalise everything
    MPI.Finalize()
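The `get_local_rank()` helper used above is defined elsewhere; a plausible sketch (an assumption, not the original implementation) derives the node-local rank by splitting COMM_WORLD into per-node communicators, which requires an MPI-3 implementation:

from mpi4py import MPI

def get_local_rank():
    # Ranks sharing a memory domain (i.e. on the same node) end up in
    # the same sub-communicator, so the sub-rank is the node-local rank
    local_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
    try:
        return local_comm.Get_rank()
    finally:
        local_comm.Free()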
def _process_common(args, mesh, soln, cfg):
    # Prefork to allow us to exec processes after MPI is initialised
    if hasattr(os, 'fork'):
        from pytools.prefork import enable_prefork

        enable_prefork()

    # Import but do not initialise MPI
    from mpi4py import MPI

    # Manually initialise MPI
    MPI.Init()

    # Ensure MPI is suitably cleaned up
    register_finalize_handler()

    # Create a backend
    backend = get_backend(args.backend, cfg)

    # Get the mapping from physical ranks to MPI ranks
    rallocs = get_rank_allocation(mesh, cfg)

    # Construct the solver
    solver = get_solver(backend, rallocs, mesh, soln, cfg)

    # Execute!
    solver.run()

    # Finalise MPI
    MPI.Finalize()
def fit(self, training_data, validation_data=None):
    MPI.Init()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    for epoch in range(self.epochs):
        data = training_data[0]
        labels = training_data[1]
        mini_batches = self.create_batches(data, labels,
                                           self.mini_batch_size // size)
        for x, y in mini_batches:
            # Forward and backward propagation
            self.forward_prop(x)
            ma_nabla_b, ma_nabla_w = self.back_prop(y)

            # Sum all ma_nabla_b and ma_nabla_w into nabla_w and nabla_b
            nabla_w = []
            nabla_b = []
            # TODO: add your code

            # Apply the update
            self.weights = [w - self.eta * dw
                            for w, dw in zip(self.weights, nabla_w)]
            self.biases = [b - self.eta * db
                           for b, db in zip(self.biases, nabla_b)]

        self.print_progress(validation_data, epoch)

    MPI.Finalize()
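One way to fill in the TODO above is with an `Allreduce` over each gradient array, mirroring the completed variant of this `fit` method later in this section (which swaps the collective for a ring allreduce):

for mw, mb in zip(ma_nabla_w, ma_nabla_b):
    w = np.zeros_like(mw)
    b = np.zeros_like(mb)
    comm.Allreduce(mw, w, op=MPI.SUM)  # sum the gradients across all ranks
    comm.Allreduce(mb, b, op=MPI.SUM)
    nabla_w.append(w)
    nabla_b.append(b)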
def train(self, train_data, num_epochs, mini_batch_sz,
          learning_rate=0.01, test_data=None):
    X = train_data[0]
    y = train_data[1]
    num_examples = len(X)

    MPI.Init()
    self.sgd(X, y, num_examples, num_epochs, test_data,
             mini_batch_sz, learning_rate)
    MPI.Finalize()
def __init__(self, shape, dimensions, input_comm=None, topology=None):
    super(Distributor, self).__init__(shape, dimensions)

    if configuration['mpi']:
        # First time we enter here, we make sure MPI is initialized
        if not MPI.Is_initialized():
            MPI.Init()
            global init_by_devito
            init_by_devito = True

        self._input_comm = (input_comm or MPI.COMM_WORLD).Clone()

        # Make sure the cloned communicator will be freed up upon exit
        def cleanup():
            if self._input_comm is not None:
                self._input_comm.Free()
        atexit.register(cleanup)

        if topology is None:
            # `MPI.Compute_dims` sets the dimension sizes to be as close to
            # each other as possible, using an appropriate divisibility
            # algorithm. Thus, in 3D:
            # * topology[0] >= topology[1] >= topology[2]
            # * topology[0] * topology[1] * topology[2] == self._input_comm.size
            # However, `MPI.Compute_dims` is distro-dependent, so we have to
            # enforce some properties through our own wrapper (e.g., OpenMPI v3
            # does not guarantee that 9 ranks are arranged into a 3x3 grid
            # when shape=(9, 9))
            self._topology = compute_dims(self._input_comm.size, len(shape))
        else:
            self._topology = topology

        if self._input_comm is not input_comm:
            # By default, Devito arranges processes into a cartesian topology.
            # MPI works with numbered dimensions and follows the C row-major
            # numbering of the ranks, i.e. in a 2x3 Cartesian topology (0,0)
            # maps to rank 0, (0,1) maps to rank 1, (0,2) maps to rank 2,
            # (1,0) maps to rank 3, and so on.
            self._comm = self._input_comm.Create_cart(self._topology)
        else:
            self._comm = input_comm
    else:
        self._input_comm = None
        self._comm = MPI.COMM_NULL
        self._topology = tuple(1 for _ in range(len(shape)))

    # The domain decomposition
    self._decomposition = [Decomposition(np.array_split(range(i), j), c)
                           for i, j, c in zip(shape, self.topology,
                                              self.mycoords)]
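The `compute_dims` wrapper referred to in the comment builds on `MPI.Compute_dims`, which factors a rank count into a balanced Cartesian grid:

from mpi4py import MPI

# 12 ranks over 3 dimensions; yields a balanced factorisation such as [3, 2, 2]
dims = MPI.Compute_dims(12, 3)

As the comment notes, the exact factorisation is implementation-dependent, which is why Devito post-processes the result in its own wrapper.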
def fit(self, training_data, validation_data=None):
    # MPI setup
    MPI.Init()
    self.comm = MPI.COMM_WORLD
    self.rank = self.comm.Get_rank()
    self.size = self.comm.Get_size()
    self.layers_per_master = self.num_layers // self.num_masters

    # Split up the work
    if self.rank < self.num_masters:
        self.do_master(validation_data)
    else:
        self.do_worker(training_data)

    # When all is done
    self.comm.Barrier()
    MPI.Finalize()
def fit(self, training_data, validation_data=None):
    MPI.Init()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    for epoch in range(self.epochs):
        data = training_data[0]
        labels = training_data[1]
        mini_batches = self.create_batches(data, labels,
                                           self.mini_batch_size // size)
        for x, y in mini_batches:
            # Forward and backward propagation
            self.forward_prop(x)
            ma_nabla_b, ma_nabla_w = self.back_prop(y)

            # Sum all ma_nabla_b and ma_nabla_w into nabla_w and nabla_b
            nabla_w = []
            nabla_b = []
            for mw, mb in zip(ma_nabla_w, ma_nabla_b):
                w = np.zeros_like(mw)
                b = np.zeros_like(mb)
                # comm.Allreduce(mw, w, op=MPI.SUM)
                # comm.Allreduce(mb, b, op=MPI.SUM)
                ringallreduce(mw, w, comm, _op)
                ringallreduce(mb, b, comm, _op)
                nabla_w.append(w)
                nabla_b.append(b)

            # Apply the update
            self.weights = [w - self.eta * dw
                            for w, dw in zip(self.weights, nabla_w)]
            self.biases = [b - self.eta * db
                           for b, db in zip(self.biases, nabla_b)]

        self.print_progress(validation_data, epoch)

    MPI.Finalize()
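The `ringallreduce` helper (and the `_op` reduction it is given) are defined elsewhere; below is a minimal sketch of what such a helper might look like, assuming contiguous numpy arrays and the classic two-phase ring (reduce-scatter, then allgather). The chunking scheme and the `np.add` default are assumptions, not the original implementation:

import numpy as np

def ringallreduce(send, recv, comm, op=np.add):
    """Ring allreduce sketch: reduce-scatter phase, then allgather phase."""
    size = comm.Get_size()
    rank = comm.Get_rank()
    recv[...] = send                 # start from the local contribution
    if size == 1:
        return
    flat = recv.reshape(-1)          # assumes recv is contiguous
    # Chunk boundaries: chunk i is flat[off[i]:off[i + 1]]
    off = np.linspace(0, flat.size, size + 1).astype(int)
    right, left = (rank + 1) % size, (rank - 1) % size
    # Phase 1, reduce-scatter: after size - 1 steps, rank r holds the
    # fully reduced chunk (r + 1) % size
    for step in range(size - 1):
        s = (rank - step) % size
        r = (rank - step - 1) % size
        buf = np.empty(off[r + 1] - off[r], dtype=flat.dtype)
        comm.Sendrecv(flat[off[s]:off[s + 1]].copy(), dest=right,
                      recvbuf=buf, source=left)
        flat[off[r]:off[r + 1]] = op(flat[off[r]:off[r + 1]], buf)
    # Phase 2, allgather: circulate the reduced chunks around the ring
    for step in range(size - 1):
        s = (rank + 1 - step) % size
        r = (rank - step) % size
        buf = np.empty(off[r + 1] - off[r], dtype=flat.dtype)
        comm.Sendrecv(flat[off[s]:off[s + 1]].copy(), dest=right,
                      recvbuf=buf, source=left)
        flat[off[r]:off[r + 1]] = buf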
def _init_region_comm(self):
    """
    If in multi-node, this method will initialize information about
    the MPI controllers.

    .. versionadded:: 0.6.0
    """
    if MPI is None:
        raise AttributeError("mpi4py is not imported")

    MPI.Init()
    self._region_comm = MPI.COMM_WORLD
    self._region_size = MPI.COMM_WORLD.Get_size()
    self._region_rank = MPI.COMM_WORLD.Get_rank()

    # Gather every controller's local size; MPI.INT matches the int32 buffers
    local_size = numpy.array([self._local_size], dtype='int32')
    self._all_local_size = numpy.zeros((self._region_size,), dtype='int32')
    self._region_comm.Allgather([local_size, MPI.INT],
                                [self._all_local_size, MPI.INT])
    self._global_size = sum(self._all_local_size)
# Model classes must have the identical name as their Python file in the
# models directory
models_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           models_dir)

# Import the GAN classes
for filename in os.listdir(models_path):
    modulename, ext = os.path.splitext(filename)
    if modulename != '__pycache__' and ext == '.py':
        subpackage = '{0}.{1}'.format(models_dir, modulename)
        obj = getattr(
            __import__(subpackage, globals(), locals(), [modulename]),
            modulename,
        )
        list_GANs.update({obj.model_name: obj})

MPI.Init()


def merge_args(cmdline_args, config_args):
    for key in config_args.keys():
        if key not in cmdline_args:
            sys.exit('Error: unknown key in the configuration file '
                     '"{}"'.format(key))

    args = {}
    args.update(cmdline_args)
    args.update(config_args)
    return args
def spawn(self, **kwargs):
    """
    Spawn MPI processes for and execute each of the managed targets.

    Parameters
    ----------
    kwargs: dict
        Options for the `info` argument of the MPI spawn call. See
        https://www.open-mpi.org/doc/v4.0/man3/MPI_Comm_spawn.3.php
    """
    # Typically MPI must have been initialised before spawning
    if not MPI.Is_initialized():
        MPI.Init()

    if self._is_parent:
        # Find the path to the mpi_backend.py script (which should be in
        # the same directory as this module):
        parent_dir = os.path.dirname(__file__)
        mpi_backend_path = os.path.join(parent_dir, 'mpi_backend.py')

        # Set spawn options. Due to --oversubscribe, we will use no binding
        info = Info.Create()
        info.Set('bind_to', 'none')
        for k, v in kwargs.items():
            info.Set(k, v)

        # Spawn processes:
        self._intercomm = MPI.COMM_SELF.Spawn(sys.executable,
                                              args=[mpi_backend_path],
                                              maxprocs=len(self),
                                              info=info)

        # First, transmit twiggy logging emitters to the spawned processes
        # so that they can configure their logging facilities:
        for i in self._targets:
            self._intercomm.send(twiggy.emitters, i)

        # Next, serialize the routing table ONCE and then transmit it to
        # all of the child nodes:
        try:
            routing_table = self.routing_table
        except:
            routing_table = RoutingTable()
            self.log_warning('Routing table is null, using empty routing table.')
        self._intercomm.bcast(routing_table, root=MPI.ROOT)

        # Transmit the class to instantiate, the globals required by the
        # class, and the constructor arguments; the backend will wait to
        # receive them and then start running the targets on the
        # appropriate nodes.
        r_list = []
        for i in self._targets:
            target_globals = all_global_vars(self._targets[i])

            # Serializing atexit with dill appears to fail in virtualenvs
            # sometimes if atexit._exithandlers contains an unserializable
            # function:
            if 'atexit' in target_globals:
                del target_globals['atexit']
            data = (self._targets[i], target_globals, self._kwargs[i])
            r_list.append(self._intercomm.isend(data, i))

            # Need to clobber data to prevent all_global_vars from
            # including it in its output:
            del data
        MPI.Request.Waitall(r_list)
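For context, the spawned side (`mpi_backend.py`) has to mirror these transmissions. A minimal sketch of the pairing, assuming nothing about the real backend beyond the traffic generated above (the parent spawns from COMM_SELF, so it is rank 0 of the remote group):

from mpi4py import MPI

intercomm = MPI.Comm.Get_parent()               # intercommunicator to the parent
rank = MPI.COMM_WORLD.Get_rank()

emitters = intercomm.recv(source=0)             # matches intercomm.send(twiggy.emitters, i)
routing_table = intercomm.bcast(None, root=0)   # matches bcast(..., root=MPI.ROOT)
target, target_globals, kwargs = intercomm.recv(source=0)  # matches isend(data, i)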
def DNNT():
    # Initialise MPI and check that it has been initialised
    MPI.Init()
    print(MPI.Is_initialized())
    print(MPI.Is_finalized())

    # Get parameters
    generation, dataset, mutationChance, param = getParameters()

    # Initialise the fitness values
    fitnessParent = -1   # The fitness of the parent
    fitnessChild = -1    # The fitness of the child
    networkFitness = -1  # The fitness of the network
    genBestFitness = -1  # The best fitness of the generation

    # Initialise the classes
    net, ga, com, pd = initClasses(param, MPI, networkFitness)

    # Get the logger
    # filename = 'output{}.log'.format(pd.rank)
    filename = 'output.log'
    logger = logging.getLogger()
    handler = logging.FileHandler(filename)
    handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    # Initialise the networks: one random network at every processor
    data = net.initNetwork()

    # Start running the GA (Genetic Algorithm) generations
    for g in range(generation):
        if genBestFitness < 100:
            # GET PARENT FITNESS/ACCURACY
            # Every processor trains and evaluates the accuracy/fitness
            # of the parent network
            fitnessParent = ga.getFitness(data, dataset)

            # BREED THE CHILD
            # This is done using MPI Isend: get the parent via a
            # non-blocking exchange
            child = ga.breeding(data, mutationChance,
                                pd.nonBlockingExchange(data))
            MPI.COMM_WORLD.Barrier()

            # GET THE CHILD'S FITNESS/ACCURACY
            # Every processor trains and evaluates the accuracy/fitness
            # of the child network
            fitnessChild = ga.getFitness(child, dataset)

            '''
            If the network fitness has improved over the previous
            generation, then pass on the features/hyperparameters.
            Pass on the better of the two (parent or child) from this
            generation to the next generation. The comparison is between
            the previous value at the processor and the newly computed
            value.
            '''
            networkFitness, data = com.networkData(fitnessParent,
                                                   fitnessChild, data, child)
            logger.debug(
                'generation=%d, Rank=%d, processid=%s, parent=%s, child=%s, '
                'parentFitness=%0.4f, childFitness=%0.4f, networkFitness=%0.4f',
                g, pd.rank, socket.gethostname(), data, child,
                fitnessParent, fitnessChild, networkFitness)

            '''
            Compare the fitness of the best networks of all the families,
            i.e. of all the network data held by the processors in the
            communicator. Get the best fitness of the generation, kill the
            poorest-performing members of the population and randomly
            reinitialise them to keep the population size constant.
            '''
            genBestFitness, data = com.genFitness(data, param, MPI)
            print(genBestFitness, data)
        else:
            # Broadcast the best results to all the processors and stop
            pd.broadcast(data, pd.rank)
            print('best fitness achieved')
            break

    # And halt
    MPI.Finalize()
def INNT():
    # Initialise MPI and check that it has been initialised
    MPI.Init()
    print(MPI.Is_initialized())
    print(MPI.Is_finalized())

    # Get parameters
    generation, dataset, mutationChance, param, groupSize = getParameters()

    # Initialise the fitness values
    fitnessParent = -1   # The fitness of the parent
    fitnessChild = -1    # The fitness of the child
    networkFitness = -1  # The fitness of the network
    genBestFitness = -1  # The best fitness of the generation

    # Initialise the classes
    net, ga, com, pd = initClasses(param, MPI, groupSize, networkFitness)

    # Get the logger
    # filename = 'output{}.log'.format(pd.rank)
    filename = 'output.log'
    logger = logging.getLogger()
    handler = logging.FileHandler(filename)
    handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)

    # Split the communicator (integer division: Comm.Split needs an
    # integer colour)
    subGroup = pd.rank // groupSize
    subComm = MPI.COMM_WORLD.Split(subGroup, pd.rank)

    # Initialise the networks: one random network at every processor,
    # with each island given its own speciality
    data = net.initNetwork()

    # Islands differ in activation function
    # (there will be at least two subgroups)
    if pd.subGroup == 0:
        data['activation'] = 'sigmoid'
    elif pd.subGroup == 1:
        data['activation'] = 'elu'
    else:
        data['activation'] = 'selu'

    # Start running the GA (Genetic Algorithm) generations
    for g in range(generation):
        if genBestFitness < 100:
            # GET PARENT FITNESS/ACCURACY
            # Every processor trains and evaluates the accuracy/fitness
            # of the parent network
            fitnessParent = ga.getFitness(data, dataset)
            print('loop_1 done', g, pd.rank)

            # BREED THE CHILD
            # This is done using MPI Isend: get the parent via a
            # non-blocking intra-island exchange
            child = ga.breeding(pd.rank, g, data, mutationChance,
                                pd.intraIslandExchange(data, subComm))
            MPI.COMM_WORLD.Barrier()

            # GET THE CHILD'S FITNESS/ACCURACY
            # Every processor trains and evaluates the accuracy/fitness
            # of the child network
            fitnessChild = ga.getFitness(child, dataset)

            '''
            If the network fitness has improved over the previous
            generation, then pass on the features/hyperparameters.
            Pass on the better of the two (parent or child) from this
            generation to the next generation. The comparison is between
            the previous value at the processor and the newly computed
            value.
            '''
            networkFitness, data = com.networkData(fitnessParent,
                                                   fitnessChild, data, child)

            '''
            Compare the fitness of the best networks of all the families,
            i.e. of all the network data held by the processors in the
            communicator. Get the best fitness of the generation, kill the
            poorest-performing members of the population and randomly
            reinitialise them to keep the population size constant.
            '''
            genBestFitness, data = com.genFitness(data, param, MPI, groupSize)
            # print(genBestFitness, data)
            logger.debug(
                'generation=%d, Rank=%d, processid=%s, group=ID%d, subRank=%d, '
                'parent=%s, child=%s, parentFitness=%0.4f, childFitness=%0.4f, '
                'networkFitness=%0.4f, genBestFitness=%0.4f',
                g, pd.rank, socket.gethostname(), pd.subGroup,
                subComm.Get_rank(), data, child, fitnessParent, fitnessChild,
                networkFitness, genBestFitness)

            '''
            Do an inter-island exchange every 5 generations, in which all
            ranks send their data to the previous rank.
            '''
            if g % 5 == 0:
                pd.interIslandExchange(data, subComm)
                print('loop_6 done', pd.rank)
            MPI.COMM_WORLD.Barrier()
        else:
            # Broadcast the best results to all the processors and stop
            pd.broadcast(data, pd.rank)
            print('best fitness achieved')
            break

    # And halt
    MPI.Finalize()
def init_process_group():
    if not MPI.Is_initialized():
        MPI.Init()

    global _comm
    _comm = MPI.COMM_WORLD
def init():
    if not MPI.Is_initialized():
        # print "initializing..."
        MPI.Init()
def init(self):
    # Manually initialise MPI
    if not self.mpi_init:
        self.mpi_init = True
        MPI.Init()
#!/usr/bin/python3
from mpi4py import rc
rc.initialize = False  # stop mpi4py from calling MPI_Init on import

from mpi4py import MPI as mpi
from time import sleep

mpi.Init()
comm = mpi.COMM_WORLD
rank = comm.Get_rank()

# if rank == 1:
#     sleep(2)

for i in range(0, 10):
    if rank == 0:
        data = {'a': i, 'b': 3.14}
        print(data)
        # sleep(1)
        req = comm.isend(data, dest=(rank + 1), tag=0)
        # sleep(2)
        # req.wait()
        # print(rank, req.wait())
    elif rank == 1:
        req = comm.irecv(source=(rank - 1), tag=0)
        print(rank, req.wait())
        # data = req.wait()
        # while 1:
        #     r = req.test()
        #     if r[0]:
        #         print(r[1])
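On rank 0 the send requests are never completed (the `req.wait()` calls are commented out), so the sends may still be in flight when the loop ends. A minimal variant of the same exchange with both requests completed:

for i in range(10):
    if rank == 0:
        req = comm.isend({'a': i, 'b': 3.14}, dest=1, tag=0)
        req.wait()               # ensure the send has completed
    elif rank == 1:
        req = comm.irecv(source=0, tag=0)
        print(rank, req.wait())  # wait() returns the received object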
def initialize():
    if not MPI.Is_initialized():
        MPI.Init()

    global _comm
    _comm = MPI.COMM_WORLD
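Helpers like the two above initialise MPI lazily but leave finalisation to the caller. A common companion (a sketch, assuming the module-level `_comm` defined above) registers cleanup at interpreter exit, much like the `register_finalize_handler()` in the first example:

import atexit

def _finalize():
    if MPI.Is_initialized() and not MPI.Is_finalized():
        MPI.Finalize()

atexit.register(_finalize)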