def map(self, tag): """ Execute supplied mapfn on each key-value pair read from file assigned by the master node """ atimer = Timer('Worker_Map') # load key-value pairs from filename filename = world.recv(source=0, tag=tag) data = self.read(filename, tag) buffer = [ [] for ii in range(self.config['nReduce']) ] for key, val in data.items(): for newKey, newVal in self.config['mapfn'](key, val): idx = self.config['hashfn'](newKey) % self.config['nReduce'] buffer[idx].append( (newKey, newVal) ) # write out new key-value pairs in scattered files for ii in range(self.config['nReduce']): tmpfile = self.reduceIn[ii]+'-tmp'+str(world.rank) # dump in append mode with open(tmpfile, 'a+b') as fout: pickle.dump(buffer[ii], fout, pickle.HIGHEST_PROTOCOL) # report back as successful completion of task world.send(filename, dest=0, tag=MAP_FINISH)
def map(self, f, tasks): N = len(tasks) P = self.P Pless1 = P - 1 if self.rank != 0: self.wait() return if f is not self.f: self.f = f requests = [] for p in range(1, self.P): r = COMM_WORLD.isend(f, dest=p) requests.append(r) MPI.Request.waitall(requests) requests = [] for i, task in enumerate(tasks): r = COMM_WORLD.isend(task, dest=(i%Pless1)+1, tag=i) requests.append(r) MPI.Request.waitall(requests) results = [] for i in range(N): result = COMM_WORLD.recv(source=(i%Pless1)+1, tag=i) results.append(result) return results
def test_padded_kernel(self): """ Implement a simple padded kernel. """ for case in self.cases: # Form data to work on. space.initialize_space(case['shape']) x_np = comm.allreduce(np.random.randn(*case['shape']).astype(case['dtype'])) x = Grid(x_np, x_overlap=1) s_np = comm.allreduce(np.random.randn(1).astype(case['dtype'])) s = Const(s_np) z = Out(case['dtype']) # Make a kernel. code = Template(""" if (_in_local && _in_global) { x(0,0,0) = s(0) * x(0,0,0); z += a * x(0,0,0); } """).render() fun = Kernel(code, \ ('a', 'number', case['dtype']), \ ('x', 'grid', x.dtype), \ ('s', 'const', s.dtype, s.data.size), \ ('z', 'out', z.dtype), \ padding=(1,1,1,1)) # Execute and check the result. fun(case['dtype'](2), x, s, z) gpu_sum = z.get() cpu_sum = np.sum(2.0 * s_np * x_np) err = abs(gpu_sum - cpu_sum) / abs(cpu_sum) # print case, err if case['dtype'] in (np.float32, np.complex64): self.assertTrue(err < 1e-2, (case, err)) else: self.assertTrue(err < 1e-6, (case, err))
def sync(self, key):
    """Synchronize the map/reduce file lists or the configuration
    (selected by `key`) with all worker nodes.
    """
    tagDict = { 'Map': (UPDATE_MAP, self.mapIn),
                'Reduce': (UPDATE_REDUCE, self.reduceIn),
                'Config': (UPDATE_CONFIG, self.config) }
    for ii in range(1, len(self.workers)+1):
        world.send(tagDict[key][1], dest=ii, tag=tagDict[key][0])
def wait(self): if self.rank == 0: raise RuntimeError("Proc 0 cannot wait!") status = MPI.Status() while True: task = COMM_WORLD.recv(source=0, tag=MPI.ANY_TAG, status=status) if not task: break if isinstance(task, FunctionType): self.f = task continue result = self.f(task) COMM_WORLD.isend(result, dest=0, tag=status.tag)
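# Usage sketch for the task-farm formed by map()/wait() above (assumptions:
# these methods belong to a Pool-like class, as used elsewhere in this
# collection, whose constructor records COMM_WORLD.rank as self.rank,
# COMM_WORLD.size as self.P and sets self.f = None; its __del__ sends the
# False sentinel to the workers). Rank 0 farms tasks out, all other ranks sit
# in wait() running whatever function they receive. Note the sentinel check
# `if not task` means tasks must be truthy, so 0 is avoided below.
# Run with e.g. `mpiexec -n 4 python script.py`.
from mpi4py.MPI import COMM_WORLD

def square(x):
    return x * x

if __name__ == '__main__':
    pool = Pool()                            # hypothetical wrapper class
    results = pool.map(square, list(range(1, 9)))
    if COMM_WORLD.rank == 0:
        print(results)                       # [1, 4, 9, ..., 64]; workers get None
    del pool                                 # __del__ sends the termination sentinel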
def update(self, tag): """Update file list and global configurations """ atimer = Timer('Worker_Update') if tag == UPDATE_MAP: self.mapIn = world.recv(source=0, tag=tag) elif tag == UPDATE_REDUCE: self.reduceIn = world.recv(source=0, tag=tag) self.reduceOut = [ os.path.splitext(file)[0]+'.red' for file in self.reduceIn ] elif tag == UPDATE_CONFIG: self.config = world.recv(source=0, tag=tag) else: raise ValueError('Wrong tag specified.')
def get_cpu_raw(cpu_data, k): # Make sure overlapped data is accurate as well. xr = space.get_space_info()['x_range'] if comm.Get_rank() == 0: pad_back = cpu_data[-k:, :, :] else: pad_back = cpu_data[xr[0] - k:xr[0], :, :] if comm.Get_rank() == comm.Get_size() - 1: pad_front = cpu_data[:k, :, :] else: pad_front = cpu_data[xr[1]:xr[1] + k, :, :] return np.concatenate((pad_back, cpu_data[xr[0]:xr[1],:,:], \ pad_front), axis=0)
def wait(self, running, tag): """Test if any worker has finished its job. If so, decrease its key and make it available """ atimer = Timer('Wait') inittime = time() status = MPI.Status() while time() - inittime < self.config['jobwait']: if world.Iprobe(source=MPI.ANY_SOURCE,tag=tag,status=status): jobf = world.recv(source=status.source, tag=tag) idx = 0 for ii, worker in enumerate(self.workers): if worker.id == status.source: idx = ii; break if self.config['verbosity'] >= 8: print('Freeing worker '+str(self.workers[idx].id)) worker = self.workers[idx] # faulty worker's job has already been cleaned if not worker.isFaulty(): del running[jobf] else: self.nActive += 1 worker.setFree() heapq._siftup(self.workers, idx)
def _init_gpu(comm): """ Chooses a gpu and creates a context on it. """ # Find out how many GPUs are available to us on this node. driver.init() num_gpus = driver.Device.count() # Figure out the names of the other hosts. rank = comm.Get_rank() # Find out which process I am. name = MPI.Get_processor_name() # The name of my node. hosts = comm.allgather(name) # Get the names of all the other hosts # Find out which GPU to take (by precedence). gpu_id = hosts[0:rank].count(name) if gpu_id >= num_gpus: raise TypeError("No GPU available.") # Create a context on the appropriate device. for k in range(num_gpus): try: device = driver.Device((gpu_id + k) % num_gpus) context = device.make_context() except: continue else: # print "On %s: process %d taking gpu %d of %d.\n" % \ # (name, rank, gpu_id+k, num_gpus) break return device, context # Return device and context.
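# Usage sketch for _init_gpu (assumptions: mpi4py and pycuda are installed and
# one MPI process is launched per GPU slot on each node). The returned CUDA
# context should be popped when the process is done with the device.
from mpi4py import MPI

comm = MPI.COMM_WORLD
device, context = _init_gpu(comm)   # one context per MPI process
try:
    pass    # ... allocate GPU arrays and launch kernels here ...
finally:
    context.pop()                   # release the CUDA context before exiting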
def test_batch_sum(self): """ Make sure batch summing works. """ num_outs = 3 for case in self.cases: space.initialize_space(case['shape']) x = [Out(case['dtype'], op='sum') for k in range(num_outs)] x_cpu_data = [np.random.randn(*case['shape'][1:])\ .astype(case['dtype']) for k in range(num_outs)] if case['dtype'] in (np.complex64, np.complex128): for k in range(num_outs): x_cpu_data[k] = (1 + 1j) * x_cpu_data[k] res_gold = [] for k in range(num_outs): x[k].data.set(x_cpu_data[k]) res_gold.append(comm.allreduce(np.sum(x_cpu_data[k].flatten()))) batch_reduce(*x) res_gpu = [x_indiv.get() for x_indiv in x] for k in range(num_outs): err = abs(res_gold[k] - res_gpu[k]) / abs(res_gold[k]) if case['dtype'] in (np.float32, np.complex64): self.assertTrue(err < 1e-3) else: self.assertTrue(err < 1e-10)
def get(self):
    """ Redefined so that we don't get overlap data. """
    # Get our section of the grid (excluding overlap).
    if self._xlap == 0:
        data = self.data.get()
    else:
        data = self.data.get()[self._xlap:-self._xlap,:,:]

    # return np.concatenate(comm.allgather(data), axis=0) # Super-simple.
    result = comm.gather(data) # Gather all pieces to root.
    if comm.Get_rank() == 0: # Root node glues everything together.
        return np.concatenate(result, axis=0)
    else:
        return None
def debug(*s): import sys from mpi4py.MPI import COMM_WORLD print('[rank:{}]'.format(COMM_WORLD.Get_rank()), *s, file=sys.stderr, flush=True)
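# Usage sketch: every rank tags its own stderr output, and flushing keeps
# lines from different ranks from being buffered and jumbled together.
debug('starting step', 3, 'dt =', 0.01)   # e.g. "[rank:2] starting step 3 dt = 0.01"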
def run(self): """Receiving job instructions from the master node until TERMINATE signal received. Allowed tasks are defined in taskDict """ atimer = Timer('Worker') # tasks define signal-behavior in the run function taskDict = { MAP_START: self.map, REDUCE_START: self.reduce,\ INIT_START: self.map,\ UPDATE_MAP: self.update, UPDATE_REDUCE: self.update,\ UPDATE_CONFIG: self.update } status = MPI.Status() while True: # ping input if not world.Iprobe(source=0, tag=MPI.ANY_TAG, status=status): sleep(self.config['delay']); # entire calculation finished elif status.tag == TERMINATE: term = world.recv(source=0, tag=TERMINATE); break # check allowed tasks elif status.tag in taskDict: taskDict[status.tag](status.tag); # no instruction found, looping else: sleep(self.config['delay'])
def main(): args = parse_args() assert args.pretrained_model_path is None or args.pretrained_model_path.endswith( ".ckpt") os.makedirs(args.save_dir, exist_ok=True) save_args(args, args.save_dir) set_seed(args.seed + COMM_WORLD.Get_rank() * 100) nprocs = COMM_WORLD.Get_size() # Initialize model and agent policy aurora = Aurora(args.seed + COMM_WORLD.Get_rank() * 100, args.save_dir, int(7200 / nprocs), args.pretrained_model_path, tensorboard_log=args.tensorboard_log) # training_traces, validation_traces, training_traces = [] val_traces = [] if args.train_trace_file: with open(args.train_trace_file, 'r') as f: for line in f: line = line.strip() training_traces.append(Trace.load_from_file(line)) if args.val_trace_file: with open(args.val_trace_file, 'r') as f: for line in f: line = line.strip() if args.dataset == 'pantheon': queue = 100 # dummy value # if "ethernet" in line: # queue = 500 # elif "cellular" in line: # queue = 50 # else: # queue = 100 val_traces.append(Trace.load_from_pantheon_file( line, queue=queue, loss=0)) elif args.dataset == 'synthetic': val_traces.append(Trace.load_from_file(line)) else: raise ValueError aurora.train(args.randomization_range_file, args.total_timesteps, tot_trace_cnt=args.total_trace_count, tb_log_name=args.exp_name, validation_flag=args.validation, training_traces=training_traces, validation_traces=val_traces, real_trace_prob=args.real_trace_prob)
def main(argv=None): args = process_command_line(argv) # note that in MPI mode, lengths will be global, whereas data will # be local (i.e. only this node's data). lengths, data = load_trjs_or_features(args) kwargs = {} if args.cluster_iterations is not None: kwargs['kmedoids_updates'] = int(args.cluster_iterations) clustering = args.Clusterer(metric=args.cluster_distance, n_clusters=args.cluster_number, cluster_radius=args.cluster_radius, mpi_mode=mpi_mode, **kwargs) clustering.fit(data) # release the RAM held by the trajectories (we don't need it anymore) del data logger.info("Clustered %s frames into %s clusters in %s seconds.", sum(lengths), len(clustering.centers_), clustering.runtime_) result = clustering.result_ if mpi_mode: local_ctr_inds, local_dists, local_assigs = \ result.center_indices, result.distances, result.assignments with timed("Reassembled dist and assign arrays in %.2f sec", logging.info): all_dists = mpi.ops.assemble_striped_ragged_array( local_dists, lengths) all_assigs = mpi.ops.assemble_striped_ragged_array( local_assigs, lengths) ctr_inds = mpi.ops.convert_local_indices(local_ctr_inds, lengths) result = ClusterResult(center_indices=ctr_inds, distances=all_dists, assignments=all_assigs, centers=result.centers) result = result.partition(lengths) if mpi.rank() == 0: with timed("Wrote center indices in %.2f sec.", logger.info): write_centers_indices(args.center_indices, [(t, f * args.subsample) for t, f in result.center_indices]) with timed("Wrote center structures in %.2f sec.", logger.info): write_centers(result, args) write_assignments_and_distances_with_reassign(result, args) mpi.comm.barrier() logger.info("Success! Data can be found in %s.", os.path.dirname(args.distances)) return 0
def RandomSize(N_lower, N_upper): if COMM_WORLD.rank == 0: size = randint(N_lower, N_upper) else: size = None size = COMM_WORLD.bcast(size, root=0) assert size is not None return size
def RandomNumber(): if COMM_WORLD.rank == 0: number = _rand(1)[0] else: number = None number = COMM_WORLD.bcast(number, root=0) assert number is not None return number
def RandomTuple(Q): if COMM_WORLD.rank == 0: tuple_ = tuple(float(v) for v in _rand(Q)) else: tuple_ = None tuple_ = COMM_WORLD.bcast(tuple_, root=0) assert tuple_ is not None return tuple_
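# Usage sketch for the three helpers above (assumption: `randint` and `_rand`
# are the module's random-number sources, e.g. from numpy.random). Only rank 0
# draws values and broadcasts them, so every rank sees identical "random"
# data, which keeps randomized MPI tests consistent across ranks.
from mpi4py.MPI import COMM_WORLD

n = RandomSize(4, 16)    # same integer on every rank
x = RandomNumber()       # same scalar on every rank
t = RandomTuple(3)       # same 3-tuple on every rank
assert n == COMM_WORLD.bcast(n, root=0)   # every rank agrees with rank 0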
def simulate(name, check_success_only=False): """ Read simulation from input file, simulate, and write out results. """ # Reset the environment variables pointing to the temporary directory. tempfile.tempdir = '/tmp' # Create the reporter function. write_status = lambda msg: open(name + '.status', 'a').write(msg) if comm.Get_rank() == 0: # write_status('EXEC initializing\n') def rep(err): write_status('%e\n' % err) else: # No reporting needed for non-root nodes. def rep(err): pass # Get input parameters. params = get_parameters(name) # Define operations needed for the lumped bicg operation. b, x, ops, post_cond = maxwell_ops_lumped.ops(params) # Solve! start_time = time.time() rep.stime = start_time x, err, success = bicg.solve_symm_lumped(b, x=x, \ max_iters=params['max_iters'], \ reporter=rep, \ err_thresh=params['err_thresh'], \ **ops) if check_success_only: # Don't write output, just see if we got a success. return success # Gather results onto root's host memory. result = { 'E': [E.get() for E in x], \ 'err': err, \ 'success': success} # Write results to output file. if comm.Get_rank() == 0: result['E'] = post_cond(result['E']) # Apply postconditioner. write_results(name, result) return success
def array(self, N=0, filename=None, component=None, root=0): """Dump data to numpy format on root processor.""" assert(N == 0 or N == 1) is_root = comm.Get_rank() == root size = self.get_total_number_probes() if is_root else len(self) comp = self.value_size() if component is None else 1 z = zeros((size, comp)) # Retrieve all values if len(self) > 0: for k in range(comp): if is_root: ids = self.get_probe_ids() z[ids, k] = self.get_probes_component_and_snapshot(k, N) else: z[:, k] = self.get_probes_component_and_snapshot(k, N) # Collect on root recvfrom = comm.gather(len(self), root=root) if is_root: for j, k in enumerate(recvfrom): if comm.Get_rank() != j: ids = comm.recv(source=j, tag=101) z0 = comm.recv(source=j, tag=102) z[ids, :] = z0[:, :] else: ids = self.get_probe_ids() comm.send(ids, dest=root, tag=101) comm.send(z, dest=root, tag=102) if is_root: if filename: z.dump(filename+"_statistics.probes") return squeeze(z)
def simulate(N, D, S, G, dt): x0, v0, m = initial_cond(N, D) pool = Pool() if COMM_WORLD.Get_rank() == 0: for s in range(S): x1, v1 = timestep(x0, v0, G, m, dt, pool) x0, v0 = x1, v1 else: pool.wait()
def array(self, N=None, filename=None, component=None, root=0): """Dump data to numpy format on root processor for all or one snapshot.""" is_root = comm.Get_rank() == root size = self.get_total_number_probes() if is_root else len(self) comp = self.value_size() if component is None else 1 if not N is None: z = zeros((size, comp)) else: z = zeros((size, comp, self.number_of_evaluations())) # Get all values if len(self) > 0: if not N is None: for k in range(comp): if is_root: ids = self.get_probe_ids() z[ids, k] = self.get_probes_component_and_snapshot(k, N) else: z[:, k] = self.get_probes_component_and_snapshot(k, N) else: for i, (index, probe) in enumerate(self): j = index if is_root else i if not N is None: z[j, :] = probe.get_probe_at_snapshot(N) else: for k in range(self.value_size()): z[j, k, :] = probe.get_probe_sub(k) # Collect values on root recvfrom = comm.gather(len(self), root=root) if is_root: for j, k in enumerate(recvfrom): if comm.Get_rank() != j: ids = comm.recv(source=j, tag=101) z0 = comm.recv(source=j, tag=102) z[ids, :] = z0[:, :] else: ids = self.get_probe_ids() comm.send(ids, dest=root, tag=101) comm.send(z, dest=root, tag=102) if is_root: if filename: if not N is None: save(filename + "_snapshot_" + str(N), z) else: save(filename + "_all", z) return squeeze(z)
def exchange_guard_cells( physical_F_left, physical_F_right ): # MPI exchanges of guard cells # Send physical cell to left proc req1 = mpi_comm.isend( physical_F_left, dest=(mpi_comm.rank-1)%mpi_comm.size ) # Send physical cell to right proc req2 = mpi_comm.isend( physical_F_right, dest=(mpi_comm.rank+1)%mpi_comm.size ) # Receive value from right proc req3 = mpi_comm.irecv( source=(mpi_comm.rank+1)%mpi_comm.size ) # Receive value from left proc req4 = mpi_comm.irecv( source=(mpi_comm.rank-1)%mpi_comm.size ) # Wait for the processors to finish sending/receiving req1.wait() req2.wait() F_from_right = req3.wait() F_from_left = req4.wait() return F_from_left, F_from_right
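# Usage sketch for exchange_guard_cells (assumption: `mpi_comm` is the
# module-level mpi4py communicator, e.g. MPI.COMM_WORLD, already used by the
# function above). Each rank keeps one guard cell at each end of its local
# field; after the exchange the guard cells mirror the neighbors' outermost
# physical cells, with periodic wrap-around at the ends of the domain.
import numpy as np

F = np.zeros(10)             # local field; F[0] and F[-1] are guard cells
F[1:-1] = mpi_comm.rank      # physical cells hold something rank-dependent

F_from_left, F_from_right = exchange_guard_cells(F[1], F[-2])
F[0] = F_from_left           # guard cell now holds the left neighbor's edge value
F[-1] = F_from_right         # guard cell now holds the right neighbor's edge value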
def all_reduce(tensor: torch.Tensor, op=ReduceOp.SUM, comm: MPI.Comm = None) -> torch.Tensor:
    # Reduce via numpy buffers: mpi4py's uppercase Allreduce operates on
    # contiguous buffers, so go tensor -> numpy -> tensor.
    param_numpy = tensor.numpy()
    param_output = np.empty(param_numpy.shape, dtype=param_numpy.dtype)
    if comm is None:
        comm = _get_comm()
    comm.Allreduce(param_numpy, param_output, op=op.value)
    tensor = torch.from_numpy(param_output)
    return tensor
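# Usage sketch for the all_reduce wrapper above (assumptions: every rank holds
# a CPU tensor of identical shape and dtype, and ReduceOp is the module's enum
# whose .value is an MPI operation). Summing and dividing by the world size
# gives the cross-rank average, e.g. for gradient averaging.
import torch
from mpi4py import MPI

comm = MPI.COMM_WORLD
grad = torch.ones(4) * comm.Get_rank()              # rank-dependent dummy gradient
grad = all_reduce(grad, comm=comm) / comm.Get_size()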
def _build_cat_distributed(comm, name, path): # Control flow explanation: # * `build_err` starts out as `None` # * Rank 1 to N wait for a broadcast from rank 0 to receive the new value # for `build_err` # * Rank 0 splits off from the others and executes the build. # * If it builds correctly it finishes the collective `build_err` # broadcast with the initial value `None`: all nodes continue. # * If it errors, it finishes the collective broadcast with the caught err # # All MPI ranks either continue or raise the same err. (prevents stalling) build_err = None if not comm.Get_rank(): try: _build_cat_local(name, path) except Exception as e: build_err = e build_err = comm.bcast(build_err, root=0) if build_err: raise build_err
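# Usage sketch for _build_cat_distributed (assumptions: `_build_cat_local` is
# the project's serial build routine, and the name/path arguments below are
# placeholders). Every rank calls it collectively; only rank 0 builds, and any
# failure is re-raised on every rank so no rank is left waiting in the bcast.
from mpi4py import MPI

try:
    _build_cat_distributed(MPI.COMM_WORLD, 'mechanism', '/tmp/build')
except Exception as exc:
    # All ranks see the same error, so aborting here stays collective.
    print('build failed:', exc)
    MPI.COMM_WORLD.Abort(1)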
def test_partition(self): """ Make sure the x_ranges span the entire space without any gaps. """ shapes = ((200,30,10), (33,10,10), (130,5,5), (111,2,2)) for shape in shapes: space.initialize_space(shape) x = comm.gather(space.get_space_info()['x_range']) if comm.Get_rank() == 0: self.assertEqual(x[0][0], 0) self.assertEqual(x[-1][-1], space.get_space_info()['shape'][0]) for k in range(len(x)-1): self.assertEqual(x[k][1], x[k+1][0])
def __init__(self): """Obtain configurations and all filelists from the master node """ assert world.rank >= 1 # synchronize configuration file and file lists with the master node self.config = {} self.mapIn = [] self.reduceIn = [] self.reduceOut = [] self.config = world.bcast(self.config, root=0)
def run(self): """Running sequence of the master node """ atimer = Timer('Master') while self.iterCtrl(): # map phase if self.init: self.execTask('Init') self.init = False else: self.execTask('Map') # reduce phase self.execTask('Reduce') # terminate all workers for ii in range(1, len(self.workers)+1): world.send(True, dest=ii, tag=TERMINATE) # final output, serial execution only self.finalize()
def test_recover(self): """ Make sure we can store and retrieve information from the GPU. """ for case in self.cases: space.initialize_space(case['shape']) data = np.random.randn(*case['shape']).astype(case['dtype']) cpu_data = np.empty_like(data) comm.Allreduce(data, cpu_data) g = Grid(cpu_data) gpu_data = g.get() if comm.Get_rank() == 0: self.assertTrue((cpu_data == gpu_data).all()) # Test with-overlap cases as well. for k in range(1, 3): g = Grid(cpu_data, x_overlap=k) gpu_data = g.get() if comm.Get_rank() == 0: self.assertTrue((cpu_data == gpu_data).all()) cpu_raw = get_cpu_raw(cpu_data, k) self.assertTrue((cpu_raw == g._get_raw()).all())
def plot_fields( self, save_figure=False ): """ Plot the Ex and By field using matplotlib If save_figure is True, the plots are saved as PNG files, in a folder named `diagnostics` """ # PLOTTING: NEW LINES RELATED TO MPI Ex_from_all_procs = mpi_comm.gather( self.Ex[1:-1] ) By_from_all_procs = mpi_comm.gather( self.By[1:-1] ) if mpi_comm.rank == 0: print( 'Plotting the fields at iteration %d' %self.n ) global_Ex = np.concatenate( Ex_from_all_procs ) global_By = np.concatenate( By_from_all_procs ) plt.clf() plt.suptitle('Fields at iteration %d' %self.n) # Plot of Ex plt.subplot(211) z = self.dz*np.arange( self.Nz_global ) plt.plot( z, global_Ex, '-' ) plt.ylim(-1.1, 1.1) plt.xlim(0, self.Lz) plt.ylabel('$E_x^n$') plt.xlabel('z') # Plot of By plt.subplot(212) z = self.dz*np.arange( self.Nz_global ) + 0.5*self.dz plt.plot( z, global_By, '-' ) plt.ylim(-1.1/c, 1.1/c) plt.xlim(0, self.Lz) plt.ylabel('$B_y^{n-1/2}$') plt.xlabel('z') if save_figure is True: # Check that the diagnostics folder exists if os.path.exists('diagnostics') is False: os.mkdir('diagnostics') plt.savefig( "diagnostics/iteration%03d.png" %self.n)
def all_reduce(tensor: EagerTensor, op=ReduceOp.SUM, comm: MPI.Comm = None) -> EagerTensor:
    # Flatten to a contiguous 1D numpy buffer for Allreduce, then restore the shape.
    param_numpy = tensor.numpy()
    original_shape = param_numpy.shape
    param_numpy_flatten = param_numpy.flatten()
    param_output = np.empty(param_numpy_flatten.shape, dtype=param_numpy.dtype)
    if comm is None:
        comm = _get_comm()
    comm.Allreduce(param_numpy_flatten, param_output, op=op.value)
    param_output = np.reshape(param_output, original_shape)
    tensor = tutils.to_tensor(param_output)
    return tensor
def test_simple_kernel(self): """ Implement a simple kernel. """ for case in self.cases: # Form data to work on. space.initialize_space(case['shape']) x_np = comm.allreduce(np.random.randn(*case['shape']).astype(case['dtype'])) x = Grid(x_np, x_overlap=2) s_np = comm.allreduce(np.random.randn(case['shape'][0],1,1).astype(case['dtype'])) s = Const(s_np) z = Out(case['dtype']) # Make a kernel. code = Template(""" if (_in_local && _in_global) { z += a * s(_X) * x(0,0,0); // z += a * x(0,0,0); } """).render() fun = Kernel(code, \ ('a', 'number', case['dtype']), \ ('x', 'grid', x.dtype), \ ('s', 'const', s.dtype), \ ('z', 'out', z.dtype), \ shape_filter='all') # Execute and check the result. # fun() while fun.exec_configs: # for k in range(40): fun(case['dtype'](2.0), x, s, z) # fun(case['dtype'](2.0), x, z) gpu_sum = z.get() cpu_sum = np.sum(2 * s_np * x_np) # cpu_sum = np.sum(2 * x_np) err = abs(gpu_sum - cpu_sum) / abs(cpu_sum) if case['dtype'] in (np.float32, np.complex64): self.assertTrue(err < 1e-2, (case, err)) else: self.assertTrue(err < 1e-6, (case, err))
def plot(x, psi, psi_0, nt, v): ### rcParams["figure.figsize"] = [8 / mpi.Get_size(), 5] ### pyplot.step(x, psi_0(x), label='initial', where='mid') pyplot.step(x, psi_0(x - v * nt), label='analytical', where='mid') pyplot.step(x, psi, label='numerical', where='mid') pyplot.grid() pyplot.gca().set_ylim([0, 12]) pyplot.legend() ### # pyplot.savefig("out.svg") pyplot.savefig(f"out.{mpi.Get_rank()}.svg")
def __init__(self, config): """Read in user created config module and initialize the master node """ # set default values and update according to user input (config) # NOTE: input files should be prepared in the user module # (e.g. split the BIG file into smaller chunks using 'split') # each file is fed into a mapper, supposing it can fit into mapper's memory assert hasattr(config, 'mapfn') and hasattr(config, 'reducefn') self.config = {'nReduce':1, 'nMap':1, 'maxLoop':1, 'appendReduce':True,\ 'scratchFolder':'./', 'readPickle':False, 'writePickle':False,\ 'verbosity':6, 'timeout':60, 'delay':0.2, 'jobwait':1,\ 'mapfn':config.mapfn, 'reducefn':config.reducefn, 'ctrlfn':None,\ 'finalfn':None, 'readfn':None, 'hashfn':hash } if world.size == 1: raise AttributeError('Parallel mode only! At least one worker node is required.') # number of mapping tasks by default equals number of initial files # it can be overidden by user input assert isinstance(config.initFiles, list) self.config['nMap'] = len(config.initFiles) self.initFiles = config.initFiles # read in user defined configurations for key, val in self.config.items(): if hasattr(config, key): self.config[key] = getattr(config, key) # sync config with all nodes self.config = world.bcast(self.config, root=0) # setup workers into a priority queue self.workers = [ State(ii) for ii in range(1, world.size) ] heapq.heapify(self.workers) self.nActive = world.size - 1 # assign map / reduce / finalize file list tmpList = [ config.__name__+'_'+str(ii).zfill(len(str(self.config['nMap'])))\ +'.map' for ii in range(1, self.config['nMap']+1) ] self.mapIn = [ os.path.join(self.config['scratchFolder'], file) for file in tmpList ] tmpList = [ config.__name__+'_'+str(ii).zfill(len(str(self.config['nReduce'])))\ +'.int' for ii in range(1, self.config['nReduce']+1) ] self.reduceIn = [ os.path.join(self.config['scratchFolder'], file) for file in tmpList ] self.reduceOut = [ os.path.splitext(file)[0]+'.red' for file in self.reduceIn ] # Currently only support single output file self.finalOut = [ config.__name__+'.out' ] # count number of iterations self.nLoop = 0; self.init = True
def execTask(self, task): """Wrapper function calling mapping/reducing/finalizing phase tasks, dispatch tasks to workers until all finished and collect feedback. Faulty workers are removed from active duty work list. """ atimer = Timer(task) print('Entering {0:s} phase...'.format(task)) taskDict = { 'Map':(self.mapIn, MAP_START, MAP_FINISH), \ 'Init':(self.mapIn, INIT_START, MAP_FINISH), \ 'Reduce':(self.reduceIn, REDUCE_START, REDUCE_FINISH) } # line up jobs and workers into priority queues jobs = taskDict[task][0][:] heapq.heapify(jobs) running = {} heapq.heapify(self.workers) while (jobs or running) and self.nActive > 0: # dispatch all jobs to all free workers while jobs and self.workers[0].isFree(): job = heapq.heappop(jobs) worker = heapq.heappop(self.workers) world.send(job, dest=worker.id, tag=taskDict[task][1]) worker.setBusy() heapq.heappush(self.workers, worker) running[job] = (time(), worker) if self.config['verbosity'] >= 6: print('Dispatching file ' + os.path.basename(job) + ' to worker ' + str(worker.id)) # if no more free workers, break if not self.workers[0].isFree(): break # wait for finishing workers as well as do cleaning self.wait(running, taskDict[task][2]) self.clean(running, jobs) print('{0:s} phase completed'.format(task))
def reduce(self, tag):
    """Use supplied reducefn to operate on a list of values from a given key,
    generated by self.map()
    """
    atimer = Timer('Worker_Reduce')
    filename = world.recv(source=0, tag=tag)
    files = glob.glob(filename + '-tmp*')
    dataList = []
    for file in files:
        with open(file, 'rb') as fin:
            try:
                while True:
                    dataList.extend(pickle.load(fin))
            except EOFError:
                # read in every instance of pickle dump
                pass
    data = {}
    for key, val in dataList:
        if key in data:
            data[key].append(val)
        else:
            data[key] = [val]
    results = []
    for key, values in data.items():
        results.append((key, self.config['reducefn'](key, values)))
    results.sort(key=itemgetter(0))
    # write out in dictionary format; pickle requires a binary-mode file
    idx = self.reduceIn.index(filename)
    if self.config['appendReduce']:
        with open(self.reduceOut[idx], 'a+b') as fout:
            pickle.dump(dict(results), fout, pickle.HIGHEST_PROTOCOL)
    else:
        with open(self.reduceOut[idx], 'w+b') as fout:
            pickle.dump(dict(results), fout, pickle.HIGHEST_PROTOCOL)
    world.send(filename, dest=0, tag=REDUCE_FINISH)
def process_dir(indir, outdir):
    main_text_files = glob.glob("{0}/main/*.txt".format(indir))
    rank = world.Get_rank()
    size = world.Get_size()
    # Skip tiles whose outputs already exist.
    main_text_files_2 = []
    for m in main_text_files:
        tilename = find_tilename(m)
        out_main = "{0}/main/{1}.fits".format(outdir, tilename)
        out_epoch = "{0}/epoch/{1}.fits".format(outdir, tilename)
        if not (os.path.exists(out_main) and os.path.exists(out_epoch)):
            main_text_files_2.append(m)
    main_text_files = main_text_files_2
    print("{0} files left to do".format(len(main_text_files)))
    for i, main_text_file in enumerate(main_text_files):
        # Static round-robin division of the remaining files across MPI ranks.
        if i % size != rank:
            continue
        print(rank, main_text_file)
        tilename = find_tilename(main_text_file)
        epoch_text_file = "{0}/epoch/{1}.epoch.txt".format(indir, tilename)
        out_main = "{0}/main/{1}.fits".format(outdir, tilename)
        out_epoch = "{0}/epoch/{1}.fits".format(outdir, tilename)
        if os.path.exists(out_main) and os.path.exists(out_epoch):
            continue
        try:
            process_text(main_text_file, epoch_text_file, out_main, out_epoch,
                         "r", blind=False, quiet=False, report=report)
        except Exception:
            print("{} did not work".format(out_main))
    if report:
        return
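# Minimal sketch of the static round-robin division used above: rank r takes
# items r, r+size, r+2*size, ... No communication is needed because every rank
# derives its share from the common item list and its own rank.
from mpi4py.MPI import COMM_WORLD as world

items = ['tile_%03d' % i for i in range(10)]    # hypothetical work items
rank, size = world.Get_rank(), world.Get_size()
for i, item in enumerate(items):
    if i % size != rank:
        continue
    print('rank %d handles %s' % (rank, item))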
def test_synchronize(self): """ Make sure that we can make the overlap spaces accurate. """ for case in self.cases: space.initialize_space(case['shape']) data = np.random.randn(*case['shape']).astype(case['dtype']) cpu_data = np.empty_like(data) comm.Allreduce(data, cpu_data) g = Grid(case['dtype']) self.assertRaises(TypeError, g.synchronize) # No overlap. # Test with-overlap cases as well. for k in range(1, 4): g = Grid(case['dtype'], x_overlap=k) # Overwrite entire grid data = np.random.randn(*case['shape']).astype(case['dtype']) cpu_data = np.empty_like(data) comm.Allreduce(data, cpu_data) cpu_raw_bad = get_cpu_raw(cpu_data, k) cpu_raw_bad[:k, :, :] += 1 # Mess up padding areas. cpu_raw_bad[-k:, :, :] += 1 drv.memcpy_htod(g.data.ptr, cpu_raw_bad) # Prove that the data is not synchronized at this time. cpu_raw = get_cpu_raw(cpu_data, k) xx = case['shape'][0] gd = g._get_raw() self.assertTrue((gd[:k, :, :] != cpu_raw[:k, :, :]).all()) self.assertTrue((gd[-k:, :, :] != cpu_raw[-k:, :, :]).all()) g.synchronize() # Synchronize the overlapping data. # Make sure that the overlap data is accurate. gd = g._get_raw() self.assertTrue((gd[:k, :, :] == cpu_raw[:k, :, :]).all()) self.assertTrue((gd[-k:, :, :] == cpu_raw[-k:, :, :]).all()) comm.Barrier() # Wait for other mpi nodes to finish.
def main(nt, nx, dt, C, x_min, x_max):
    dx = (x_max - x_min) / nx

    ###
    size = mpi.Get_size()
    rank = mpi.Get_rank()
    # for nx=5 and size=3: splitting 2+2+1 is better than 1+1+3
    import math
    nx_max = math.ceil(nx / size)
    nx = nx_max if (rank + 1) * nx_max <= nx else nx - rank * nx_max
    assert nx > 0
    x_min += dx * nx_max * rank
    x_max = min(x_max, x_min + dx * nx_max)
    #print(rank, '/', size, ':', nx, x_min, x_max)
    ###

    x = np.linspace(x_min - halo * dx, x_max + halo * dx,
                    num=nx + 2 * halo, endpoint=False)
    psi = calc(psi_0(x), nt, C)
    plot(x[halo:-halo], psi[halo:-halo], psi_0, nt, v=C / dt * dx)
def array(self, N=None, filename=None, component=None, root=0): """Dump data to numpy format on root processor for all or one snapshot.""" is_root = comm.Get_rank() == root size = self.get_total_number_probes() if is_root else len(self) comp = self.value_size() if component is None else 1 if not N is None: z = zeros((size, comp)) else: z = zeros((size, comp, self.number_of_evaluations())) # Get all values if len(self) > 0: if not N is None: for k in range(comp): if is_root: ids = self.get_probe_ids() z[ids, k] = self.get_probes_component_and_snapshot(k, N) else: z[:, k] = self.get_probes_component_and_snapshot(k, N) else: for i, (index, probe) in enumerate(self): j = index if is_root else i if not N is None: z[j, :] = probe.get_probe_at_snapshot(N) else: for k in range(self.value_size()): z[j, k, :] = probe.get_probe_sub(k) # Collect values on root recvfrom = comm.gather(len(self), root=root) if is_root: for j, k in enumerate(recvfrom): if comm.Get_rank() != j: ids = comm.recv(source=j, tag=101) z0 = comm.recv(source=j, tag=102) z[ids, :] = z0[:, :] else: ids = self.get_probe_ids() comm.send(ids, dest=root, tag=101) comm.send(z, dest=root, tag=102) if is_root: if filename: if not N is None: z.dump(filename+"_snapshot_"+str(N)+".probes") else: z.dump(filename+"_all.probes") return squeeze(z)
def __init__(self, N=(256, ), x0=(0.0, ), x1=(1.0, )):
    assert len(N) == len(x0) == len(x1)
    try:
        from mpi4py.MPI import COMM_WORLD, Compute_dims
    except ImportError:
        print("Error! DistributedDomain requires the mpi4py package.")
        exit()

    mpi_sizes = Compute_dims(COMM_WORLD.size, len(N))
    cart = COMM_WORLD.Create_cart(mpi_sizes, periods=[True for n in N])
    mpi_coord = cart.Get_coords(COMM_WORLD.rank)

    global_shape = [n for n in N]
    global_start = [0 for n in N]
    local_shape = []
    X0, X1 = [], []
    dx = [float(l1 - l0) / n for n, l0, l1 in zip(N, x0, x1)]

    for i in range(len(N)):
        R = N[i] % mpi_sizes[i]
        normal_size = N[i] // mpi_sizes[i]   # integer division: cell counts
        augmnt_size = normal_size + 1
        thisdm_size = augmnt_size if mpi_coord[i] < R else normal_size

        for j in range(mpi_coord[i]):
            global_start[i] += augmnt_size if j < R else normal_size

        local_shape.append(thisdm_size)
        X0.append(x0[i] + dx[i] * global_start[i])
        X1.append(x0[i] + dx[i] * (global_start[i] + thisdm_size))

    self.N = local_shape
    self.dx = dx
    self.x0 = X0
    self.x1 = X1
    self.cart = cart
    self.rank = COMM_WORLD.rank
    self.mpi_coord = mpi_coord
    self.mpi_sizes = mpi_sizes
    self.global_start = global_start
    self.global_shape = global_shape
    self.is_distributed = True
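# Usage sketch for DistributedDomain (assumption: launched under MPI, e.g.
# `mpiexec -n 4 python script.py`). Each rank reports the slice of the global
# grid it owns; the local shapes and extents tile the requested domain.
dom = DistributedDomain(N=(64, 64), x0=(0.0, 0.0), x1=(1.0, 2.0))
print(dom.rank, dom.mpi_coord, dom.N, dom.x0, dom.x1)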
def test_sum(self): """ Make sure summing works. """ for case in self.cases: space.initialize_space(case['shape']) x = Out(case['dtype'], op='sum') x_cpu_data = np.random.randn(*case['shape'][1:]).astype(case['dtype']) if case['dtype'] in (np.complex64, np.complex128): x_cpu_data = (1 + 1j) * x_cpu_data x.data.set(x_cpu_data) res_gold = comm.allreduce(np.sum(x_cpu_data.flatten())) x.reduce() err = abs(res_gold - x.get()) / abs(res_gold) if case['dtype'] in (np.float32, np.complex64): self.assertTrue(err < 1e-3) else: self.assertTrue(err < 1e-10)
def set_periodic_checkpoint(sim, period): """ Set up periodic checkpoints of the simulation The checkpoints are saved in openPMD format, in the directory `./checkpoints`, with one subdirectory per process. All the field and particle information of each processor is saved. NB: Checkpoints are registered among the list of diagnostics `diags` of the Simulation object `sim`. Parameters ---------- sim: a Simulation object The simulation that is to be saved in checkpoints period: integer The number of PIC iteration between each checkpoint. """ # Only processor 0 creates a directory where checkpoints will be stored # Make sure that all processors wait until this directory is created # (Use the global MPI communicator instead of the `BoundaryCommunicator` # so that this still works in the case `use_all_ranks=False`) if comm.rank == 0: if os.path.exists('./checkpoints') is False: os.mkdir('./checkpoints') comm.Barrier() # Choose the name of the directory: one directory per processor write_dir = 'checkpoints/proc%d/' % comm.rank # Register a periodic FieldDiagnostic in the diagnostics of the simulation sim.diags.append(FieldDiagnostic(period, sim.fld, write_dir=write_dir)) # Register a periodic ParticleDiagnostic, which contains all # the particles which are present in the simulation particle_dict = {} for i in range(len(sim.ptcl)): particle_dict['species %d' % i] = sim.ptcl[i] sim.diags.append( ParticleDiagnostic(period, particle_dict, write_dir=write_dir))
maf.bcast = bcast def compare(msg, mpi_func, maf_func, count=100): mpi_times = numpy.zeros(count) maf_times = numpy.zeros(count) for i in range(count): start = timeit.default_timer() mpi_func() mpi_times[i] = timeit.default_timer() - start start = timeit.default_timer() maf_func() maf_times[i] = timeit.default_timer() - start maf.log('{:<20} {:.4e} {:.4e} {:>6.2f}'.format(msg, mpi_times.mean(), maf_times.mean(), 100*((maf_times - mpi_times) / mpi_times).mean())) compare('bcast(s)', lambda: COMM_WORLD.bcast('s'), lambda: maf.bcast('s')) compare('bcast(1)', lambda: COMM_WORLD.bcast(1), lambda: maf.bcast(1)) compare('bcast(s*100)', lambda: COMM_WORLD.bcast('s'*100), lambda: maf.bcast('s'*100)) compare('bcast(s*1000)', lambda: COMM_WORLD.bcast('s'*1000), lambda: maf.bcast('s'*1000)) compare('bcast(range(1000))', lambda: COMM_WORLD.bcast(range(1000)), lambda: maf.bcast(range(1000))) compare('bcast(range(10000))', lambda: COMM_WORLD.bcast(range(10000)), lambda: maf.bcast(range(10000)))
import pickle
import matplotlib.patches
import collections.abc
import sys
from scipy import stats
from mpi4py.MPI import COMM_WORLD as CW

def flatten(x):
    # Recursively flatten nested iterables into a single flat list.
    if isinstance(x, collections.abc.Iterable):
        return [a for i in x for a in flatten(i)]
    else:
        return [x]

rank = CW.Get_rank()
size = CW.Get_size()

two_col_width = 7.20472     # inches
single_col_width = 3.50394  # inches
page_height = 10.62472
font_size = 10

sys.stdout.flush()
CW.Barrier()

pickle_file = sys.argv[1]
true_birth_con_pickle = sys.argv[2]
plot_gradient = False
read_pickle = bool(sys.argv[3])
baseline_yr = float(sys.argv[4])
def reduce(self): """ Compute the result. """ self.result = comm.allreduce(ga.sum(self.data).get())
def batch_reduce(*outs): """ Optimal (compared to self.reduce) when communication cost is latency bound. """ results = comm.allreduce(np.array([ga.sum(out.data).get() for out in outs])) for k in range(len(outs)): outs[k].result = results[k]
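# Usage sketch contrasting per-output reduction with the batched version
# (assumptions: the GCE space has been initialized with a compatible global
# shape and each Out holds data on the GPU, as in the tests above). A single
# allreduce over one small numpy array replaces several latency-bound collectives.
import numpy as np

space.initialize_space((32, 16, 16))               # assumed global shape
outs = [Out(np.float32, op='sum') for _ in range(3)]
for out in outs:
    out.data.set(np.random.randn(16, 16).astype(np.float32))

batch_reduce(*outs)                  # one collective computes all three sums
totals = [out.get() for out in outs]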
def __init__(self, array_or_dtype, x_overlap=0):
    """ Create a spatial grid on the GPU(s).

    Input variables
    array_or_dtype -- can either be a numpy array of the same shape as
        the global space, or a numpy dtype. If a valid array is passed,
        it will be loaded on to the GPU. If a dtype is passed, then
        an array of zeros, of that dtype will be loaded onto the GPU.

    Optional variables
    x_overlap -- the number of adjacent cells in either the negative or
        positive x-direction that need to simultaneously be accessed along
        with the current cell. Must be a non-negative integer. Default
        value is 0.

    """

    shape = get_space_info()['shape'] # Get the shape of the space.
    xr = get_space_info()['x_range'] # Get the local x_range.
    all_x_ranges = get_space_info()['all_x_ranges'] # Get the x_ranges of all nodes.
    local_shape = (xr[1]-xr[0], shape[1], shape[2])

    self._set_gce_type('grid') # Set the gce type to grid.

    # Make sure overlap option is valid.
    if type(x_overlap) is not int:
        raise TypeError('x_overlap must be an integer.')
    elif x_overlap < 0:
        raise TypeError('x_overlap must be a non-negative integer.')

    if comm.rank == 0:
        # Process the array_or_dtype input variable.
        if type(array_or_dtype) is np.ndarray: # Input is an array.
            array = array_or_dtype

            # Make sure the array is of the correct shape.
            if array.shape != shape:
                raise TypeError('Shape of array does not match shape of space.')

            # Make sure the array is of a valid datatype.
            self._get_dtype(array.dtype.type)

        elif type(array_or_dtype) is type: # Input is a datatype.
            self._get_dtype(array_or_dtype) # Validate the dtype.
            array = np.zeros(shape, dtype=self.dtype) # Make a zeros array.

        else: # Invalid input.
            raise TypeError('Input variable must be a numpy array or dtype')

        # Prepare array to be scattered.
        array = [array[r[0]:r[1],:,:] for r in all_x_ranges]

    else:
        array = None

    array = comm.scatter(array)
    self._get_dtype(array.dtype.type)

    # # Narrow down the array to local x_range.
    # array = array[xr[0]:xr[1],:,:]

    # Add padding to array, if needed.
    self._xlap = x_overlap
    if self._xlap != 0:
        padding = np.empty((self._xlap,) + shape[1:3], dtype=array.dtype)
        array = np.concatenate((padding, array, padding), axis=0)

    self.to_gpu(array) # Load onto device.

    # Determine information needed for synchronization.
    if self._xlap != 0:
        # Calculates the pointer to the x offset in a grid.
        ptr_dx = lambda x_pos: self.data.ptr + self.data.dtype.itemsize * \
                                    x_pos * shape[1] * shape[2]

        # Pointers to different sections of the grid that are relevant
        # for synchronization.
        self._sync_ptrs = { 'forw_src': ptr_dx(xr[1]-xr[0]), \
                            'back_dest': ptr_dx(0), \
                            'back_src': ptr_dx(self._xlap), \
                            'forw_dest': ptr_dx(xr[1]-xr[0] + self._xlap)}

        # Buffers used during synchronization.
        self._sync_buffers = [drv.pagelocked_empty( \
                                (self._xlap, shape[1], shape[2]), \
                                self.dtype) for k in range(4)]

        # Streams used during synchronization.
        self._sync_streams = [drv.Stream() for k in range(4)]

        # Used to identify neighboring MPI nodes with whom to synchronize.
        self._sync_adj = get_space_info()['mpi_adj']

        # Offset in bytes to the true start of the grid.
        # This is used to "hide" overlap areas from the kernel.
        self._xlap_offset = self.data.dtype.itemsize * \
                            self._xlap * shape[1] * shape[2]

        self.synchronize() # Synchronize the grid.

    comm.Barrier() # Wait for all grids to synchronize before proceeding.
def __del__(self): if self.rank == 0: for p in range(1, self.P): COMM_WORLD.isend(False, dest=p)
def get_parameters(name): """ Reads the simulation parameters from the input hdf5 file. """ if comm.rank == 0: f = h5py.File(name + '.grid', 'r') files_to_delete = [name + '.grid'] omega = np.complex128(f['omega_r'][0] + 1j * f['omega_i'][0]) shape = tuple([int(s) for s in f['shape'][:]]) # bound_conds = f['bound_conds'][:] # Function used to read in a 1D complex vector fields. get_1D_fields = lambda a: [(f[a+'_'+u+'r'][:] + 1j * f[a+'_'+u+'i'][:]).\ astype(np.complex128) for u in 'xyz'] # Read in s and t vectors. s = get_1D_fields('sp') t = get_1D_fields('sd') # Read in max_iters and err_thresh. max_iters = int(f['max_iters'][0]) # max_iters = 100 err_thresh = float(f['err_thresh'][0]) f.close() # Close file. # Function used to read in 3D complex vector fields. def get_3D_fields(a): field = [] for k in range(3): key = name + '.' + a + '_' + 'xyz'[k] field.append((h5py.File(key + 'r')['data'][:] + \ 1j * h5py.File(key + 'i')['data'][:]).astype(np.complex128)) files_to_delete.append(key + 'r') files_to_delete.append(key + 'i') return field # # Read in m, e, and j fields. # for name in 'eJmE': # print comm.rank, name # params[name] = get_3D_fields(name) e = get_3D_fields('e') j = get_3D_fields('J') m = get_3D_fields('m') x = get_3D_fields('E') # Delete input files. for filename in files_to_delete: os.remove(filename) # Do some simple pre-computation. for k in range(3): m[k] = m[k]**-1 e[k] = omega**2 * e[k] j[k] = -1j * omega * j[k] params = {'omega': omega, 'shape': shape, \ 'max_iters': max_iters, 'err_thresh': err_thresh, \ 's': s, 't': t} # 'e': e, 'm': m, 'j': j, 'x': x} else: params = None params = comm.bcast(params) if comm.rank == 0: params['e'] = e params['m'] = m params['j'] = j params['x'] = x else: for field_name in 'emjx': params[field_name] = [None] * 3 return params