def createnc(comm, casename, vname, periods, units, calendar, fields,
             nx, ny, ntime, nz=None):
    from mpi4py import MPI
    from netCDF4 import Dataset

    filename = "%s_%s_%s.nc" % (casename, vname, periods)
    rootgrp = Dataset(filename, "w", parallel=True, comm=comm,
                      info=MPI.Info())

    rootgrp.createDimension("west_east", nx)
    rootgrp.createDimension("south_north", ny)
    # rootgrp.createDimension("time", None)
    rootgrp.createDimension("time", ntime)

    T = rootgrp.createVariable("time", "f4", ("time",))
    T.units = units
    T.calendar = calendar

    if nz:
        rootgrp.createDimension("bottom_top", nz)
        nc_dim = ("time", "bottom_top", "south_north", "west_east",)
    else:
        nc_dim = ("time", "south_north", "west_east",)

    for field in fields:
        rootgrp.createVariable(field, "f4", nc_dim)

    return rootgrp
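# Minimal usage sketch for createnc, assuming an MPI launch. The case name,
# field names, grid sizes and time axis below are made-up placeholders, not
# taken from the original code.
from mpi4py import MPI

comm = MPI.COMM_WORLD
# e.g. one 2D field on a hypothetical 100x80 grid with 12 output times
rootgrp = createnc(comm, "mycase", "T2", "2000-2001",
                   units="hours since 2000-01-01 00:00:00",
                   calendar="standard",
                   fields=["T2"], nx=100, ny=80, ntime=12)
rootgrp["T2"].set_collective(True)  # optional: collective parallel writes
# ... each rank would write its own portion of rootgrp["T2"] here ...
rootgrp.close()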
# to run: mpirun -np 4 python mpi_example.py
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset

rank = MPI.COMM_WORLD.rank  # The process ID (integer 0-3 for 4-process run)

nc = Dataset('parallel_test.nc', 'w', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info(), format='NETCDF4_CLASSIC')
# below should work also - MPI_COMM_WORLD and MPI_INFO_NULL will be used.
#nc = Dataset('parallel_test.nc', 'w', parallel=True)
d = nc.createDimension('dim', 4)
v = nc.createVariable('var', np.int32, 'dim')
v[rank] = rank
# switch to collective mode, rewrite the data.
v.set_collective(True)
v[rank] = rank
nc.close()

# reopen the file read-only, check the data
nc = Dataset('parallel_test.nc', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info())
assert rank == nc['var'][rank]
nc.close()

# reopen the file in append mode, modify the data on the last rank.
nc = Dataset('parallel_test.nc', 'a', parallel=True,
             comm=MPI.COMM_WORLD, info=MPI.Info())
v = nc['var']
if rank == 3:
    v[rank] = 2 * rank
nc.close()
# to run: mpirun -np 4 python mpi_example.py
import sys
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset

rank = MPI.COMM_WORLD.rank  # The process ID (integer 0-3 for 4-process run)

testfile = "simple_xy_par.nc"
nc = Dataset(testfile, parallel=True, comm=MPI.COMM_WORLD, info=MPI.Info())

if rank == 0:
    # obviously rank 0 can see the whole data set,
    # which is interesting but unnecessary and not scalable, since
    # the data might not fit into the RAM of a single node
    for k, v in nc.variables.items():
        print("Variable name = {0}".format(k))
        print("Value\n")
        print(v)
        print("Value as numpy.array\n")
        print(np.array(v))

# The aim of the exercise is actually to read only the part relevant
# for the computation the actual rank is responsible for.
nc.close()
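# Hedged sketch of the per-rank read the closing comment asks for: each rank
# opens the file in parallel and reads only its own slab. It assumes the file
# holds a 2D variable named 'data' whose leading dimension can be split
# across ranks; adjust the names to the actual file contents.
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset

comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size

nc = Dataset("simple_xy_par.nc", parallel=True, comm=comm, info=MPI.Info())
var = nc.variables["data"]            # assumed variable name
nrows = var.shape[0]
chunk = nrows // size
start = rank * chunk
stop = nrows if rank == size - 1 else start + chunk
local = np.array(var[start:stop, :])  # each rank reads only its slab
print("rank {} read rows {}:{} -> shape {}".format(rank, start, stop, local.shape))
nc.close()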
# based on https://github.com/Unidata/netcdf4-python/blob/master/examples/mpi_example.py
# to run: mpirun -np 4 python mpi_example.py
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset

rank = MPI.COMM_WORLD.rank  # The process ID (integer 0-3 for 4-process run)

if rank == 0:
    print('Creating file with format NETCDF4_CLASSIC')

nc = Dataset('parallel_test.nc', 'w', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info(), format='NETCDF4_CLASSIC')
d = nc.createDimension('dim', 4)
v = nc.createVariable('var', np.int32, 'dim')
v[rank] = rank
# switch to collective mode, rewrite the data.
v.set_collective(True)
v[rank] = rank
nc.close()

# reopen the file read-only, check the data
nc = Dataset('parallel_test.nc', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info())
assert rank == nc['var'][rank]
nc.close()

# reopen the file in append mode, modify the data on the last rank.
nc = Dataset('parallel_test.nc', 'a', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info())
v = nc['var']
if rank == 3:
    v[rank] = 2 * rank
nc.close()
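# Optional serial check after the MPI run (a sketch; no MPI needed here).
# Given the writes above, each rank stores its own index and rank 3 then
# overwrites its entry with 2*3, so 'var' should hold 0, 1, 2, 6.
import numpy as np
from netCDF4 import Dataset

with Dataset('parallel_test.nc') as nc:
    data = np.array(nc['var'][:])
    print(data)                              # expected: [0 1 2 6]
    assert np.array_equal(data, [0, 1, 2, 6])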
# to run: mpirun -np 4 python mpi_example.py
import sys
from mpi4py import MPI
import numpy as np
from netCDF4 import Dataset

if len(sys.argv) == 2:
    format = sys.argv[1]
else:
    format = 'NETCDF4_CLASSIC'

rank = MPI.COMM_WORLD.rank  # The process ID (integer 0-3 for 4-process run)

if rank == 0:
    print('Creating file with format {}'.format(format))

nc = Dataset('parallel_test.nc', 'w', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info(), format=format)
# below should work also - MPI_COMM_WORLD and MPI_INFO_NULL will be used.
#nc = Dataset('parallel_test.nc', 'w', parallel=True)
d = nc.createDimension('dim', 4)
v = nc.createVariable('var', np.int32, 'dim')
v[rank] = rank
# switch to collective mode, rewrite the data.
v.set_collective(True)
v[rank] = rank
nc.close()

# reopen the file read-only, check the data
nc = Dataset('parallel_test.nc', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info())
assert rank == nc['var'][rank]
nc.close()

# reopen the file in append mode, modify the data on the last rank.
nc = Dataset('parallel_test.nc', 'a', parallel=True, comm=MPI.COMM_WORLD,
             info=MPI.Info())
v = nc['var']
if rank == 3:
    v[rank] = 2 * rank
nc.close()
def ncfilter(
        infile     = 'default.nc',
        outfile    = 'filtered.nc',
        ndx_file   = "groups.ndx",
        g_sel      = "indenter",
        selvarname = 'id',     # select subset based on NetCDF variable 'id'
        seldimname = "atom",   # dimension to resize
        pardimname = "frame",  # dimension for chunk-wise parallelization
        format     = 'NETCDF4',
        collective = True,     # NetCDF MPI-IO mode: collective or independent
        loglvl     = standard_loglevel,
        logfmt     = standard_logformat,
        logout     = None):
    """Filters a NetCDF trajectory by atom indices from a GROMACS-style .ndx file.

    Args:
        infile (str, optional):  NetCDF trajectory, defaults to 'default.nc'.
        outfile (str, optional): NetCDF trajectory, defaults to 'filtered.nc'.
        ndx_file (str, optional): GROMACS-style .ndx file, defaults to
            'groups.ndx'.
        g_sel (str, optional): Name of the group to select, as found in the
            .ndx file, defaults to 'indenter'.
        selvarname (str, optional): Name of the NetCDF variable to compare
            against indices in the .ndx file, defaults to 'id'.
        seldimname (str, optional): Name of the NetCDF dimension to reduce.
            Defaults to 'atom'.
        pardimname (str, optional): Name of the NetCDF dimension to chop into
            chunks for parallel processing via MPI. Defaults to 'frame'.
        format (str, optional): NetCDF format, defaults to 'NETCDF4'.
        collective (bool, optional): Parallel IO mode, defaults to 'True'.
            Independent mode is used and unlimited dimensions are written as
            finite dimensions if 'False'. Please refer to
            http://unidata.github.io/netcdf4-python/netCDF4/index.html#section13
        loglvl (int, optional): Log level, defaults to 'logging.ERROR'.
        logfmt (str, optional): Override the default log format.
        logout (str, optional): Name of a log file. Defaults to 'None', in
            which case the log streams to the terminal.

    Returns:
        Nothing.
    """
    # MPI communicator
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()

    # set up logging to terminal (and to file);
    # use the same format for file and stream
    logger = logging.getLogger("rank[{:03d}]:{}".format(rank, __name__))
    logger.setLevel(loglvl)

    logformatter = logging.Formatter(logfmt)

    ch = logging.StreamHandler()
    ch.setFormatter(logformatter)
    ch.setLevel(loglvl)
    logger.addHandler(ch)

    if isinstance(logout, str):
        logger.setLevel(logging.DEBUG)
        fh = MPIFileHandler(logout)
        fh.setFormatter(logformatter)
        fh.setLevel(logging.DEBUG)  # always verbose to log file
        logger.addHandler(fh)

    logger.debug('Hello from rank {}/{}.'.format(rank, size))

    # read groups from .ndx file:
    ndx = NDX()

    if rank == 0:
        logger.info(
            "Looking for '{:s}' in current working directory '{:s}'...".format(
                ndx_file, os.getcwd()))

    ndx.read(ndx_file)

    if rank == 0:
        logger.info("Read {:d} groups from '{:s}':".format(len(ndx), ndx_file))
        for g in ndx.keys():
            logger.info("{:48s} - {: 24d} atoms".format(g, len(ndx[g])))

    # https://unidata.github.io/netcdf4-python/netCDF4/index.html#section15
    # You can create netCDF Datasets whose content is held in memory instead
    # of in a disk file. There are two ways to do this. If you don't need
    # access to the memory buffer containing the Dataset from within python,
    # the best way is to use the diskless=True keyword argument when creating
    # the Dataset. If you want to save the Dataset to disk when you close it,
    # also set persist=True.
    with Dataset(infile, 'r', parallel=True, comm=comm, info=MPI.Info(),
                 format=format) as ncin, \
         Dataset(outfile, 'w', parallel=True, comm=comm, info=MPI.Info(),
                 format=format, diskless=True, persist=True) as ncout:

        # get atom ids from netcdf, assume unsorted
        selvar = ncin.variables[selvarname]

        # look for the parallelization dimension in the selection variable:
        selvardnames = np.array(selvar.dimensions, dtype=str)
        selvarndims = len(selvardnames)  # number of dimensions in selvar

        # select everything in the selection variable to begin with
        selvarsel = [slice(None)] * selvarndims

        # check for parallelization along pardimname
        if rank == 0:
            logger.debug(
                "Parallelization dimension '{}' in selection variable {}{} ?".format(
                    pardimname, selvarname, selvardnames))

        selvarpardimpos = np.argwhere(selvardnames == pardimname).flatten()

        if rank == 0:
            logger.debug("np.argwhere returned {}".format(selvarpardimpos))

        # In the standard case, selection is based on variable 'id' with
        # dimensions ('frame', 'atom'), where processing is parallelized
        # chunk-wise over 'frame', while only a subset is processed over 'atom'.
        if len(selvarpardimpos) != 1:
            if rank == 0:
                logger.error(
                    "Selection variable {}{} has no parallelization dimension {}!".format(
                        selvarname, selvar.shape, pardimname))
            raise ValueError()
        else:
            selvarpardimpos = selvarpardimpos[0]
            if rank == 0:
                logger.info(' '.join(("Selection variable {}{} has parallelization",
                    "dimension {} at position {}")).format(
                        selvarname, selvar.shape, pardimname, selvarpardimpos))

        # create a reference selection based on the first index in pardim
        # to assure the total number of particles does not change along pardim
        selvarsel[selvarpardimpos] = 0

        # TODO: split ndx-specific functionality from this function
        # With the ndx object here, selvar is expected to be only 1d after
        # reduction by the parallelization dimension:
        refsel = np.isin(selvar[selvarpardimpos], ndx[g_sel])
        # number of selected entries along the selection dimension
        refnumsel = np.count_nonzero(refsel)

        if rank == 0:
            logger.info(' '.join(("Selection variable {}{} has {} entries",
                "along selection dimension {} in selection '{}'")).format(
                    selvarname, selvarsel, refnumsel, seldimname, g_sel))

        # discretize the trajectory along pardim and process it chunk-wise
        pardimlen = len(ncin.dimensions[pardimname])

        if rank == 0:
            logger.info('Parallelization dimension: {}[{}]'.format(
                pardimname, pardimlen))

        if pardimlen < size:  # more ranks than pardimlen needed!
            logger.error(' '.join(('Number of ranks {} exceeds size of',
                'parallelization dimension: {}[{}]. Reduce!')).format(
                    size, pardimname, pardimlen))
            raise ValueError()

        n1 = rank * (pardimlen // size)
        n2 = (rank + 1) * (pardimlen // size)

        logger.info('Parallelization dimension: {}[{}]'.format(
            pardimname, pardimlen))

        # TODO: this intended treatment of pardimlen < size does not work
        # treatment for special case where pardimlen < size
        # if n1 == 0 and n2 == 0:
        #     if rank == 0:
        #         logger.warn(' '.join(('Number of ranks {} exceeds size of',
        #             'parallelization dimension: {}[{}]')).format(
        #                 size, pardimname, pardimlen))
        #     # treatment for special case where rank >= size
        #     if rank < pardimlen:
        #         n1 = rank
        #         n2 = rank + 1
        #     else:
        #         logger.warn(' '.join(('Apparently, rank {}/{} exceeds parallelization',
        #             'dimension {}[{}] and will thus be idle.')).format(
        #                 rank, size, pardimname, pardimlen))
        #         n1 = 0
        #         n2 = 0
        # elif rank == size-1:  # treatment for last rank if pardimlen >= size
        #     n2 = pardimlen

        if rank == size - 1:  # treatment for last rank if pardimlen >= size
            n2 = pardimlen

        logger.info(
            'Rank {}/{} treats parallelization dimension slice {}[{}:{}]'.format(
                rank, size, pardimname, n1, n2))

        # create a global selection along all dimensions in the selection variable
        # selection_shape = [ len(dim) for dim in selvar.get_dims() ]
        selection = np.zeros(selvar.shape, dtype=bool)

        # TODO: split ndx-specific functionality from this function
        for i in range(n1, n2):
            # select only selvar values that are within the specified index group
            selvarsel[selvarpardimpos] = i
            selvari = selvar[selvarsel]
            selection[selvarsel] = np.isin(selvari, ndx[g_sel])
            numsel = np.count_nonzero(selection[selvarsel])

            logger.debug(' '.join(("Selection variable {}{} has {} entries along",
                "selection dimension {} in selection '{}'")).format(
                    selvarname, selvarsel, numsel, seldimname, g_sel))

            if numsel != refnumsel:
                logger.error(' '.join(("Selection variable {}{} has {} entries along",
                    "selection dimension {} in selection '{}', differing from {}",
                    "in reference selection!")).format(
                        selvarname, selvarsel, numsel, seldimname, g_sel, refnumsel))
                raise ValueError()
            elif np.any(np.not_equal(selection[i], refsel)):
                logger.warn(' '.join(("Selection variable {}{} and reference selection",
                    "both have {} entries along selection dimension {}",
                    "in selection '{}', but in differing order.")).format(
                        selvarname, selvarsel, numsel, seldimname, g_sel))

        # copy attributes, src:
        # https://stackoverflow.com/questions/15141563/
        # python-netcdf-making-a-copy-of-all-variables-and-attributes-but-one
        if rank == 0:
            logger.info("Copying global attributes...")
        for aname in ncin.ncattrs():
            if rank == 0:
                logger.info("Copy attribute {}".format(aname))
            ncout.setncattr(aname, ncin.getncattr(aname))

        # copy dimensions, src: https://gist.github.com/guziy/8543562
        if rank == 0:
            logger.info("Copying dimensions...")
        for dname, dim in ncin.dimensions.items():
            if dname == seldimname:
                lendim = refnumsel  # reduce size along the filter dimension
                if rank == 0:
                    logger.info("Shrink dimension {}[{}] to [{}]".format(
                        dname, len(dim), lendim))
            else:
                if collective:
                    lendim = len(dim) if not dim.isunlimited() else None
                else:
                    lendim = len(dim)  # ignore unlimited dimension in this case
                if rank == 0:
                    logger.info("Copy dimension {}[{}] to {}[{}]".format(
                        dname, len(dim), dname, lendim))
            ncout.createDimension(dname, lendim)

        # copy variables
        if rank == 0:
            logger.info("Copying variables...")
        for vname, inVar in ncin.variables.items():
            logger.debug(
                "Going to copy input variable {}{}: {}. Rank {}/{} still alive".format(
                    vname, inVar.shape, inVar.datatype, rank, size))
            outVar = ncout.createVariable(vname, inVar.datatype, inVar.dimensions)

            if rank == 0:
                logger.info(
                    "Copy input variable {}{}: {} to output variable {}{}: {}".format(
                        vname, inVar.shape, inVar.datatype,
                        vname, outVar.shape, outVar.datatype))

            if collective:
                outVar.set_collective(True)   # output in collective mode
            else:
                outVar.set_collective(False)  # output in independent mode

            # copy variable attributes
            outVar.setncatts({k: inVar.getncattr(k) for k in inVar.ncattrs()})

            # look at the dimensions in the variable
            dnames = np.array(inVar.dimensions, dtype=str)
            ndims = len(dnames)  # number of dimensions in the current variable

            # select everything in the variable to begin with
            in_selection = [slice(None)] * ndims
            out_selection = [slice(None)] * ndims

            # check for the selection dimension:
            if rank == 0:
                logger.debug(' '.join(("Is the selection dimension '{}'",
                    "in dimensions {} of variable {}{}?")).format(
                        seldimname, dnames, vname, inVar.shape))

            seldimpos = np.argwhere(dnames == seldimname).flatten()

            if rank == 0:
                logger.debug("np.argwhere(...) returned '{}'".format(seldimpos))

            if len(seldimpos) != 1:
                seldimpos = None
                if rank == 0:
                    logger.info(
                        "Input variable {}{} has no selection dimension {}.".format(
                            vname, inVar.shape, seldimname))
            else:
                seldimpos = seldimpos[0]
                if rank == 0:
                    logger.info(' '.join(("Input variable {}{} has selection",
                        "dimension {} at position {}")).format(
                            vname, inVar.shape, seldimname, seldimpos))

            # check for the parallelization dimension:
            if rank == 0:
                logger.debug(' '.join(("Is the parallelization dimension '{}'",
                    "in dimensions {} of variable {}{}?")).format(
                        pardimname, dnames, vname, inVar.shape))

            pardimpos = np.argwhere(dnames == pardimname).flatten()

            if rank == 0:
                logger.debug("np.argwhere(...) returned {}".format(pardimpos))

            # no parallelization dimension in the current inVar:
            if len(pardimpos) != 1:
                if rank == 0:
                    logger.info(
                        "Input variable {}{} has no parallelization dimension {}.".format(
                            vname, inVar.shape, pardimname))

                # In the current implementation, the selection dimension is
                # expected to occur only in variables that have the
                # parallelization dimension as well.
                if (seldimpos is not None) and (rank == 0):
                    logger.error(' '.join((
                        "Input variable {}{} has selection dimension {},",
                        "but no parallelization dimension {}!")).format(
                            vname, inVar.shape, seldimname, pardimname))

                if rank == 0:
                    logger.info("Copy input variable {}{} as is at once.".format(
                        vname, inVar.shape))

                outVar[:] = inVar[:]

            # parallelization dimension found in the current inVar:
            else:
                pardimpos = pardimpos[0]
                if rank == 0:
                    logger.info(' '.join((
                        "Input variable {}{} has parallelization dimension {}",
                        "at position {}")).format(
                            vname, inVar.shape, pardimname, pardimpos))

                if collective:
                    # issue: outVar.shape has a zero entry for unlimited dims
                    if rank == 0:
                        logger.info("outVar {}{} has {} unlimited dims".format(
                            vname, outVar.shape, np.equal(outVar.shape, 0)))
                    tmpVar_shape = np.where(np.equal(outVar.shape, 0),
                                            inVar.shape, outVar.shape)
                    tmpVar = np.zeros(tmpVar_shape, dtype=outVar.dtype)
                else:
                    tmpVar = np.zeros(outVar.shape, dtype=outVar.dtype)

                if rank == 0:
                    logger.debug("np.array tmpVar{}: {} created".format(
                        tmpVar.shape, tmpVar.dtype))

                # https://unidata.github.io/netcdf4-python/netCDF4/index.html#section6
                # Boolean array and integer sequence indexing behaves
                # differently for netCDF variables than for numpy arrays.
                # Only 1-d boolean arrays and integer sequences are allowed,
                # and these indices work independently along each dimension
                # (similar to the way vector subscripts work in fortran).
                # Here, we iterate over the temporal axis frame by frame, as
                # the selection may vary for every frame and multidimensional
                # selection arrays are no option.
                for i in range(n1, n2):
                    if seldimpos is not None:
                        selvarsel[selvarpardimpos] = i
                        in_selection[seldimpos] = selection[selvarsel]

                    in_selection[pardimpos] = i
                    out_selection[pardimpos] = i

                    logger.debug("Assigning input variable {}{} to tmpVar{}".format(
                        vname, in_selection, out_selection))
                    tmpVar[out_selection] = inVar[in_selection]
                    logger.debug("Assigned input variable {}{} to tmpVar{}".format(
                        vname, in_selection, out_selection))

                # TODO:
                # This commented conditional clause intends to alleviate an
                # issue when the number of ranks exceeds the number of
                # parallelizable entries in the parallelization dimension.
                # if n2 > n1:  # only if there is a finite range of elements to assign:
                out_selection[pardimpos] = slice(n1, n2)

                logger.info(' '.join(("Assigning filtered input variable {}{}",
                    "to output var {}{}")).format(
                        vname, out_selection, vname, out_selection))

                # Apparently, accumulating the desired subset index by index in
                # a numpy array tmpVar and then assigning the accumulated chunk
                # to the NetCDF variable outVar works better than assigning the
                # subset to the NetCDF variable index by index directly.
                outVar[out_selection] = tmpVar[out_selection]

                logger.info(' '.join(("Assigned filtered input variable {}{}",
                    "to output var {}{}")).format(
                        vname, out_selection, vname, out_selection))

                # else:
                #     logger.info("Skipped input variable {} on rank {}/{}.".format(
                #         vname, rank, size))

    # nc.close() not necessary due to the 'with' statement
    logger.debug('Goodbye from rank {}/{}.'.format(rank, size))
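# Hedged driver sketch for ncfilter under an MPI launch, e.g.
# mpirun -np 4 python filter_run.py. The file names and the group name below
# are placeholders; ncfilter itself is the function defined above, and the
# helpers it relies on (NDX, MPIFileHandler, standard_* defaults) come from
# its enclosing module, which is not shown here.
import logging

if __name__ == '__main__':
    ncfilter(
        infile='default.nc',      # input NetCDF trajectory
        outfile='filtered.nc',    # filtered output trajectory
        ndx_file='groups.ndx',    # GROMACS-style index file
        g_sel='indenter',         # group to keep
        collective=True,          # collective MPI-IO writes
        loglvl=logging.INFO)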
outputinterval = 86400. / 4.  # interval between frames in seconds
tmin = 0. * 86400.    # time to start saving data (days, converted to seconds)
tmax = 50. * 86400.   # time to stop (days, converted to seconds)
nsteps = int(tmax / outputinterval)  # number of time steps to animate
# set number of timesteps to integrate for each call to model.advance
model.timesteps = int(outputinterval / model.dt)
savedata = 'sqg_N%s_6hrly_ens.nc' % N  # save data to netcdf file

if savedata is not None:
    from netCDF4 import Dataset
    nc = Dataset(savedata, mode='w', format='NETCDF4_CLASSIC',
                 parallel=True, comm=MPI.COMM_WORLD, info=MPI.Info())
    nc.r = model.r
    nc.f = model.f
    nc.U = model.U
    nc.L = model.L
    nc.H = model.H
    nc.g = g
    nc.theta0 = theta0
    nc.nsq = model.nsq
    nc.tdiab = model.tdiab
    nc.dt = model.dt
    nc.diff_efold = model.diff_efold
    nc.diff_order = model.diff_order
    nc.symmetric = int(model.symmetric)
    nc.dealias = int(model.dealias)
    x = nc.createDimension('x', N)
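    # The excerpt above stops while the file is still being set up. Below is a
    # hedged continuation sketch only: the dimension and variable names ('y',
    # 'ens', 't', 'pv') and the one-ensemble-member-per-rank layout are
    # assumptions, not taken from the original script.
    y = nc.createDimension('y', N)
    ens = nc.createDimension('ens', MPI.COMM_WORLD.size)  # assumed: one member per rank
    t = nc.createDimension('t', None)                     # unlimited time axis

    pv_var = nc.createVariable('pv', 'f4', ('t', 'ens', 'y', 'x'))
    # writes that grow an unlimited dimension must be collective in parallel mode
    pv_var.set_collective(True)

    # each rank would then write only its own ensemble member per saved frame,
    # e.g. (placeholder): pv_var[nout, MPI.COMM_WORLD.rank, :, :] = ...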