def testPartitionInt(self):
    data = 13 + self.rank
    sresult = self.scomm.partition(data, func=Duplicate())
    presult = self.pcomm.partition(data, func=Duplicate())
    msg = test_info_msg('partition(int)', data, sresult, presult)
    print(msg)
    self.assertEqual(sresult, presult, msg)
def testPartitionIntInvolved(self):
    data = 13 + self.rank
    sresult = self.scomm.partition(data, func=Duplicate(), involved=True)
    presult = self.pcomm.partition(data, func=Duplicate(), involved=True)
    msg = test_info_msg('partition(int, T)', data, sresult, presult)
    print(msg)
    self.assertEqual(sresult, presult, msg)
def testMultiPartitionIntInvolved(self):
    data = self.grank
    actual = self.multicomm.partition(data, func=Duplicate(), involved=True)
    expected = self.rank * len(self.groups)
    self.assertEqual(actual, expected)
def testMonoPartitionIntInvolved(self):
    data = self.grank
    actual = self.monocomm.partition(data, func=Duplicate(), involved=True)
    expected = self.color  # By chance!
    msg = test_info_msg(self.grank, self.gsize, 'mono.partition(int,T)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
def testPartitionIntInvolved(self):
    if self.gcomm.is_manager():
        data = 10
    else:
        data = None
    actual = self.gcomm.partition(data, func=Duplicate(), involved=True)
    expected = 10
    self.assertEqual(actual, expected)
def testMultiPartitionInt(self):
    data = self.grank
    actual = self.multicomm.partition(data, func=Duplicate())
    if self.multicomm.is_manager():
        expected = None
    else:
        expected = self.rank * len(self.groups)
    self.assertEqual(actual, expected)
def testMonoPartitionInt(self):
    data = self.grank
    actual = self.monocomm.partition(data, func=Duplicate())
    if self.monocomm.is_manager():
        expected = None
    else:
        expected = self.color  # By chance!
    self.assertEqual(actual, expected)
def testTreeScatterInt(self):
    if self.gcomm.is_manager():
        data = 10
    else:
        data = None
    if self.monocomm.is_manager():
        mydata = self.multicomm.partition(data, func=Duplicate(), involved=True)
    else:
        mydata = None
    actual = self.monocomm.partition(mydata, func=Duplicate(), involved=True)
    expected = 10
    self.assertEqual(actual, expected)
def testMultiPartitionIntInvolved(self):
    data = self.grank
    actual = self.multicomm.partition(data, func=Duplicate(), involved=True)
    expected = self.rank * len(self.groups)
    msg = test_info_msg(self.grank, self.gsize, 'multi.partition(int,T)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
def testPartitionInt(self):
    if self.gcomm.is_manager():
        data = 10
    else:
        data = None
    actual = self.gcomm.partition(data, func=Duplicate())
    if self.gcomm.is_manager():
        expected = None
    else:
        expected = 10
    self.assertEqual(actual, expected)
def testPartitionIntInvolved(self):
    if self.gcomm.is_manager():
        data = 10
    else:
        data = None
    actual = self.gcomm.partition(data, func=Duplicate(), involved=True)
    expected = 10
    msg = test_info_msg(self.rank, self.size, 'partition(int, T)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
def testMonoPartitionInt(self):
    data = self.grank
    actual = self.monocomm.partition(data, func=Duplicate())
    if self.monocomm.is_manager():
        expected = None
    else:
        expected = self.color  # By chance!
    msg = test_info_msg(self.grank, self.gsize, 'mono.partition(int)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
def testMultiPartitionInt(self):
    data = self.grank
    actual = self.multicomm.partition(data, func=Duplicate())
    if self.multicomm.is_manager():
        expected = None
    else:
        expected = self.rank * len(self.groups)
    msg = test_info_msg(self.grank, self.gsize, 'multi.partition(int)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
def testTreeScatterInt(self):
    if self.gcomm.is_manager():
        data = 10
    else:
        data = None
    if self.monocomm.is_manager():
        mydata = self.multicomm.partition(data, func=Duplicate(), involved=True)
    else:
        mydata = None
    actual = self.monocomm.partition(mydata, func=Duplicate(), involved=True)
    expected = 10
    msg = test_info_msg(self.grank, self.gsize, 'TreeScatter(int)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
def testPartitionInt(self):
    if self.gcomm.is_manager():
        data = 10
    else:
        data = None
    actual = self.gcomm.partition(data, func=Duplicate())
    if self.gcomm.is_manager():
        expected = None
    else:
        expected = 10
    msg = test_info_msg(self.rank, self.size, 'partition(int)',
                        data, actual, expected)
    print(msg)
    self.assertEqual(actual, expected, msg)
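# The gcomm tests above pin down the semantics of partition() with Duplicate():
# with the default involved=False, the manager rank only distributes the data
# (and itself gets None back), while involved=True hands every rank, manager
# included, a copy of the manager's data. A minimal sketch of that behavior;
# the import paths assume the ASAP Python Toolbox (asaptools) layout and are
# an assumption, everything else mirrors the calls made in the tests:
from asaptools import simplecomm
from asaptools.partition import Duplicate

comm = simplecomm.create_comm()  # MPI-backed communicator, as in the tests

data = 10 if comm.is_manager() else None
excluded = comm.partition(data, func=Duplicate())                 # manager -> None, workers -> 10
included = comm.partition(data, func=Duplicate(), involved=True)  # every rank -> 10
print(comm.get_rank(), excluded, included)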
def main(argv):
    print('Running pyEnsSum!')

    # Get command line stuff and store in a dictionary
    s = ('tag= compset= esize= tslice= res= sumfile= indir= sumfiledir= mach= '
         'verbose jsonfile= mpi_enable maxnorm gmonly popens cumul regx= '
         'startMon= endMon= fIndex=')
    optkeys = s.split()
    try:
        opts, args = getopt.getopt(argv, "h", optkeys)
    except getopt.GetoptError:
        pyEnsLib.EnsSum_usage()
        sys.exit(2)

    # Put command line options in a dictionary - also set defaults
    opts_dict = {}

    # Defaults
    opts_dict['tag'] = ''
    opts_dict['compset'] = ''
    opts_dict['mach'] = ''
    opts_dict['esize'] = 151
    opts_dict['tslice'] = 0
    opts_dict['res'] = ''
    opts_dict['sumfile'] = 'ens.summary.nc'
    opts_dict['indir'] = './'
    opts_dict['sumfiledir'] = './'
    opts_dict['jsonfile'] = ''
    opts_dict['verbose'] = True
    opts_dict['mpi_enable'] = False
    opts_dict['maxnorm'] = False
    opts_dict['gmonly'] = False
    opts_dict['popens'] = False
    opts_dict['cumul'] = False
    opts_dict['regx'] = 'test'
    opts_dict['startMon'] = 1
    opts_dict['endMon'] = 1
    opts_dict['fIndex'] = 151

    # This creates the dictionary of input arguments
    opts_dict = pyEnsLib.getopt_parseconfig(opts, optkeys, 'ES', opts_dict)

    verbose = opts_dict['verbose']
    st = opts_dict['esize']
    esize = int(st)

    if verbose:
        print(opts_dict)
        print('Ensemble size for summary = ', esize)

    if not (opts_dict['tag'] and opts_dict['compset'] and opts_dict['mach']
            and opts_dict['res']):
        print('Please specify --tag, --compset, --mach and --res options')
        sys.exit()

    # Now find file names in indir
    input_dir = opts_dict['indir']
    # The var list that will be excluded
    ex_varlist = []

    # Create a mpi simplecomm object
    if opts_dict['mpi_enable']:
        me = simplecomm.create_comm()
    else:
        me = simplecomm.create_comm(not opts_dict['mpi_enable'])

    if me.get_rank() == 0:
        if opts_dict['jsonfile']:
            # Read in the excluded var list
            ex_varlist = pyEnsLib.read_jsonlist(opts_dict['jsonfile'], 'ES')

    # Broadcast the excluded var list to each processor
    if opts_dict['mpi_enable']:
        ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True)

    in_files = []
    if os.path.exists(input_dir):
        # Get the list of files
        in_files_temp = os.listdir(input_dir)
        in_files = sorted(in_files_temp)
        # Make sure we have enough
        num_files = len(in_files)
        if verbose:
            print('Number of files in input directory = ', num_files)
        if num_files < esize:
            print('Number of files in input directory (', num_files,
                  ') is less than specified ensemble size of ', esize)
            sys.exit(2)
        if num_files > esize:
            print('NOTE: Number of files in ', input_dir,
                  'is greater than specified ensemble size of ', esize,
                  '\nwill just use the first ', esize, 'files')
    else:
        print('Input directory: ', input_dir, ' not found')
        sys.exit(2)

    if opts_dict['cumul']:
        if opts_dict['regx']:
            in_files_list = get_cumul_filelist(opts_dict, opts_dict['indir'],
                                               opts_dict['regx'])
        in_files = me.partition(in_files_list, func=EqualLength(), involved=True)
        if me.get_rank() == 0:
            print('in_files=', in_files)

    # Open the files in the input directory
    o_files = []
    for onefile in in_files[0:esize]:
        if os.path.isfile(input_dir + '/' + onefile):
            o_files.append(Nio.open_file(input_dir + '/' + onefile, "r"))
        else:
            print("COULD NOT LOCATE FILE " + input_dir + '/' + onefile +
                  "! EXITING....")
            sys.exit()

    # Store dimensions of the input fields
    if verbose:
        print("Getting spatial dimensions")
    nlev = -1
    ncol = -1
    nlat = -1
    nlon = -1
    lonkey = ''
    latkey = ''

    # Look at first file and get dims
    input_dims = o_files[0].dimensions
    ndims = len(input_dims)
    for key in input_dims:
        if key == "lev":
            nlev = input_dims["lev"]
        elif key == "ncol":
            ncol = input_dims["ncol"]
        elif (key == "nlon") or (key == "lon"):
            nlon = input_dims[key]
            lonkey = key
        elif (key == "nlat") or (key == "lat"):
            nlat = input_dims[key]
            latkey = key

    if nlev == -1:
        print("COULD NOT LOCATE valid dimension lev => EXITING....")
        sys.exit()
    if (ncol == -1) and ((nlat == -1) or (nlon == -1)):
        print("Need either lat/lon or ncol => EXITING....")
        sys.exit()

    # Check if this is SE or FV data
    if ncol != -1:
        is_SE = True
    else:
        is_SE = False

    # Make sure all files have the same dimensions
    if verbose:
        print("Checking dimensions across files....")
        print('lev = ', nlev)
        if is_SE:
            print('ncol = ', ncol)
        else:
            print('nlat = ', nlat)
            print('nlon = ', nlon)
    for count, this_file in enumerate(o_files):
        input_dims = this_file.dimensions
        if is_SE:
            if nlev != int(input_dims["lev"]) or ncol != int(input_dims["ncol"]):
                print("Dimension mismatch between ", in_files[0], 'and',
                      in_files[count], '!!!')
                sys.exit()
        else:
            if (nlev != int(input_dims["lev"]) or nlat != int(input_dims[latkey])
                    or nlon != int(input_dims[lonkey])):
                print("Dimension mismatch between ", in_files[0], 'and',
                      in_files[count], '!!!')
                sys.exit()

    # Get 2d vars, 3d vars and all vars (For now include all variables)
    vars_dict = o_files[0].variables
    # Remove the excluded variables (specified in json file) from variable dictionary
    if ex_varlist:
        for i in ex_varlist:
            if i in vars_dict:
                del vars_dict[i]
    num_vars = len(vars_dict)
    if verbose:
        print('Number of variables (including metadata) found = ', num_vars)
    str_size = 0
    d2_var_names = []
    d3_var_names = []
    num_2d = 0
    num_3d = 0

    # Which are 2d, which are 3d, and max str_size
    for k, v in vars_dict.items():
        var = k
        vd = v.dimensions  # all the variable's dimensions (names)
        vr = v.rank        # number of dimensions
        vs = v.shape       # dim values
        is_2d = False
        is_3d = False
        if is_SE:  # (time, lev, ncol) or (time, ncol)
            if (vr == 2) and (vs[1] == ncol):
                is_2d = True
                num_2d += 1
            elif (vr == 3) and (vs[2] == ncol and vs[1] == nlev):
                is_3d = True
                num_3d += 1
        else:  # (time, lev, nlat, nlon) or (time, nlat, nlon)
            if (vr == 3) and (vs[1] == nlat and vs[2] == nlon):
                is_2d = True
                num_2d += 1
            elif (vr == 4) and (vs[2] == nlat and vs[3] == nlon and vs[1] == nlev):
                is_3d = True
                num_3d += 1
        if is_3d:
            str_size = max(str_size, len(k))
            d3_var_names.append(k)
        elif is_2d:
            str_size = max(str_size, len(k))
            d2_var_names.append(k)

    # Now sort these and combine (this sorts caps first, then lower case -
    # which is what we want)
    d2_var_names.sort()
    d3_var_names.sort()

    # All vars is 3d vars first (sorted), then the 2d vars
    all_var_names = list(d3_var_names)
    all_var_names += d2_var_names
    n_all_var_names = len(all_var_names)
    if verbose:
        print('num vars = ', n_all_var_names,
              '(3d = ', num_3d, ' and 2d = ', num_2d, ")")

    # Create new summary ensemble file
    this_sumfile = opts_dict["sumfile"]
    if verbose:
        print("Creating ", this_sumfile, " ...")
    if me.get_rank() == 0 or opts_dict["popens"]:
        if os.path.exists(this_sumfile):
            os.unlink(this_sumfile)
        opt = Nio.options()
        opt.PreFill = False
        opt.Format = 'NetCDF4Classic'
        nc_sumfile = Nio.open_file(this_sumfile, 'w', options=opt)

        # Set dimensions
        if verbose:
            print("Setting dimensions .....")
        if is_SE:
            nc_sumfile.create_dimension('ncol', ncol)
        else:
            nc_sumfile.create_dimension('nlat', nlat)
            nc_sumfile.create_dimension('nlon', nlon)
        nc_sumfile.create_dimension('nlev', nlev)
        nc_sumfile.create_dimension('ens_size', esize)
        nc_sumfile.create_dimension('nvars', num_3d + num_2d)
        nc_sumfile.create_dimension('nvars3d', num_3d)
        nc_sumfile.create_dimension('nvars2d', num_2d)
        nc_sumfile.create_dimension('str_size', str_size)

        # Set global attributes
        now = time.strftime("%c")
        if verbose:
            print("Setting global attributes .....")
        setattr(nc_sumfile, 'creation_date', now)
        setattr(nc_sumfile, 'title', 'CAM verification ensemble summary file')
        setattr(nc_sumfile, 'tag', opts_dict["tag"])
        setattr(nc_sumfile, 'compset', opts_dict["compset"])
        setattr(nc_sumfile, 'resolution', opts_dict["res"])
        setattr(nc_sumfile, 'machine', opts_dict["mach"])

        # Create variables
        if verbose:
            print("Creating variables .....")
        v_lev = nc_sumfile.create_variable("lev", 'f', ('nlev',))
        v_vars = nc_sumfile.create_variable("vars", 'S1', ('nvars', 'str_size'))
        v_var3d = nc_sumfile.create_variable("var3d", 'S1', ('nvars3d', 'str_size'))
        v_var2d = nc_sumfile.create_variable("var2d", 'S1', ('nvars2d', 'str_size'))
        if not opts_dict['gmonly']:
            if is_SE:
                v_ens_avg3d = nc_sumfile.create_variable("ens_avg3d", 'f', ('nvars3d', 'nlev', 'ncol'))
                v_ens_stddev3d = nc_sumfile.create_variable("ens_stddev3d", 'f', ('nvars3d', 'nlev', 'ncol'))
                v_ens_avg2d = nc_sumfile.create_variable("ens_avg2d", 'f', ('nvars2d', 'ncol'))
                v_ens_stddev2d = nc_sumfile.create_variable("ens_stddev2d", 'f', ('nvars2d', 'ncol'))
            else:
                v_ens_avg3d = nc_sumfile.create_variable("ens_avg3d", 'f', ('nvars3d', 'nlev', 'nlat', 'nlon'))
                v_ens_stddev3d = nc_sumfile.create_variable("ens_stddev3d", 'f', ('nvars3d', 'nlev', 'nlat', 'nlon'))
                v_ens_avg2d = nc_sumfile.create_variable("ens_avg2d", 'f', ('nvars2d', 'nlat', 'nlon'))
                v_ens_stddev2d = nc_sumfile.create_variable("ens_stddev2d", 'f', ('nvars2d', 'nlat', 'nlon'))
            v_RMSZ = nc_sumfile.create_variable("RMSZ", 'f', ('nvars', 'ens_size'))
        v_gm = nc_sumfile.create_variable("global_mean", 'f', ('nvars', 'ens_size'))
        v_loadings_gm = nc_sumfile.create_variable('loadings_gm', 'f', ('nvars', 'nvars'))
        v_mu_gm = nc_sumfile.create_variable('mu_gm', 'f', ('nvars',))
        v_sigma_gm = nc_sumfile.create_variable('sigma_gm', 'f', ('nvars',))
        v_sigma_scores_gm = nc_sumfile.create_variable('sigma_scores_gm', 'f', ('nvars',))

        # Assign vars, var3d and var2d
        if verbose:
            print("Assigning vars, var3d, and var2d .....")
        eq_all_var_names = []
        eq_d3_var_names = []
        eq_d2_var_names = []
        l_eq = len(all_var_names)
        for i in range(l_eq):
            tt = list(all_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_all_var_names.append(tt)
        l_eq = len(d3_var_names)
        for i in range(l_eq):
            tt = list(d3_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_d3_var_names.append(tt)
        l_eq = len(d2_var_names)
        for i in range(l_eq):
            tt = list(d2_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_d2_var_names.append(tt)
        v_vars[:] = eq_all_var_names[:]
        v_var3d[:] = eq_d3_var_names[:]
        v_var2d[:] = eq_d2_var_names[:]

        # Time-invariant metadata
        if verbose:
            print("Assigning time invariant metadata .....")
        lev_data = vars_dict["lev"]
        v_lev[:] = lev_data[:]

    # Form ensembles, each missing one member; compute RMSZs and global means
    # for each variable (we also do the max norm, currently done in pyStats)
    tslice = opts_dict['tslice']
    if not opts_dict['cumul']:
        # Partition the var list
        var3_list_loc = me.partition(d3_var_names, func=EqualStride(), involved=True)
        var2_list_loc = me.partition(d2_var_names, func=EqualStride(), involved=True)
    else:
        var3_list_loc = d3_var_names
        var2_list_loc = d2_var_names

    # Calculate global means
    if verbose:
        print("Calculating global means .....")
    if not opts_dict['cumul']:
        gm3d, gm2d = pyEnsLib.generate_global_mean_for_summary(
            o_files, var3_list_loc, var2_list_loc, is_SE, False, opts_dict)
    if verbose:
        print("Finish calculating global means .....")

    # Calculate RMSZ scores
    if verbose:
        print("Calculating RMSZ scores .....")
    if (not opts_dict['gmonly']) or opts_dict['cumul']:
        zscore3d, zscore2d, ens_avg3d, ens_stddev3d, ens_avg2d, ens_stddev2d, temp1, temp2 = pyEnsLib.calc_rmsz(
            o_files, var3_list_loc, var2_list_loc, is_SE, opts_dict)

    # Calculate max norm ensemble
    if opts_dict['maxnorm']:
        if verbose:
            print("Calculating max norm of ensembles .....")
        pyEnsLib.calculate_maxnormens(opts_dict, var3_list_loc)
        pyEnsLib.calculate_maxnormens(opts_dict, var2_list_loc)

    if opts_dict['mpi_enable'] and not opts_dict['popens']:
        if not opts_dict['cumul']:
            # Gather the 3d variable results from all processors to the master processor
            slice_index = get_stride_list(len(d3_var_names), me)
            # Gather global means 3d results
            gm3d = gather_npArray(gm3d, me, slice_index,
                                  (len(d3_var_names), len(o_files)))
            if not opts_dict['gmonly']:
                # Gather zscore3d results
                zscore3d = gather_npArray(zscore3d, me, slice_index,
                                          (len(d3_var_names), len(o_files)))
                # Gather ens_avg3d and ens_stddev3d results
                shape_tuple3d = get_shape(ens_avg3d.shape, len(d3_var_names), me.get_rank())
                ens_avg3d = gather_npArray(ens_avg3d, me, slice_index, shape_tuple3d)
                ens_stddev3d = gather_npArray(ens_stddev3d, me, slice_index, shape_tuple3d)

            # Gather 2d variable results from all processors to the master processor
            slice_index = get_stride_list(len(d2_var_names), me)
            # Gather global means 2d results
            gm2d = gather_npArray(gm2d, me, slice_index,
                                  (len(d2_var_names), len(o_files)))
            if not opts_dict['gmonly']:
                # Gather zscore2d results
                zscore2d = gather_npArray(zscore2d, me, slice_index,
                                          (len(d2_var_names), len(o_files)))
                # Gather ens_avg2d and ens_stddev2d results
                shape_tuple2d = get_shape(ens_avg2d.shape, len(d2_var_names), me.get_rank())
                ens_avg2d = gather_npArray(ens_avg2d, me, slice_index, shape_tuple2d)
                ens_stddev2d = gather_npArray(ens_stddev2d, me, slice_index, shape_tuple2d)
        else:
            gmall = np.concatenate((temp1, temp2), axis=0)
            gmall = pyEnsLib.gather_npArray_pop(
                gmall, me, (me.get_size(), len(d3_var_names) + len(d2_var_names)))

    # Assign to file:
    if me.get_rank() == 0 or opts_dict['popens']:
        if not opts_dict['cumul']:
            gmall = np.concatenate((gm3d, gm2d), axis=0)
            if not opts_dict['gmonly']:
                Zscoreall = np.concatenate((zscore3d, zscore2d), axis=0)
                v_RMSZ[:, :] = Zscoreall[:, :]
            if not opts_dict['gmonly']:
                if is_SE:
                    v_ens_avg3d[:, :, :] = ens_avg3d[:, :, :]
                    v_ens_stddev3d[:, :, :] = ens_stddev3d[:, :, :]
                    v_ens_avg2d[:, :] = ens_avg2d[:, :]
                    v_ens_stddev2d[:, :] = ens_stddev2d[:, :]
                else:
                    v_ens_avg3d[:, :, :, :] = ens_avg3d[:, :, :, :]
                    v_ens_stddev3d[:, :, :, :] = ens_stddev3d[:, :, :, :]
                    v_ens_avg2d[:, :, :] = ens_avg2d[:, :, :]
                    v_ens_stddev2d[:, :, :] = ens_stddev2d[:, :, :]
        else:
            gmall_temp = np.transpose(gmall[:, :])
            gmall = gmall_temp
        mu_gm, sigma_gm, standardized_global_mean, loadings_gm, scores_gm = pyEnsLib.pre_PCA(gmall)
        v_gm[:, :] = gmall[:, :]
        v_mu_gm[:] = mu_gm[:]
        v_sigma_gm[:] = sigma_gm[:].astype(np.float32)
        v_loadings_gm[:, :] = loadings_gm[:, :]
        v_sigma_scores_gm[:] = scores_gm[:]

    print("All Done")


def get_cumul_filelist(opts_dict, indir, regx):
    if not opts_dict['indir']:
        print('input dir is not specified')
        sys.exit(2)
    regx_list = ["mon", "gnu", "pgi"]
    all_files = []
    for prefix in regx_list:
        for i in range(opts_dict['fIndex'],
                       opts_dict['fIndex'] + opts_dict['esize'] // 3):
            for j in range(opts_dict['startMon'], opts_dict['endMon'] + 1):
                mon_str = str(j).zfill(2)
                regx = '(^' + prefix + '(.)*' + str(i) + '(.)*-(' + mon_str + '))'
                print('regx=', regx)
                res = [f for f in os.listdir(indir) if re.search(regx, f)]
                in_files = sorted(res)
                all_files.extend(in_files)
    print("all_files=", all_files)
    return all_files


#
# Get the shape tuple for each processor's portion of the gathered variable list
#
def get_shape(shape_tuple, shape1, rank):
    lst = list(shape_tuple)
    lst[0] = shape1
    shape_tuple = tuple(lst)
    return shape_tuple


#
# Get the mpi partition list for each processor
#
def get_stride_list(len_of_list, me):
    slice_index = []
    for i in range(me.get_size()):
        index_arr = np.arange(len_of_list)
        slice_index.append(index_arr[i::me.get_size()])
    return slice_index


#
# Gather arrays from each processor by the var_list to the master processor
# and assemble them into a single array
#
def gather_npArray(npArray, me, slice_index, array_shape):
    the_array = np.zeros(array_shape, dtype=np.float32)
    if me.get_rank() == 0:
        k = 0
        for j in slice_index[me.get_rank()]:
            the_array[j, :] = npArray[k, :]
            k = k + 1
    for i in range(1, me.get_size()):
        if me.get_rank() == 0:
            rank, npArray = me.collect()
            k = 0
            for j in slice_index[rank]:
                the_array[j, :] = npArray[k, :]
                k = k + 1
    if me.get_rank() != 0:
        message = {"from_rank": me.get_rank(), "shape": npArray.shape}
        me.collect(npArray)
    me.sync()
    return the_array


if __name__ == "__main__":
    main(sys.argv[1:])
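# The helpers above implement a stride-partition / point-to-point gather round
# trip: each rank computes results for every size-th variable (EqualStride),
# and gather_npArray reassembles the full array on rank 0 via me.collect().
# A minimal 1-D sketch of that pattern, assuming the same simplecomm semantics
# used above (collect() returns a (rank, data) pair on the manager and sends
# data from a worker; the import paths are an assumption):
import numpy as np
from asaptools import simplecomm
from asaptools.partition import EqualStride

me = simplecomm.create_comm()
names = ['T', 'U', 'V', 'Q', 'PS']

# Each rank processes names[rank::size]
local_names = me.partition(names, func=EqualStride(), involved=True)
local_result = np.array([float(len(n)) for n in local_names])

if me.get_rank() == 0:
    full = np.zeros(len(names))
    full[0::me.get_size()] = local_result          # manager's own slice
    for _ in range(1, me.get_size()):
        rank, arr = me.collect()                   # receive a worker's slice
        full[rank::me.get_size()] = arr
else:
    me.collect(local_result)                       # send local slice to rank 0
me.sync()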
def testMonoPartitionIntInvolved(self):
    data = self.grank
    actual = self.monocomm.partition(data, func=Duplicate(), involved=True)
    expected = self.color  # By chance!
    self.assertEqual(actual, expected)
def testPartitionIntInvolved(self):
    data = 13 + self.rank
    sresult = self.scomm.partition(data, func=Duplicate(), involved=True)
    presult = self.pcomm.partition(data, func=Duplicate(), involved=True)
    self.assertEqual(sresult, presult)
def main(argv):
    # Get command line stuff and store in a dictionary
    s = ('tag= compset= esize= tslice= res= sumfile= indir= sumfiledir= mach= '
         'verbose jsonfile= mpi_enable maxnorm gmonly popens cumul regx= '
         'startMon= endMon= fIndex=')
    optkeys = s.split()
    try:
        opts, args = getopt.getopt(argv, "h", optkeys)
    except getopt.GetoptError:
        pyEnsLib.EnsSum_usage()
        sys.exit(2)

    # Put command line options in a dictionary - also set defaults
    opts_dict = {}

    # Defaults
    opts_dict['tag'] = 'cesm2_0_beta10'
    opts_dict['compset'] = 'F2000climo'
    opts_dict['mach'] = 'cheyenne'
    opts_dict['esize'] = 350
    opts_dict['tslice'] = 1
    opts_dict['res'] = 'f19_f19'
    opts_dict['sumfile'] = 'ens.summary.nc'
    opts_dict['indir'] = './'
    opts_dict['sumfiledir'] = './'
    opts_dict['jsonfile'] = 'exclude_empty.json'
    opts_dict['verbose'] = False
    opts_dict['mpi_enable'] = False
    opts_dict['maxnorm'] = False
    opts_dict['gmonly'] = True
    opts_dict['popens'] = False
    opts_dict['cumul'] = False
    opts_dict['regx'] = 'test'
    opts_dict['startMon'] = 1
    opts_dict['endMon'] = 1
    opts_dict['fIndex'] = 151

    # This creates the dictionary of input arguments
    opts_dict = pyEnsLib.getopt_parseconfig(opts, optkeys, 'ES', opts_dict)

    verbose = opts_dict['verbose']
    st = opts_dict['esize']
    esize = int(st)

    if opts_dict['popens']:
        print("Error: Please use pyEnsSumPop.py for a POP ensemble (not --popens).")
        sys.exit()

    if not (opts_dict['tag'] and opts_dict['compset'] and opts_dict['mach']
            and opts_dict['res']):
        print('Please specify --tag, --compset, --mach and --res options')
        sys.exit()

    # Now find file names in indir
    input_dir = opts_dict['indir']
    # The var list that will be excluded
    ex_varlist = []
    inc_varlist = []

    # Create a mpi simplecomm object
    if opts_dict['mpi_enable']:
        me = simplecomm.create_comm()
    else:
        me = simplecomm.create_comm(not opts_dict['mpi_enable'])

    if me.get_rank() == 0:
        print('Running pyEnsSum!')

    if me.get_rank() == 0 and verbose:
        print(opts_dict)
        print('Ensemble size for summary = ', esize)

    exclude = False
    if me.get_rank() == 0:
        if opts_dict['jsonfile']:
            inc_varlist = []
            # Read in the excluded or included var list
            ex_varlist, exclude = pyEnsLib.read_jsonlist(opts_dict['jsonfile'], 'ES')
            if not exclude:
                inc_varlist = ex_varlist
                ex_varlist = []

    # Broadcast the excluded (or included) var list to each processor
    if opts_dict['mpi_enable']:
        exclude = me.partition(exclude, func=Duplicate(), involved=True)
        if exclude:
            ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True)
        else:
            inc_varlist = me.partition(inc_varlist, func=Duplicate(), involved=True)

    in_files = []
    if os.path.exists(input_dir):
        # Get the list of files
        in_files_temp = os.listdir(input_dir)
        in_files = sorted(in_files_temp)
        # Make sure we have enough
        num_files = len(in_files)
        if me.get_rank() == 0 and verbose:
            print('Number of files in input directory = ', num_files)
        if num_files < esize:
            if me.get_rank() == 0 and verbose:
                print('Number of files in input directory (', num_files,
                      ') is less than specified ensemble size of ', esize)
            sys.exit(2)
        if num_files > esize:
            if me.get_rank() == 0 and verbose:
                print('NOTE: Number of files in ', input_dir,
                      'is greater than specified ensemble size of ', esize,
                      '\nwill just use the first ', esize, 'files')
    else:
        if me.get_rank() == 0:
            print('Input directory: ', input_dir, ' not found')
        sys.exit(2)

    if opts_dict['cumul']:
        if opts_dict['regx']:
            in_files_list = get_cumul_filelist(opts_dict, opts_dict['indir'],
                                               opts_dict['regx'])
        in_files = me.partition(in_files_list, func=EqualLength(), involved=True)
        if me.get_rank() == 0 and verbose:
            print('in_files=', in_files)

    # Open the files in the input directory
    o_files = []
    if me.get_rank() == 0 and opts_dict['verbose']:
        print('Input files are: ')
        print("\n".join(in_files))
    for onefile in in_files[0:esize]:
        if os.path.isfile(input_dir + '/' + onefile):
            o_files.append(Nio.open_file(input_dir + '/' + onefile, "r"))
        else:
            if me.get_rank() == 0:
                print("COULD NOT LOCATE FILE ", input_dir + '/' + onefile,
                      "! EXITING....")
            sys.exit()

    # Store dimensions of the input fields
    if me.get_rank() == 0 and verbose:
        print("Getting spatial dimensions")
    nlev = -1
    nilev = -1
    ncol = -1
    nlat = -1
    nlon = -1
    lonkey = ''
    latkey = ''

    # Look at first file and get dims
    input_dims = o_files[0].dimensions
    ndims = len(input_dims)
    for key in input_dims:
        if key == "lev":
            nlev = input_dims["lev"]
        elif key == "ilev":
            nilev = input_dims["ilev"]
        elif key == "ncol":
            ncol = input_dims["ncol"]
        elif (key == "nlon") or (key == "lon"):
            nlon = input_dims[key]
            lonkey = key
        elif (key == "nlat") or (key == "lat"):
            nlat = input_dims[key]
            latkey = key

    if nlev == -1:
        if me.get_rank() == 0:
            print("COULD NOT LOCATE valid dimension lev => EXITING....")
        sys.exit()
    if (ncol == -1) and ((nlat == -1) or (nlon == -1)):
        if me.get_rank() == 0:
            print("Need either lat/lon or ncol => EXITING....")
        sys.exit()

    # Check if this is SE or FV data
    if ncol != -1:
        is_SE = True
    else:
        is_SE = False

    # Make sure all files have the same dimensions
    if me.get_rank() == 0 and verbose:
        print("Checking dimensions across files....")
        print('lev = ', nlev)
        if is_SE:
            print('ncol = ', ncol)
        else:
            print('nlat = ', nlat)
            print('nlon = ', nlon)
    for count, this_file in enumerate(o_files):
        input_dims = this_file.dimensions
        if is_SE:
            if nlev != int(input_dims["lev"]) or ncol != int(input_dims["ncol"]):
                if me.get_rank() == 0:
                    print("Dimension mismatch between ", in_files[0], 'and',
                          in_files[count], '!!!')
                sys.exit()
        else:
            if (nlev != int(input_dims["lev"]) or nlat != int(input_dims[latkey])
                    or nlon != int(input_dims[lonkey])):
                if me.get_rank() == 0:
                    print("Dimension mismatch between ", in_files[0], 'and',
                          in_files[count], '!!!')
                sys.exit()

    # Get 2d vars, 3d vars and all vars (For now include all variables)
    vars_dict_all = o_files[0].variables
    # Remove the excluded variables (specified in json file) from variable dictionary
    if exclude:
        vars_dict = vars_dict_all
        for i in ex_varlist:
            if i in vars_dict:
                del vars_dict[i]
    # Given an included var list, remove all float vars that are not on the list
    else:
        vars_dict = vars_dict_all.copy()
        for k, v in vars_dict_all.items():
            if (k not in inc_varlist) and (vars_dict_all[k].typecode() == 'f'):
                del vars_dict[k]

    num_vars = len(vars_dict)

    str_size = 0
def main(argv):
    # Get command line stuff and store in a dictionary
    s = ('tag= compset= esize= tslice= res= sumfile= indir= sumfiledir= mach= '
         'verbose jsonfile= mpi_enable maxnorm gmonly popens cumul regx= '
         'startMon= endMon= fIndex=')
    optkeys = s.split()
    try:
        opts, args = getopt.getopt(argv, "h", optkeys)
    except getopt.GetoptError:
        pyEnsLib.EnsSum_usage()
        sys.exit(2)

    # Put command line options in a dictionary - also set defaults
    opts_dict = {}

    # Defaults
    opts_dict['tag'] = 'cesm2_0_beta08'
    opts_dict['compset'] = 'F2000'
    opts_dict['mach'] = 'cheyenne'
    opts_dict['esize'] = 350
    opts_dict['tslice'] = 1
    opts_dict['res'] = 'f19_f19'
    opts_dict['sumfile'] = 'ens.summary.nc'
    opts_dict['indir'] = './'
    opts_dict['sumfiledir'] = './'
    opts_dict['jsonfile'] = 'exclude_empty.json'
    opts_dict['verbose'] = False
    opts_dict['mpi_enable'] = False
    opts_dict['maxnorm'] = False
    opts_dict['gmonly'] = True
    opts_dict['popens'] = False
    opts_dict['cumul'] = False
    opts_dict['regx'] = 'test'
    opts_dict['startMon'] = 1
    opts_dict['endMon'] = 1
    opts_dict['fIndex'] = 151

    # This creates the dictionary of input arguments
    opts_dict = pyEnsLib.getopt_parseconfig(opts, optkeys, 'ES', opts_dict)

    verbose = opts_dict['verbose']
    st = opts_dict['esize']
    esize = int(st)

    if not (opts_dict['tag'] and opts_dict['compset'] and opts_dict['mach']
            and opts_dict['res']):
        print('Please specify --tag, --compset, --mach and --res options')
        sys.exit()

    # Now find file names in indir
    input_dir = opts_dict['indir']
    # The var list that will be excluded
    ex_varlist = []
    inc_varlist = []

    # Create a mpi simplecomm object
    if opts_dict['mpi_enable']:
        me = simplecomm.create_comm()
    else:
        me = simplecomm.create_comm(not opts_dict['mpi_enable'])

    if me.get_rank() == 0:
        print('Running pyEnsSum!')

    if me.get_rank() == 0 and verbose:
        print(opts_dict)
        print('Ensemble size for summary = ', esize)

    exclude = False
    if me.get_rank() == 0:
        if opts_dict['jsonfile']:
            inc_varlist = []
            # Read in the excluded or included var list
            ex_varlist, exclude = pyEnsLib.read_jsonlist(opts_dict['jsonfile'], 'ES')
            if not exclude:
                inc_varlist = ex_varlist
                ex_varlist = []

    # Broadcast the excluded (or included) var list to each processor
    if opts_dict['mpi_enable']:
        exclude = me.partition(exclude, func=Duplicate(), involved=True)
        if exclude:
            ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True)
        else:
            inc_varlist = me.partition(inc_varlist, func=Duplicate(), involved=True)

    in_files = []
    if os.path.exists(input_dir):
        # Get the list of files
        in_files_temp = os.listdir(input_dir)
        in_files = sorted(in_files_temp)
        # Make sure we have enough
        num_files = len(in_files)
        if me.get_rank() == 0 and verbose:
            print('Number of files in input directory = ', num_files)
        if num_files < esize:
            if me.get_rank() == 0 and verbose:
                print('Number of files in input directory (', num_files,
                      ') is less than specified ensemble size of ', esize)
            sys.exit(2)
        if num_files > esize:
            if me.get_rank() == 0 and verbose:
                print('NOTE: Number of files in ', input_dir,
                      'is greater than specified ensemble size of ', esize,
                      '\nwill just use the first ', esize, 'files')
    else:
        if me.get_rank() == 0:
            print('Input directory: ', input_dir, ' not found')
        sys.exit(2)

    if opts_dict['cumul']:
        if opts_dict['regx']:
            in_files_list = get_cumul_filelist(opts_dict, opts_dict['indir'],
                                               opts_dict['regx'])
        in_files = me.partition(in_files_list, func=EqualLength(), involved=True)
        if me.get_rank() == 0 and verbose:
            print('in_files=', in_files)

    # Open the files in the input directory
    o_files = []
    if me.get_rank() == 0 and opts_dict['verbose']:
        print('Input files are: ')
        print("\n".join(in_files))
    for onefile in in_files[0:esize]:
        if os.path.isfile(input_dir + '/' + onefile):
            o_files.append(Nio.open_file(input_dir + '/' + onefile, "r"))
        else:
            if me.get_rank() == 0:
                print("COULD NOT LOCATE FILE " + input_dir + '/' + onefile +
                      "! EXITING....")
            sys.exit()

    # Store dimensions of the input fields
    if me.get_rank() == 0 and verbose:
        print("Getting spatial dimensions")
    nlev = -1
    nilev = -1
    ncol = -1
    nlat = -1
    nlon = -1
    lonkey = ''
    latkey = ''

    # Look at first file and get dims
    input_dims = o_files[0].dimensions
    ndims = len(input_dims)
    for key in input_dims:
        if key == "lev":
            nlev = input_dims["lev"]
        elif key == "ilev":
            nilev = input_dims["ilev"]
        elif key == "ncol":
            ncol = input_dims["ncol"]
        elif (key == "nlon") or (key == "lon"):
            nlon = input_dims[key]
            lonkey = key
        elif (key == "nlat") or (key == "lat"):
            nlat = input_dims[key]
            latkey = key

    if nlev == -1:
        if me.get_rank() == 0:
            print("COULD NOT LOCATE valid dimension lev => EXITING....")
        sys.exit()
    if (ncol == -1) and ((nlat == -1) or (nlon == -1)):
        if me.get_rank() == 0:
            print("Need either lat/lon or ncol => EXITING....")
        sys.exit()

    # Check if this is SE or FV data
    if ncol != -1:
        is_SE = True
    else:
        is_SE = False

    # Make sure all files have the same dimensions
    if me.get_rank() == 0 and verbose:
        print("Checking dimensions across files....")
        print('lev = ', nlev)
        if is_SE:
            print('ncol = ', ncol)
        else:
            print('nlat = ', nlat)
            print('nlon = ', nlon)
    for count, this_file in enumerate(o_files):
        input_dims = this_file.dimensions
        if is_SE:
            if nlev != int(input_dims["lev"]) or ncol != int(input_dims["ncol"]):
                if me.get_rank() == 0:
                    print("Dimension mismatch between ", in_files[0], 'and',
                          in_files[count], '!!!')
                sys.exit()
        else:
            if (nlev != int(input_dims["lev"]) or nlat != int(input_dims[latkey])
                    or nlon != int(input_dims[lonkey])):
                if me.get_rank() == 0:
                    print("Dimension mismatch between ", in_files[0], 'and',
                          in_files[count], '!!!')
                sys.exit()

    # Get 2d vars, 3d vars and all vars (For now include all variables)
    vars_dict_all = o_files[0].variables
    # Remove the excluded variables (specified in json file) from variable dictionary
    if exclude:
        vars_dict = vars_dict_all
        for i in ex_varlist:
            if i in vars_dict:
                del vars_dict[i]
    # Given an included var list, remove all float vars that are not on the list
    else:
        vars_dict = vars_dict_all.copy()
        for k, v in vars_dict_all.items():
            if (k not in inc_varlist) and (vars_dict_all[k].typecode() == 'f'):
                del vars_dict[k]

    num_vars = len(vars_dict)

    str_size = 0
    d2_var_names = []
    d3_var_names = []
    num_2d = 0
    num_3d = 0

    # Which are 2d, which are 3d, and max str_size
    for k, v in vars_dict.items():
        var = k
        vd = v.dimensions  # all the variable's dimensions (names)
        vr = v.rank        # number of dimensions
        vs = v.shape       # dim values
        is_2d = False
        is_3d = False
        if is_SE:  # (time, lev, ncol) or (time, ncol)
            if (vr == 2) and (vs[1] == ncol):
                is_2d = True
                num_2d += 1
            elif (vr == 3) and (vs[2] == ncol and vs[1] == nlev):
                is_3d = True
                num_3d += 1
        else:  # (time, lev, nlat, nlon) or (time, nlat, nlon)
            if (vr == 3) and (vs[1] == nlat and vs[2] == nlon):
                is_2d = True
                num_2d += 1
            elif (vr == 4) and (vs[2] == nlat and vs[3] == nlon
                                and (vs[1] == nlev or vs[1] == nilev)):
                is_3d = True
                num_3d += 1
        if is_3d:
            str_size = max(str_size, len(k))
            d3_var_names.append(k)
        elif is_2d:
            str_size = max(str_size, len(k))
            d2_var_names.append(k)

    if me.get_rank() == 0 and verbose:
        print('Number of variables found: ', num_3d + num_2d)
        print('3D variables: ' + str(num_3d) + ', 2D variables: ' + str(num_2d))

    # Now sort these and combine (this sorts caps first, then lower case -
    # which is what we want)
    d2_var_names.sort()
    d3_var_names.sort()

    if esize < num_2d + num_3d:
        if me.get_rank() == 0:
            print("************************************************************************************************************************************")
            print(" Error: the total number of 3D and 2D variables " + str(num_2d + num_3d) +
                  " is larger than the number of ensemble files " + str(esize))
            print(" Cannot generate ensemble summary file, please remove more variables from your included variable list,")
            print(" or add more variables in your excluded variable list!!!")
            print("************************************************************************************************************************************")
        sys.exit()

    # All vars is 3d vars first (sorted), then the 2d vars
    all_var_names = list(d3_var_names)
    all_var_names += d2_var_names
    n_all_var_names = len(all_var_names)

    # Create new summary ensemble file
    this_sumfile = opts_dict["sumfile"]
    if me.get_rank() == 0 and verbose:
        print("Creating ", this_sumfile, " ...")
    if me.get_rank() == 0 or opts_dict["popens"]:
        if os.path.exists(this_sumfile):
            os.unlink(this_sumfile)
        opt = Nio.options()
        opt.PreFill = False
        opt.Format = 'NetCDF4Classic'
        nc_sumfile = Nio.open_file(this_sumfile, 'w', options=opt)

        # Set dimensions
        if me.get_rank() == 0 and verbose:
            print("Setting dimensions .....")
        if is_SE:
            nc_sumfile.create_dimension('ncol', ncol)
        else:
            nc_sumfile.create_dimension('nlat', nlat)
            nc_sumfile.create_dimension('nlon', nlon)
        nc_sumfile.create_dimension('nlev', nlev)
        nc_sumfile.create_dimension('ens_size', esize)
        nc_sumfile.create_dimension('nvars', num_3d + num_2d)
        nc_sumfile.create_dimension('nvars3d', num_3d)
        nc_sumfile.create_dimension('nvars2d', num_2d)
        nc_sumfile.create_dimension('str_size', str_size)

        # Set global attributes
        now = time.strftime("%c")
        if me.get_rank() == 0 and verbose:
            print("Setting global attributes .....")
        setattr(nc_sumfile, 'creation_date', now)
        setattr(nc_sumfile, 'title', 'CAM verification ensemble summary file')
        setattr(nc_sumfile, 'tag', opts_dict["tag"])
        setattr(nc_sumfile, 'compset', opts_dict["compset"])
        setattr(nc_sumfile, 'resolution', opts_dict["res"])
        setattr(nc_sumfile, 'machine', opts_dict["mach"])

        # Create variables
        if me.get_rank() == 0 and verbose:
            print("Creating variables .....")
        v_lev = nc_sumfile.create_variable("lev", 'f', ('nlev',))
        v_vars = nc_sumfile.create_variable("vars", 'S1', ('nvars', 'str_size'))
        v_var3d = nc_sumfile.create_variable("var3d", 'S1', ('nvars3d', 'str_size'))
        v_var2d = nc_sumfile.create_variable("var2d", 'S1', ('nvars2d', 'str_size'))
        if not opts_dict['gmonly']:
            if is_SE:
                v_ens_avg3d = nc_sumfile.create_variable("ens_avg3d", 'f', ('nvars3d', 'nlev', 'ncol'))
                v_ens_stddev3d = nc_sumfile.create_variable("ens_stddev3d", 'f', ('nvars3d', 'nlev', 'ncol'))
                v_ens_avg2d = nc_sumfile.create_variable("ens_avg2d", 'f', ('nvars2d', 'ncol'))
                v_ens_stddev2d = nc_sumfile.create_variable("ens_stddev2d", 'f', ('nvars2d', 'ncol'))
            else:
                v_ens_avg3d = nc_sumfile.create_variable("ens_avg3d", 'f', ('nvars3d', 'nlev', 'nlat', 'nlon'))
                v_ens_stddev3d = nc_sumfile.create_variable("ens_stddev3d", 'f', ('nvars3d', 'nlev', 'nlat', 'nlon'))
                v_ens_avg2d = nc_sumfile.create_variable("ens_avg2d", 'f', ('nvars2d', 'nlat', 'nlon'))
                v_ens_stddev2d = nc_sumfile.create_variable("ens_stddev2d", 'f', ('nvars2d', 'nlat', 'nlon'))
            v_RMSZ = nc_sumfile.create_variable("RMSZ", 'f', ('nvars', 'ens_size'))
        v_gm = nc_sumfile.create_variable("global_mean", 'f', ('nvars', 'ens_size'))
        v_standardized_gm = nc_sumfile.create_variable("standardized_gm", 'f', ('nvars', 'ens_size'))
        v_loadings_gm = nc_sumfile.create_variable('loadings_gm', 'f', ('nvars', 'nvars'))
        v_mu_gm = nc_sumfile.create_variable('mu_gm', 'f', ('nvars',))
        v_sigma_gm = nc_sumfile.create_variable('sigma_gm', 'f', ('nvars',))
        v_sigma_scores_gm = nc_sumfile.create_variable('sigma_scores_gm', 'f', ('nvars',))

        # Assign vars, var3d and var2d
        if me.get_rank() == 0 and verbose:
            print("Assigning vars, var3d, and var2d .....")
        eq_all_var_names = []
        eq_d3_var_names = []
        eq_d2_var_names = []
        l_eq = len(all_var_names)
        for i in range(l_eq):
            tt = list(all_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_all_var_names.append(tt)
        l_eq = len(d3_var_names)
        for i in range(l_eq):
            tt = list(d3_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_d3_var_names.append(tt)
        l_eq = len(d2_var_names)
        for i in range(l_eq):
            tt = list(d2_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_d2_var_names.append(tt)
        v_vars[:] = eq_all_var_names[:]
        v_var3d[:] = eq_d3_var_names[:]
        v_var2d[:] = eq_d2_var_names[:]

        # Time-invariant metadata
        if me.get_rank() == 0 and verbose:
            print("Assigning time invariant metadata .....")
        lev_data = vars_dict["lev"]
        v_lev[:] = lev_data[:]

    # Form ensembles, each missing one member; compute RMSZs and global means
    # for each variable (we also do the max norm, currently done in pyStats)
    tslice = opts_dict['tslice']
    if not opts_dict['cumul']:
        # Partition the var list
        var3_list_loc = me.partition(d3_var_names, func=EqualStride(), involved=True)
        var2_list_loc = me.partition(d2_var_names, func=EqualStride(), involved=True)
    else:
        var3_list_loc = d3_var_names
        var2_list_loc = d2_var_names

    # Calculate global means
    if me.get_rank() == 0 and verbose:
        print("Calculating global means .....")
    if not opts_dict['cumul']:
        gm3d, gm2d, var_list = pyEnsLib.generate_global_mean_for_summary(
            o_files, var3_list_loc, var2_list_loc, is_SE, False, opts_dict)
    if me.get_rank() == 0 and verbose:
        print("Finish calculating global means .....")

    # Calculate RMSZ scores
    if (not opts_dict['gmonly']) or opts_dict['cumul']:
        if me.get_rank() == 0 and verbose:
            print("Calculating RMSZ scores .....")
        zscore3d, zscore2d, ens_avg3d, ens_stddev3d, ens_avg2d, ens_stddev2d, temp1, temp2 = pyEnsLib.calc_rmsz(
            o_files, var3_list_loc, var2_list_loc, is_SE, opts_dict)

    # Calculate max norm ensemble
    if opts_dict['maxnorm']:
        if me.get_rank() == 0 and verbose:
            print("Calculating max norm of ensembles .....")
        pyEnsLib.calculate_maxnormens(opts_dict, var3_list_loc)
        pyEnsLib.calculate_maxnormens(opts_dict, var2_list_loc)

    if opts_dict['mpi_enable'] and not opts_dict['popens']:
        if not opts_dict['cumul']:
            # Gather the 3d variable results from all processors to the master processor
            slice_index = get_stride_list(len(d3_var_names), me)
            # Gather global means 3d results
            gm3d = gather_npArray(gm3d, me, slice_index,
                                  (len(d3_var_names), len(o_files)))
            if not opts_dict['gmonly']:
                # Gather zscore3d results
                zscore3d = gather_npArray(zscore3d, me, slice_index,
                                          (len(d3_var_names), len(o_files)))
                # Gather ens_avg3d and ens_stddev3d results
                shape_tuple3d = get_shape(ens_avg3d.shape, len(d3_var_names), me.get_rank())
                ens_avg3d = gather_npArray(ens_avg3d, me, slice_index, shape_tuple3d)
                ens_stddev3d = gather_npArray(ens_stddev3d, me, slice_index, shape_tuple3d)

            # Gather 2d variable results from all processors to the master processor
            slice_index = get_stride_list(len(d2_var_names), me)
            # Gather global means 2d results
            gm2d = gather_npArray(gm2d, me, slice_index,
                                  (len(d2_var_names), len(o_files)))
            var_list = gather_list(var_list, me)
            if not opts_dict['gmonly']:
                # Gather zscore2d results
                zscore2d = gather_npArray(zscore2d, me, slice_index,
                                          (len(d2_var_names), len(o_files)))
                # Gather ens_avg2d and ens_stddev2d results
                shape_tuple2d = get_shape(ens_avg2d.shape, len(d2_var_names), me.get_rank())
                ens_avg2d = gather_npArray(ens_avg2d, me, slice_index, shape_tuple2d)
                ens_stddev2d = gather_npArray(ens_stddev2d, me, slice_index, shape_tuple2d)
        else:
            gmall = np.concatenate((temp1, temp2), axis=0)
            gmall = pyEnsLib.gather_npArray_pop(
                gmall, me, (me.get_size(), len(d3_var_names) + len(d2_var_names)))

    # Assign to file:
    if me.get_rank() == 0 or opts_dict['popens']:
        if not opts_dict['cumul']:
            gmall = np.concatenate((gm3d, gm2d), axis=0)
            if not opts_dict['gmonly']:
                Zscoreall = np.concatenate((zscore3d, zscore2d), axis=0)
                v_RMSZ[:, :] = Zscoreall[:, :]
            if not opts_dict['gmonly']:
                if is_SE:
                    v_ens_avg3d[:, :, :] = ens_avg3d[:, :, :]
                    v_ens_stddev3d[:, :, :] = ens_stddev3d[:, :, :]
                    v_ens_avg2d[:, :] = ens_avg2d[:, :]
                    v_ens_stddev2d[:, :] = ens_stddev2d[:, :]
                else:
                    v_ens_avg3d[:, :, :, :] = ens_avg3d[:, :, :, :]
                    v_ens_stddev3d[:, :, :, :] = ens_stddev3d[:, :, :, :]
                    v_ens_avg2d[:, :, :] = ens_avg2d[:, :, :]
                    v_ens_stddev2d[:, :, :] = ens_stddev2d[:, :, :]
        else:
            gmall_temp = np.transpose(gmall[:, :])
            gmall = gmall_temp
        mu_gm, sigma_gm, standardized_global_mean, loadings_gm, scores_gm = pyEnsLib.pre_PCA(
            gmall, all_var_names, var_list, me)
        v_gm[:, :] = gmall[:, :]
        v_standardized_gm[:, :] = standardized_global_mean[:, :]
        v_mu_gm[:] = mu_gm[:]
        v_sigma_gm[:] = sigma_gm[:].astype(np.float32)
        v_loadings_gm[:, :] = loadings_gm[:, :]
        v_sigma_scores_gm[:] = scores_gm[:]

    if me.get_rank() == 0:
        print("All Done")
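# The scripts in this collection rely on three partition functions. As
# exercised by the unit tests above, Duplicate() broadcasts the whole object
# to every rank; EqualStride() deals out every size-th item (the pattern the
# gather helpers invert); and EqualLength() appears to split the list into
# near-equal contiguous chunks (used for the --cumul file list). A small
# sketch, again assuming the asaptools import layout (the exact item order
# returned by EqualStride/EqualLength is an assumption):
from asaptools import simplecomm
from asaptools.partition import Duplicate, EqualLength, EqualStride

me = simplecomm.create_comm()
items = list(range(10))

dup = me.partition(items, func=Duplicate(), involved=True)        # all 10 items on every rank
strided = me.partition(items, func=EqualStride(), involved=True)  # items[rank::size]
chunked = me.partition(items, func=EqualLength(), involved=True)  # contiguous ~10/size items
print(me.get_rank(), len(dup), strided, chunked)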
def testPartitionInt(self):
    data = 13 + self.rank
    sresult = self.scomm.partition(data, func=Duplicate())
    presult = self.pcomm.partition(data, func=Duplicate())
    self.assertEqual(sresult, presult)
def main(argv):
    # Get command line stuff and store in a dictionary
    s = ('tag= compset= esize= tslice= res= sumfile= indir= sumfiledir= mach= '
         'verbose jsonfile= mpi_enable maxnorm gmonly popens cumul regx= '
         'startMon= endMon= fIndex= mpi_disable')
    optkeys = s.split()
    try:
        opts, args = getopt.getopt(argv, "h", optkeys)
    except getopt.GetoptError:
        pyEnsLib.EnsSum_usage()
        sys.exit(2)

    # Put command line options in a dictionary - also set defaults
    opts_dict = {}

    # Defaults
    opts_dict['tag'] = 'cesm2_0'
    opts_dict['compset'] = 'F2000climo'
    opts_dict['mach'] = 'cheyenne'
    opts_dict['esize'] = 350
    opts_dict['tslice'] = 1
    opts_dict['res'] = 'f19_f19'
    opts_dict['sumfile'] = 'ens.summary.nc'
    opts_dict['indir'] = './'
    opts_dict['sumfiledir'] = './'
    opts_dict['jsonfile'] = 'exclude_empty.json'
    opts_dict['verbose'] = False
    opts_dict['mpi_enable'] = True
    opts_dict['mpi_disable'] = False
    opts_dict['maxnorm'] = False
    opts_dict['gmonly'] = True
    opts_dict['popens'] = False
    opts_dict['cumul'] = False
    opts_dict['regx'] = 'test'
    opts_dict['startMon'] = 1
    opts_dict['endMon'] = 1
    opts_dict['fIndex'] = 151

    # This creates the dictionary of input arguments
    opts_dict = pyEnsLib.getopt_parseconfig(opts, optkeys, 'ES', opts_dict)

    verbose = opts_dict['verbose']
    st = opts_dict['esize']
    esize = int(st)

    if opts_dict['popens']:
        print("ERROR: Please use pyEnsSumPop.py for a POP ensemble (not --popens) => EXITING....")
        sys.exit()

    if not (opts_dict['tag'] and opts_dict['compset'] and opts_dict['mach']
            and opts_dict['res']):
        print('ERROR: Please specify --tag, --compset, --mach and --res options => EXITING....')
        sys.exit()

    if opts_dict['mpi_disable']:
        opts_dict['mpi_enable'] = False

    # Now find file names in indir
    input_dir = opts_dict['indir']
    # The var list that will be excluded
    ex_varlist = []
    inc_varlist = []

    # Create a mpi simplecomm object
    if opts_dict['mpi_enable']:
        me = simplecomm.create_comm()
    else:
        me = simplecomm.create_comm(not opts_dict['mpi_enable'])

    if me.get_rank() == 0:
        print('STATUS: Running pyEnsSum.py')

    if me.get_rank() == 0 and verbose:
        print(opts_dict)
        print('STATUS: Ensemble size for summary = ', esize)

    exclude = False
    if me.get_rank() == 0:
        if opts_dict['jsonfile']:
            inc_varlist = []
            # Read in the excluded or included var list
            ex_varlist, exclude = pyEnsLib.read_jsonlist(opts_dict['jsonfile'], 'ES')
            if not exclude:
                inc_varlist = ex_varlist
                ex_varlist = []

    # Broadcast the excluded (or included) var list to each processor
    if opts_dict['mpi_enable']:
        exclude = me.partition(exclude, func=Duplicate(), involved=True)
        if exclude:
            ex_varlist = me.partition(ex_varlist, func=Duplicate(), involved=True)
        else:
            inc_varlist = me.partition(inc_varlist, func=Duplicate(), involved=True)

    in_files = []
    if os.path.exists(input_dir):
        # Get the list of files
        in_files_temp = os.listdir(input_dir)
        in_files = sorted(in_files_temp)
        # Make sure we have enough
        num_files = len(in_files)
        if me.get_rank() == 0 and verbose:
            print('VERBOSE: Number of files in input directory = ', num_files)
        if num_files < esize:
            if me.get_rank() == 0 and verbose:
                print('VERBOSE: Number of files in input directory (', num_files,
                      ') is less than specified ensemble size of ', esize)
            sys.exit(2)
        if num_files > esize:
            if me.get_rank() == 0 and verbose:
                print('VERBOSE: Note that the number of files in ', input_dir,
                      'is greater than specified ensemble size of ', esize,
                      '\nwill just use the first ', esize, 'files')
    else:
        if me.get_rank() == 0:
            print('ERROR: Input directory: ', input_dir, ' not found')
        sys.exit(2)

    if opts_dict['cumul']:
        if opts_dict['regx']:
            in_files_list = get_cumul_filelist(opts_dict, opts_dict['indir'],
                                               opts_dict['regx'])
        in_files = me.partition(in_files_list, func=EqualLength(), involved=True)
        if me.get_rank() == 0 and verbose:
            print('VERBOSE: in_files = ', in_files)

    # Check full file names in input directory (don't open yet)
    full_in_files = []
    if me.get_rank() == 0 and opts_dict['verbose']:
        print('VERBOSE: Input files are: ')
    for onefile in in_files[0:esize]:
        fname = input_dir + '/' + onefile
        if me.get_rank() == 0 and opts_dict['verbose']:
            print(fname)
        if os.path.isfile(fname):
            full_in_files.append(fname)
        else:
            if me.get_rank() == 0:
                print("ERROR: Could not locate file ", fname, " => EXITING....")
            sys.exit()

    # Open just the first file
    first_file = nc.Dataset(full_in_files[0], "r")

    # Store dimensions of the input fields
    if me.get_rank() == 0 and verbose:
        print("VERBOSE: Getting spatial dimensions")
    nlev = -1
    nilev = -1
    ncol = -1
    nlat = -1
    nlon = -1
    lonkey = ''
    latkey = ''

    # Look at first file and get dims
    input_dims = first_file.dimensions
    ndims = len(input_dims)
    for key in input_dims:
        if key == "lev":
            nlev = len(input_dims["lev"])
        elif key == "ilev":
            nilev = len(input_dims["ilev"])
        elif key == "ncol":
            ncol = len(input_dims["ncol"])
        elif (key == "nlon") or (key == "lon"):
            nlon = len(input_dims[key])
            lonkey = key
        elif (key == "nlat") or (key == "lat"):
            nlat = len(input_dims[key])
            latkey = key

    if nlev == -1:
        if me.get_rank() == 0:
            print("ERROR: could not locate a valid dimension (lev) => EXITING....")
        sys.exit()
    if (ncol == -1) and ((nlat == -1) or (nlon == -1)):
        if me.get_rank() == 0:
            print("ERROR: Need either lat/lon or ncol => EXITING....")
        sys.exit()

    # Check if this is SE or FV data
    if ncol != -1:
        is_SE = True
    else:
        is_SE = False

    # Output dimensions
    if me.get_rank() == 0 and verbose:
        print('lev = ', nlev)
        if is_SE:
            print('ncol = ', ncol)
        else:
            print('nlat = ', nlat)
            print('nlon = ', nlon)

    # Get 2d vars, 3d vars and all vars (For now include all variables)
    vars_dict_all = first_file.variables

    # Remove the excluded variables (specified in json file) from variable dictionary
    if exclude:
        vars_dict = vars_dict_all
        for i in ex_varlist:
            if i in vars_dict:
                del vars_dict[i]
    # Given an included var list, remove all the variables that are not on the list
    else:
        vars_dict = vars_dict_all.copy()
        for k, v in vars_dict_all.items():
            if (k not in inc_varlist) and (vars_dict_all[k].typecode() == 'f'):
                del vars_dict[k]

    num_vars = len(vars_dict)

    str_size = 0
    d2_var_names = []
    d3_var_names = []
    num_2d = 0
    num_3d = 0

    # Which are 2d, which are 3d, and max str_size
    for k, v in vars_dict.items():
        var = k
        vd = v.dimensions       # all the variable's dimensions (names)
        vr = len(v.dimensions)  # number of dimensions
        vs = v.shape            # dim values
        is_2d = False
        is_3d = False
        if is_SE:  # (time, lev, ncol) or (time, ncol)
            if (vr == 2) and (vs[1] == ncol):
                is_2d = True
                num_2d += 1
            elif (vr == 3) and (vs[2] == ncol and vs[1] == nlev):
                is_3d = True
                num_3d += 1
        else:  # (time, lev, nlat, nlon) or (time, nlat, nlon)
            if (vr == 3) and (vs[1] == nlat and vs[2] == nlon):
                is_2d = True
                num_2d += 1
            elif (vr == 4) and (vs[2] == nlat and vs[3] == nlon
                                and (vs[1] == nlev or vs[1] == nilev)):
                is_3d = True
                num_3d += 1
        if is_3d:
            str_size = max(str_size, len(k))
            d3_var_names.append(k)
        elif is_2d:
            str_size = max(str_size, len(k))
            d2_var_names.append(k)

    if me.get_rank() == 0 and verbose:
        print('VERBOSE: Number of variables found: ', num_3d + num_2d)
        print('VERBOSE: 3D variables: ' + str(num_3d) +
              ', 2D variables: ' + str(num_2d))

    # Now sort these and combine (this sorts caps first, then lower case -
    # which is what we want)
    d2_var_names.sort()
    d3_var_names.sort()

    if esize < num_2d + num_3d:
        if me.get_rank() == 0:
            print("************************************************************************************************************************************")
            print(" ERROR: the total number of 3D and 2D variables " + str(num_2d + num_3d) +
                  " is larger than the number of ensemble files " + str(esize))
            print(" Cannot generate ensemble summary file, please remove more variables from your included variable list,")
            print(" or add more variables in your excluded variable list => EXITING....")
            print("************************************************************************************************************************************")
        sys.exit()

    # All vars is 3d vars first (sorted), then the 2d vars
    all_var_names = list(d3_var_names)
    all_var_names += d2_var_names
    n_all_var_names = len(all_var_names)

    # Rank 0 - Create new summary ensemble file
    this_sumfile = opts_dict["sumfile"]

    # Check if directory is valid
    sum_dir = os.path.dirname(this_sumfile)
    if len(sum_dir) == 0:
        sum_dir = '.'
    if not os.path.exists(sum_dir):
        if me.get_rank() == 0:
            print('ERROR: Summary file directory: ', sum_dir, ' not found')
        sys.exit(2)

    if me.get_rank() == 0:
        if verbose:
            print("VERBOSE: Creating ", this_sumfile, " ...")
        if os.path.exists(this_sumfile):
            os.unlink(this_sumfile)
        nc_sumfile = nc.Dataset(this_sumfile, "w", format="NETCDF4_CLASSIC")

        # Set dimensions
        if verbose:
            print("VERBOSE: Setting dimensions .....")
        if is_SE:
            nc_sumfile.createDimension('ncol', ncol)
        else:
            nc_sumfile.createDimension('nlat', nlat)
            nc_sumfile.createDimension('nlon', nlon)
        nc_sumfile.createDimension('nlev', nlev)
        nc_sumfile.createDimension('ens_size', esize)
        nc_sumfile.createDimension('nvars', num_3d + num_2d)
        nc_sumfile.createDimension('nvars3d', num_3d)
        nc_sumfile.createDimension('nvars2d', num_2d)
        nc_sumfile.createDimension('str_size', str_size)

        # Set global attributes
        now = time.strftime("%c")
        if verbose:
            print("VERBOSE: Setting global attributes .....")
        nc_sumfile.creation_date = now
        nc_sumfile.title = 'CAM verification ensemble summary file'
        nc_sumfile.tag = opts_dict["tag"]
        nc_sumfile.compset = opts_dict["compset"]
        nc_sumfile.resolution = opts_dict["res"]
        nc_sumfile.machine = opts_dict["mach"]

        # Create variables
        if verbose:
            print("VERBOSE: Creating variables .....")
        v_lev = nc_sumfile.createVariable("lev", 'f8', ('nlev',))
        v_vars = nc_sumfile.createVariable("vars", 'S1', ('nvars', 'str_size'))
        v_var3d = nc_sumfile.createVariable("var3d", 'S1', ('nvars3d', 'str_size'))
        v_var2d = nc_sumfile.createVariable("var2d", 'S1', ('nvars2d', 'str_size'))
        v_gm = nc_sumfile.createVariable("global_mean", 'f8', ('nvars', 'ens_size'))
        v_standardized_gm = nc_sumfile.createVariable("standardized_gm", 'f8', ('nvars', 'ens_size'))
        v_loadings_gm = nc_sumfile.createVariable('loadings_gm', 'f8', ('nvars', 'nvars'))
        v_mu_gm = nc_sumfile.createVariable('mu_gm', 'f8', ('nvars',))
        v_sigma_gm = nc_sumfile.createVariable('sigma_gm', 'f8', ('nvars',))
        v_sigma_scores_gm = nc_sumfile.createVariable('sigma_scores_gm', 'f8', ('nvars',))

        # Assign vars, var3d and var2d
        if verbose:
            print("VERBOSE: Assigning vars, var3d, and var2d .....")
        eq_all_var_names = []
        eq_d3_var_names = []
        eq_d2_var_names = []
        l_eq = len(all_var_names)
        for i in range(l_eq):
            tt = list(all_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_all_var_names.append(tt)
        l_eq = len(d3_var_names)
        for i in range(l_eq):
            tt = list(d3_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_d3_var_names.append(tt)
        l_eq = len(d2_var_names)
        for i in range(l_eq):
            tt = list(d2_var_names[i])
            l_tt = len(tt)
            if l_tt < str_size:
                extra = list(' ') * (str_size - l_tt)
                tt.extend(extra)
            eq_d2_var_names.append(tt)
        v_vars[:] = eq_all_var_names[:]
        v_var3d[:] = eq_d3_var_names[:]
        v_var2d[:] = eq_d2_var_names[:]

        # Time-invariant metadata
        if verbose:
            print("VERBOSE: Assigning time invariant metadata .....")
        lev_data = first_file.variables["lev"]
        v_lev[:] = lev_data[:]
    # End of rank=0 work

    # All ranks:
    tslice = opts_dict['tslice']
    if not opts_dict['cumul']:
        # Partition the var list
        var3_list_loc = me.partition(d3_var_names, func=EqualStride(), involved=True)
        var2_list_loc = me.partition(d2_var_names, func=EqualStride(), involved=True)
    else:
        var3_list_loc = d3_var_names
        var2_list_loc = d2_var_names

    # Close first_file
    first_file.close()

    # Calculate global means
    if me.get_rank() == 0 and verbose:
        print("VERBOSE: Calculating global means .....")
    if not opts_dict['cumul']:
        gm3d, gm2d, var_list = pyEnsLib.generate_global_mean_for_summary(
            full_in_files, var3_list_loc, var2_list_loc, is_SE, False, opts_dict)
    if me.get_rank() == 0 and verbose:
        print("VERBOSE: Finished calculating global means .....")

    # Gather to rank = 0
    if opts_dict['mpi_enable']:
        if not opts_dict['cumul']:
            # Gather the 3d variable results from all processors to the master processor
            slice_index = get_stride_list(len(d3_var_names), me)
            # Gather global means 3d results
            gm3d = gather_npArray(gm3d, me, slice_index,
                                  (len(d3_var_names), len(full_in_files)))
            # Gather 2d variable results from all processors to the master processor
            slice_index = get_stride_list(len(d2_var_names), me)
            # Gather global means 2d results
            gm2d = gather_npArray(gm2d, me, slice_index,
                                  (len(d2_var_names), len(full_in_files)))
            # Gather variables to exclude (in pre_PCA)
            var_list = gather_list(var_list, me)
        else:
            gmall = np.concatenate((temp1, temp2), axis=0)
            gmall = pyEnsLib.gather_npArray_pop(
                gmall, me, (me.get_size(), len(d3_var_names) + len(d2_var_names)))

    # Rank 0: complete calculations for summary file
    if me.get_rank() == 0:
        if not opts_dict['cumul']:
            gmall = np.concatenate((gm3d, gm2d), axis=0)
        else:
            gmall_temp = np.transpose(gmall[:, :])
            gmall = gmall_temp

        # PCA prep and calculation
        mu_gm, sigma_gm, standardized_global_mean, loadings_gm, scores_gm, b_exit = pyEnsLib.pre_PCA(
            gmall, all_var_names, var_list, me)

        # If the PCA calc encounters an error, then remove the summary file and exit
        if b_exit:
            nc_sumfile.close()
            os.unlink(this_sumfile)
            print("STATUS: Summary could not be created.")
            sys.exit(2)

        v_gm[:, :] = gmall[:, :]
        v_standardized_gm[:, :] = standardized_global_mean[:, :]
        v_mu_gm[:] = mu_gm[:]
        v_sigma_gm[:] = sigma_gm[:]
        v_loadings_gm[:, :] = loadings_gm[:, :]
        v_sigma_scores_gm[:] = scores_gm[:]

        print("STATUS: Summary file is complete.")

        nc_sumfile.close()
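# A hypothetical direct invocation of the script above, for illustration only:
# the flag names come from the optkeys string in main(), but the values and
# paths here are placeholders, not a tested configuration. With MPI enabled
# (the default in this version) the script would normally be launched through
# mpirun/mpiexec rather than called directly.
if __name__ == "__main__":
    main(['--esize', '350',
          '--indir', './ens_runs',        # placeholder input directory
          '--sumfile', 'ens.summary.nc',
          '--tag', 'cesm2_0',
          '--compset', 'F2000climo',
          '--mach', 'cheyenne',
          '--res', 'f19_f19',
          '--verbose'])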
def _inspect_input_files(self):
    """
    Inspect the input data files themselves.

    We check the file contents here, which means opening and reading
    heading information from the files.
    """
    # Set the I/O backend according to what is specified
    iobackend.set_backend(self._backend)

    # Initialize the list of variable names for each category
    udim = None
    timeta = []
    xtra_timeta = []
    tvmeta = []

    # Initialize the local dictionary of time-series variables and sizes
    all_tsvars = {}
    file_times = {}

    #===== INSPECT FIRST INPUT FILE (ON MASTER PROCESS ONLY) =====

    # Open the first file
    if self._simplecomm.is_manager():
        ifile = iobackend.NCFile(self._input_filenames[0])

        # Look for the 'unlimited' dimension
        try:
            udim = next(dim for dim in ifile.dimensions if ifile.unlimited(dim))
        except StopIteration:
            err_msg = 'Unlimited dimension not found.'
            raise LookupError(err_msg)

        # Get the first file's time values
        file_times[self._input_filenames[0]] = ifile.variables[udim][:]

        # Categorize each variable (only looking at the first file)
        for var_name, var in ifile.variables.items():
            if udim not in var.dimensions:
                if var_name not in self._exclude_list:
                    timeta.append(var_name)
            elif var_name in self._metadata_names or (self._1d_metadata and len(var.dimensions) == 1):
                tvmeta.append(var_name)
            elif self._time_series_names is None or var_name in self._time_series_names:
                all_tsvars[var_name] = var.datatype.itemsize * var.size

        # Close the first file
        ifile.close()

        # Find variables only in the metadata file
        if self._metadata_filename is not None:
            ifile = iobackend.NCFile(self._metadata_filename)
            for var_name, var in ifile.variables.items():
                if udim not in var.dimensions and var_name not in timeta:
                    xtra_timeta.append(var_name)
            ifile.close()

    self._simplecomm.sync()

    # Send the information to the worker processes
    self._unlimited_dim = self._simplecomm.partition(
        udim, func=Duplicate(), involved=True)
    self._time_invariant_metadata = self._simplecomm.partition(
        timeta, func=Duplicate(), involved=True)
    self._time_invariant_metafile_vars = self._simplecomm.partition(
        xtra_timeta, func=Duplicate(), involved=True)
    self._time_variant_metadata = self._simplecomm.partition(
        tvmeta, func=Duplicate(), involved=True)
    all_tsvars = self._simplecomm.partition(
        all_tsvars, func=Duplicate(), involved=True)
    self._simplecomm.sync()

    if self._simplecomm.is_manager():
        self._vprint('  First input file inspected.', verbosity=2)

    #===== INSPECT REMAINING INPUT FILES (IN PARALLEL) =====

    # Get the full set of variable names for the missing-variable check
    var_names = set(list(all_tsvars.keys()) +
                    self._time_invariant_metadata +
                    self._time_invariant_metafile_vars +
                    self._time_variant_metadata)
    missing_vars = set()

    # Partition the remaining filenames to inspect
    input_filenames = self._simplecomm.partition(
        self._input_filenames[1:], func=EqualStride(), involved=True)

    # Make a pass through the remaining files and:
    # (1) Make sure each has the 'unlimited' dimension
    # (2) Make sure this dimension is truly 'unlimited'
    # (3) Check that this dimension has a corresponding variable
    # (4) Check if there are any missing variables
    # (5) Get the time values from the files
    for ifilename in input_filenames:
        ifile = iobackend.NCFile(ifilename)

        # Determine the unlimited dimension
        if self._unlimited_dim not in ifile.dimensions:
            err_msg = 'Unlimited dimension not found in file "{0}"'.format(ifilename)
            raise LookupError(err_msg)
        if not ifile.unlimited(self._unlimited_dim):
            err_msg = 'Dimension "{0}" not unlimited in file "{1}"'.format(
                self._unlimited_dim, ifilename)
            raise LookupError(err_msg)
        if self._unlimited_dim not in ifile.variables:
            err_msg = 'Unlimited dimension variable not found in file "{0}"'.format(ifilename)
            raise LookupError(err_msg)

        # Get the time values (list of NDArrays)
        file_times[ifilename] = ifile.variables[self._unlimited_dim][:]

        # Get the missing variables
        var_names_next = set(ifile.variables.keys())
        missing_vars.update(var_names - var_names_next)

        # Close the file
        ifile.close()

    self._simplecomm.sync()
    if self._simplecomm.is_manager():
        self._vprint('  Remaining input files inspected.', verbosity=2)

    #===== CHECK FOR MISSING VARIABLES =====

    # Gather all missing variables on the master process
    if self._simplecomm.get_size() > 1:
        if self._simplecomm.is_manager():
            for _ in range(1, self._simplecomm.get_size()):
                missing_vars.update(self._simplecomm.collect()[1])
        else:
            self._simplecomm.collect(missing_vars)
    self._simplecomm.sync()

    # Check for missing variables only on the master process
    if self._simplecomm.is_manager():
        # Remove metafile variables from the missing vars set
        missing_vars -= set(self._time_invariant_metafile_vars)

        # Make sure that the list of variables in each file is the same
        if len(missing_vars) != 0:
            warning = ('WARNING: Some variables are not in all input files:{0}    '
                       '{1}').format(linesep, ', '.join(sorted(missing_vars)))
            self._vprint(warning, header=False, verbosity=0)

        self._vprint('  Checked for missing variables.', verbosity=2)

    #===== SORT INPUT FILES BY TIME =====

    # Gather the file time values onto the master process
    if self._simplecomm.get_size() > 1:
        if self._simplecomm.is_manager():
            for _ in range(1, self._simplecomm.get_size()):
                file_times.update(self._simplecomm.collect()[1])
        else:
            self._simplecomm.collect(file_times)
    self._simplecomm.sync()

    # Check the order of the input files based on the time values
    if self._simplecomm.is_manager():

        # Determine the sort order based on the first time in each file's
        # time values
        old_order = range(len(self._input_filenames))
        new_order = sorted(old_order, key=lambda i: file_times[self._input_filenames[i]][0])

        # Re-order the list of input filenames and time values
        new_filenames = [self._input_filenames[i] for i in new_order]
        new_values = [file_times[self._input_filenames[i]] for i in new_order]

        # Now, check that the largest time in each file is less than the
        # smallest time in the next file (so that the time spans of the
        # files do not overlap)
        for i in range(1, len(new_values)):
            if new_values[i - 1][-1] >= new_values[i][0]:
                err_msg = ('Times in input files {0} and {1} appear to '
                           'overlap').format(new_filenames[i - 1], new_filenames[i])
                raise ValueError(err_msg)

    else:
        new_filenames = None

    # Now that the order is validated, save the filenames in the new order
    # on all processes
    self._input_filenames = self._simplecomm.partition(
        new_filenames, func=Duplicate(), involved=True)

    if self._simplecomm.is_manager():
        self._vprint('  Input files sorted by time.', verbosity=2)

    #===== FINALIZE OUTPUT =====
    self._simplecomm.sync()

    # Debug output
    if self._simplecomm.is_manager():
        self._vprint('  Time-Invariant Metadata: {0}'.format(
            ', '.join(self._time_invariant_metadata)), verbosity=1)
        if len(self._time_invariant_metafile_vars) > 0:
            self._vprint('  Additional Time-Invariant Metadata: {0}'.format(
                ', '.join(self._time_invariant_metafile_vars)), verbosity=1)
        self._vprint('  Time-Variant Metadata: {0}'.format(
            ', '.join(self._time_variant_metadata)), verbosity=1)
        self._vprint('  Time-Series Variables: {0}'.format(
            ', '.join(all_tsvars.keys())), verbosity=1)

    # Add a 'once' variable if writing to a once file.
    # NOTE: This is a "cheat"!  There is no 'once' variable.  It is just
    # a catch-all for the metadata IFF the 'once-file' feature is enabled.
    if self._use_once_file:
        all_tsvars['once'] = max(all_tsvars.values())

    # Partition the time-series variables across all processors
    self._time_series_variables = self._simplecomm.partition(
        list(all_tsvars.items()), func=WeightBalanced(), involved=True)
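# ---------------------------------------------------------------------------
# Illustration (not part of the original class): the WeightBalanced partition
# above distributes (name, byte-size) pairs so that each rank receives a
# roughly equal total weight rather than an equal count. A minimal sketch of
# one way to achieve this -- greedy largest-first assignment to the least
# loaded bucket -- is given below; the actual WeightBalanced algorithm may
# differ in detail, and _weight_balanced_sketch is a hypothetical helper.
import heapq

def _weight_balanced_sketch(items, size):
    """Distribute (name, weight) pairs across `size` buckets, balancing total weight."""
    buckets = [(0, rank, []) for rank in range(size)]  # (load, rank, names)
    heapq.heapify(buckets)
    for name, weight in sorted(items, key=lambda kv: kv[1], reverse=True):
        load, rank, names = heapq.heappop(buckets)  # least-loaded bucket
        names.append(name)
        heapq.heappush(buckets, (load + weight, rank, names))
    return {rank: names for _, rank, names in buckets}

# Example: _weight_balanced_sketch([('T', 800), ('Q', 800), ('PS', 100)], 2)
# assigns 'T' and 'Q' to different ranks and 'PS' joins one of them, giving
# loads of 900 and 800 instead of the 1600/100 split a naive count-based
# halving could produce.
# ---------------------------------------------------------------------------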
def main(argv=None):
    args = cli(argv)

    # Create the necessary SimpleComm
    scomm = create_comm(serial=args.serial)

    # Do setup only on the manager node
    if scomm.is_manager():

        # Check that the specfile exists
        if not exists(args.stdfile):
            raise OSError(('Output specification file {!r} not '
                           'found').format(args.stdfile))

        # Read the specfile into a dictionary
        print('Reading standardization file: {}'.format(args.stdfile))
        dsdict = json_load(open(args.stdfile, 'r'), object_pairs_hook=OrderedDict)

        # Parse the output Dataset
        print('Creating output dataset descriptor from standardization file...')
        outds = OutputDatasetDesc(dsdict=dsdict)

    else:
        outds = None

    # Send the output descriptor to all nodes
    outds = scomm.partition(outds, func=Duplicate(), involved=True)

    # Sync
    scomm.sync()

    # Continue setup only on the manager node
    if scomm.is_manager():

        # Gather the list of input files
        infiles = []
        for infile in args.infiles:
            infiles.extend(glob(infile))

        # If there are no input files, stop here
        if len(infiles) == 0:
            print('Standardization file validated.')
            return

        # Parse the input Dataset
        print('Creating input dataset descriptor from {} input files...'.format(len(infiles)))
        inpds = InputDatasetDesc(filenames=infiles)

    else:
        inpds = None

    # Send the input descriptor to all nodes
    inpds = scomm.partition(inpds, func=Duplicate(), involved=True)

    # Sync and continue the process on all nodes
    scomm.sync()

    # Turn validation warnings into errors, if requested
    if args.error:
        simplefilter('error', ValidationWarning)

    # Try importing all of the necessary user-defined modules
    if args.module is not None:
        for i, modpath in enumerate(args.module):
            load_source('user{}'.format(i), modpath)

    # Set up the PyConform data flow on all nodes
    if scomm.is_manager():
        print('Creating the data flow...')
    dataflow = DataFlow(inpds, outds)

    # Execute the data flow (write to files)
    history = not args.no_history
    dataflow.execute(
        chunks=dict(args.chunks),
        scomm=scomm,
        history=history,
        deflate=args.deflate,
        debug=args.debug,
    )
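# ---------------------------------------------------------------------------
# Illustration (not part of the original script): main() above repeatedly uses
# the manager-computes-then-broadcasts pattern, in which only the manager rank
# builds an object and partition(..., func=Duplicate(), involved=True) copies
# it to every rank. A minimal sketch of that pattern is shown below, assuming
# the asaptools import paths used elsewhere in this code base; the payload
# contents and the helper name are purely illustrative.
from asaptools.partition import Duplicate
from asaptools.simplecomm import create_comm

def _broadcast_from_manager_sketch():
    scomm = create_comm(serial=False)
    # Workers pass None; only the manager's value survives the duplication
    payload = {'setup': 'done'} if scomm.is_manager() else None
    payload = scomm.partition(payload, func=Duplicate(), involved=True)
    scomm.sync()
    return payload  # the identical dict on every rank
# ---------------------------------------------------------------------------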