class flag_baseline(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Accept a list of baselines (in the format used by NDPPP logging).

    Flag them in all MeasurementSets.
    """
    inputs = {
        'baselines': ingredient.ListField(
            '--baselines',
            help="Baselines (in NDPPP format, eg 1&1)"),
        'nproc': ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=8)
    }
    outputs = {'mapfile': ingredient.FileField()}

    def go(self):
        self.logger.info("Starting flag_baseline run")
        super(flag_baseline, self).go()

        # Serialise list of baselines to disk for compute nodes to pick up
        # ----------------------------------------------------------------------
        fd, baseline_filename = mkstemp(
            dir=self.config.get("layout", "job_directory"))
        baseline_file = os.fdopen(fd, "w")
        dump(self.inputs["baselines"], baseline_file)
        baseline_file.close()

        # try block ensures baseline_filename is always unlinked
        # ----------------------------------------------------------------------
        try:
            # Load file <-> compute node mapping from disk
            # ------------------------------------------------------------------
            self.logger.debug("Loading map from %s" % self.inputs['args'][0])
            data = load_data_map(self.inputs['args'][0])

            command = "python %s" % (self.__file__.replace('master', 'nodes'))
            jobs = []
            for host, ms in data:
                jobs.append(
                    ComputeJob(
                        host, command,
                        arguments=[ms, baseline_filename]))
            self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
        finally:
            os.unlink(baseline_filename)

        if self.error.isSet():
            return 1
        else:
            self.outputs['mapfile'] = self.inputs['args'][0]
            return 0
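
# Illustrative sketch (not part of the recipe): the master serialises the
# baseline list with dump() into a temporary file and passes its path to each
# compute job. A node-side script could recover it roughly as follows,
# assuming pickle serialisation; the function name is hypothetical.
import pickle

def read_baselines(baseline_filename):
    # Load the baseline list written by the master recipe.
    with open(baseline_filename, "rb") as baseline_file:
        return pickle.load(baseline_file)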
class thumbnail_combine(BaseRecipe, RemoteCommandRecipeMixIn):
    inputs = {
        'executable': ingredient.ExecField(
            '--executable',
            default="/usr/bin/montage",
            help="montage executable"),
        'file_pattern': ingredient.StringField(
            '--file-pattern',
            default="*.th.png",
            help="File search pattern (glob)",
        ),
        'input_dir': ingredient.StringField(
            '--input-dir',
            help="Directory containing input files"),
        'output_file': ingredient.StringField(
            '--output-file',
            help="Output filename"),
        'clobber': ingredient.BoolField(
            '--clobber',
            default=False,
            help="Clobber pre-existing output files"),
        'target_hosts': ingredient.ListField(
            '--target-hosts',
            help="Remote hosts on which to execute")
    }

    def go(self):
        self.logger.info("Starting thumbnail_combine run")
        super(thumbnail_combine, self).go()

        hosts = self.inputs['target_hosts']
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for host in hosts:
            jobs.append(
                ComputeJob(
                    host, command,
                    arguments=[
                        self.inputs['executable'],
                        self.inputs['file_pattern'],
                        self.inputs['input_dir'],
                        self.inputs['output_file'],
                        self.inputs['clobber']
                    ]))
        self._schedule_jobs(jobs)

        if self.error.isSet():
            self.logger.warn("Failed compute job process detected")
            return 1
        else:
            return 0
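
# Illustrative sketch (not part of the recipe): on each target host the node
# script is expected to glob for thumbnails and hand them to montage. A
# minimal stand-in, assuming montage accepts the matched files followed by
# the output filename; the function name is hypothetical.
import glob
import os
import subprocess

def combine_thumbnails(executable, file_pattern, input_dir, output_file, clobber):
    # Refuse to overwrite existing output unless clobber is requested.
    if os.path.exists(output_file) and not clobber:
        raise RuntimeError("%s already exists and clobber is False" % output_file)
    files = sorted(glob.glob(os.path.join(input_dir, file_pattern)))
    if not files:
        raise RuntimeError("No input files matched %s" % file_pattern)
    subprocess.check_call([executable] + files + [output_file])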
class cimager(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Provides a convenient, pipeline-based mechanism of running the cimager on
    a dataset.

    Can ingest either an MWimager-style parset, converting to cimager format
    as required, or a cimager parset directly.

    **Arguments**

    A mapfile describing the data to be processed.
    """
    inputs = {
        'imager_exec': ingredient.ExecField(
            '--imager-exec',
            help="cimager executable"),
        'convert_exec': ingredient.ExecField(
            '--convert-exec',
            help="convertimagerparset executable"),
        'parset': ingredient.FileField(
            '--parset',
            help="Imager configuration parset (mwimager or cimager format)"),
        'nproc': ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=8),
        'timestep': ingredient.FloatField(
            '--timestep',
            help="If non-zero, multiple images will be made, each using "
                 "timestep seconds of data",
            default=0.0),
        'results_dir': ingredient.DirectoryField(
            '--results-dir',
            help="Directory in which resulting images will be placed",
        ),
        'parset_type': ParsetTypeField(
            '--parset-type',
            default="mwimager",
            help="cimager or mwimager"),
        'makevds': ingredient.ExecField(
            '--makevds',
            help="makevds executable",
            default="/opt/LofIm/daily/lofar/bin/makevds"),
        'combinevds': ingredient.ExecField(
            '--combinevds',
            help="combinevds executable",
            default="/opt/LofIm/daily/lofar/bin/combinevds")
    }
    outputs = {'images': ingredient.ListField()}

    def go(self):
        self.logger.info("Starting cimager run")
        super(cimager, self).go()
        self.outputs['images'] = []

        # Build a GVDS file describing all the data to be processed
        # ----------------------------------------------------------------------
        self.logger.debug("Building VDS file describing all data for cimager")
        gvds_file = os.path.join(self.config.get("layout", "job_directory"),
                                 "vds", "cimager.gvds")
        inputs = LOFARinput(self.inputs)
        inputs['args'] = self.inputs['args']
        inputs['gvds'] = gvds_file
        inputs['unlink'] = False
        inputs['makevds'] = self.inputs['makevds']
        inputs['combinevds'] = self.inputs['combinevds']
        inputs['nproc'] = self.inputs['nproc']
        inputs['directory'] = os.path.dirname(gvds_file)
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('vdsmaker', inputs, outputs):
            self.logger.warn("vdsmaker reports failure")
            return 1
        self.logger.debug("cimager GVDS is %s" % (gvds_file,))

        # Read data for processing from the GVDS file
        # ----------------------------------------------------------------------
        parset = Parset(gvds_file)
        data = []
        for part in range(parset.getInt('NParts')):
            host = parset.getString("Part%d.FileSys" % part).split(":")[0]
            vds = parset.getString("Part%d.Name" % part)
            data.append((host, vds))

        # Divide data into timesteps for imaging
        # timesteps is a list of (start, end, results directory) tuples
        # ----------------------------------------------------------------------
        timesteps = []
        results_dir = self.inputs['results_dir']
        if self.inputs['timestep'] == 0:
            self.logger.info("No timestep specified; imaging all data")
            timesteps = [(None, None, results_dir)]
        else:
            self.logger.info("Using timestep of %s s" %
                             self.inputs['timestep'])
            gvds = get_parset(gvds_file)
            start_time = quantity(gvds['StartTime'].get()).get('s').get_value()
            end_time = quantity(gvds['EndTime'].get()).get('s').get_value()
            step = float(self.inputs['timestep'])
            while start_time < end_time:
                timesteps.append((start_time, start_time + step,
                                  os.path.join(results_dir, str(start_time))))
                start_time += step

        # Run each cimager process in a separate thread
        # ----------------------------------------------------------------------
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        for label, timestep in enumerate(timesteps):
            self.logger.info("Processing timestep %d" % label)
            jobs = []
            parsets = []
            start_time, end_time, resultsdir = timestep
            for host, vds in data:
                vds_data = Parset(vds)
                frequency_range = [
                    vds_data.getDoubleVector("StartFreqs")[0],
                    vds_data.getDoubleVector("EndFreqs")[-1]
                ]
                parsets.append(
                    self.__get_parset(
                        os.path.basename(
                            vds_data.getString('FileName')).split('.')[0],
                        vds_data.getString("FileName"),
                        str(frequency_range),
                        vds_data.getStringVector("Extra.FieldDirectionType")[0],
                        vds_data.getStringVector("Extra.FieldDirectionRa")[0],
                        vds_data.getStringVector("Extra.FieldDirectionDec")[0],
                        'True',  # cimager bug: non-restored image unusable
                    ))
                jobs.append(
                    ComputeJob(
                        host, command,
                        arguments=[
                            self.inputs['imager_exec'],
                            vds,
                            parsets[-1],
                            resultsdir,
                            start_time,
                            end_time
                        ]))
            self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
            for parset in parsets:
                parset = Parset(parset)
                image_names = parset.getStringVector("Cimager.Images.Names")
                self.outputs['images'].extend(image_names)
            [os.unlink(parset) for parset in parsets]

        # Check if we recorded a failing process before returning
        # ----------------------------------------------------------------------
        if self.error.isSet():
            self.logger.warn("Failed imager process detected")
            return 1
        else:
            return 0

    def __get_parset(self, name, dataset, frequency, ms_dir_type, ms_dir_ra,
                     ms_dir_dec, restore):
        def convert_mwimager_parset(parset):
            try:
                with patched_parset(
                    parset,
                    {
                        'dataset': dataset,
                        'Images.frequency': frequency,
                        'msDirType': ms_dir_type,
                        'msDirRa': ms_dir_ra,
                        'msDirDec': ms_dir_dec,
                        'restore': restore  # cimager bug: non-restored image unusable
                    }
                ) as cimager_parset:
                    fd, converted_parset = tempfile.mkstemp(
                        dir=self.config.get("layout", "job_directory"))
                    convert_process = spawn_process(
                        [
                            self.inputs['convert_exec'],
                            cimager_parset,
                            converted_parset
                        ],
                        self.logger)
                    os.close(fd)
                    sout, serr = convert_process.communicate()
                    log_process_output(self.inputs['convert_exec'], sout,
                                       serr, self.logger)
                    if convert_process.returncode != 0:
                        raise subprocess.CalledProcessError(
                            convert_process.returncode,
                            self.inputs['convert_exec'])
                    return converted_parset
            except OSError as e:
                self.logger.error("Failed to spawn convertimagerparset (%s)" %
                                  str(e))
                raise
            except subprocess.CalledProcessError as e:
                self.logger.error(str(e))
                raise

        def populate_cimager_parset(parset):
            input_parset = Parset(parset)
            patch_dictionary = {
                'Cimager.dataset': dataset,
                'Cimager.restore': restore
            }
            image_names = []
            for image_name in input_parset.getStringVector(
                'Cimager.Images.Names'
            ):
                image_names.append("%s_%s" % (image_name, name))
                subset = input_parset.makeSubset(
                    "Cimager.Images.%s" % image_name,
                    "Cimager.Images.%s" % image_names[-1])
                patch_dictionary["Cimager.Images.%s.frequency" %
                                 image_names[-1]] = frequency
                patch_dictionary["Cimager.Images.%s.direction" %
                                 image_names[-1]] = "[ %s,%s,%s ]" % (
                    ms_dir_ra, ms_dir_dec, ms_dir_type)
                for key in subset:
                    patch_dictionary[key] = subset[key].get()
            input_parset.subtractSubset('Cimager.Images.image')
            for key in input_parset:
                patch_dictionary[key] = input_parset[key].get()
            patch_dictionary['Cimager.Images.Names'] = "[ %s ]" % ", ".join(
                image_names)
            return patch_parset(
                None, patch_dictionary,
                self.config.get("layout", "job_directory"))

        try:
            if self.inputs['parset_type'] == "mwimager":
                cimager_parset = convert_mwimager_parset(self.inputs['parset'])
            elif self.inputs['parset_type'] == "cimager":
                cimager_parset = populate_cimager_parset(self.inputs['parset'])
        except Exception as e:
            self.logger.exception("Failed to generate imager parset")
            raise

        return cimager_parset
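
# Illustrative sketch (not part of the recipe): the timestep handling above
# chops the [StartTime, EndTime] interval of the GVDS into fixed-length
# slices, each with its own results directory. The same logic in isolation:
import os

def make_timesteps(start_time, end_time, step, results_dir):
    # Return a list of (start, end, results_directory) tuples; with step == 0
    # the whole observation is imaged into results_dir in one go.
    if step == 0:
        return [(None, None, results_dir)]
    timesteps = []
    while start_time < end_time:
        timesteps.append((start_time, start_time + step,
                          os.path.join(results_dir, str(start_time))))
        start_time += step
    return timesteps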
class dppp(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Runs ``NDPPP`` on a number of MeasurementSets. This is used for averaging,
    and/or flagging, and/or demixing of data.

    1. Load input data files
    2. Load parmdb and sourcedb
    3. Call the node side of the recipe
    4. Create mapfile with successful noderecipe runs

    **Command line arguments**

    1. A mapfile describing the data to be processed.
    2. Optionally, a mapfile with target output locations.
    """
    inputs = {
        'parset': ingredient.FileField(
            '-p', '--parset',
            help="The full path to a DPPP configuration parset. The ``msin`` "
                 "and ``msout`` keys will be added by this recipe"),
        'executable': ingredient.ExecField(
            '--executable',
            help="The full path to the relevant DPPP executable"),
        'suffix': ingredient.StringField(
            '--suffix',
            default=".dppp",
            help="Added to the input filename to generate the output filename"
        ),
        'working_directory': ingredient.StringField(
            '-w', '--working-directory',
            help="Working directory used on output nodes. Results will be "
                 "written here"),
        'mapfile': ingredient.StringField(
            '--mapfile',
            help="Name of the output mapfile containing the names of the "
                 "MS-files produced by the DPPP recipe"),
        'parmdb_mapfile': ingredient.StringField(
            '--parmdb-mapfile',
            optional=True,
            help="Path to mapfile containing the parmdb files "
                 "(used by demixing step only)"),
        'sourcedb_mapfile': ingredient.StringField(
            '--sourcedb-mapfile',
            optional=True,
            help="Path to mapfile containing the sourcedb files "
                 "(used by demixing step only)"),
        'demix_always': ingredient.ListField(
            '--demix-always',
            help="List of sources that must always be demixed "
                 "(used by demixing step only)",
            default=[]),
        'demix_if_needed': ingredient.ListField(
            '--demix-if-needed',
            help="List of sources that will only be demixed if needed, "
                 "based on some heuristics (used by demixing step only)",
            default=[]),
        # NB times are read from vds file as string
        'data_start_time': ingredient.StringField(
            '--data-start-time',
            default="",
            help="Start time to be passed to DPPP; used to pad data"),
        'data_end_time': ingredient.StringField(
            '--data-end-time',
            default="",
            help="End time to be passed to DPPP; used to pad data"),
        'nproc': ingredient.IntField(
            '--nproc',
            default=8,
            help="Maximum number of simultaneous processes per output node"),
        'nthreads': ingredient.IntField(
            '--nthreads',
            default=2,
            help="Number of threads per (N)DPPP process"),
        'clobber': ingredient.BoolField(
            '--clobber',
            default=False,
            help="If ``True``, pre-existing output files will be removed "
                 "before processing starts. If ``False``, the pipeline will "
                 "abort if files already exist with the appropriate output "
                 "filenames")
        # Keys that are present in the original demixing recipe.
        # Don't know yet if we still need them.
#        'timestep': ingredient.IntField(
#            '--timestep',
#            help="Time step for averaging",
#            default=10
#        ),
#        'freqstep': ingredient.IntField(
#            '--freqstep',
#            help="Frequency step for averaging",
#            default=60
#        ),
#        'half_window': ingredient.IntField(
#            '--half-window',
#            help="Window size of median filter",
#            default=20
#        ),
#        'threshold': ingredient.FloatField(
#            '--threshold',
#            help="Solutions above/below threshold*rms are smoothed",
#            default=2.5
#        ),
    }

    outputs = {
        'mapfile': ingredient.FileField(
            help="The full path to a mapfile describing the processed data"
#        ),
#        'fullyflagged': ingredient.ListField(
#            help="A list of all baselines which were completely flagged in any "
#                 "of the input MeasurementSets"
        )
    }

    def go(self):
        self.logger.info("Starting DPPP run")
        super(dppp, self).go()

#        # Keep track of "Total flagged" messages in the DPPP logs
#        # ----------------------------------------------------------------------
#        self.logger.searchpatterns["fullyflagged"] = "Fully flagged baselines"

        # *********************************************************************
        # 1. load input data file, validate output vs the input location if
        #    output locations are provided
        args = self.inputs['args']
        self.logger.debug("Loading input-data mapfile: %s" % args[0])
        indata = DataMap.load(args[0])
        if len(args) > 1:
            self.logger.debug("Loading output-data mapfile: %s" % args[1])
            outdata = DataMap.load(args[1])
        else:
            outdata = copy.deepcopy(indata)
            for item in outdata:
                item.file = os.path.join(
                    self.inputs['working_directory'],
                    self.inputs['job_name'],
                    os.path.basename(item.file) + self.inputs['suffix'])

        # ********************************************************************
        # 2. Load parmdb and sourcedb
        # Load parmdb-mapfile, if one was given.
        if self.inputs.has_key('parmdb_mapfile'):
            self.logger.debug("Loading parmdb mapfile: %s" %
                              self.inputs['parmdb_mapfile'])
            parmdbdata = DataMap.load(self.inputs['parmdb_mapfile'])
        else:
            parmdbdata = copy.deepcopy(indata)
            for item in parmdbdata:
                item.file = ''

        # Load sourcedb-mapfile, if one was given.
        if self.inputs.has_key('sourcedb_mapfile'):
            self.logger.debug("Loading sourcedb mapfile: %s" %
                              self.inputs['sourcedb_mapfile'])
            sourcedbdata = DataMap.load(self.inputs['sourcedb_mapfile'])
        else:
            sourcedbdata = copy.deepcopy(indata)
            for item in sourcedbdata:
                item.file = ''

        # Validate all the data maps.
        if not validate_data_maps(indata, outdata, parmdbdata, sourcedbdata):
            self.logger.error("Validation of data mapfiles failed!")
            return 1

        # Update the skip fields of the four maps. If 'skip' is True in any of
        # these maps, then 'skip' must be set to True in all maps.
        for w, x, y, z in zip(indata, outdata, parmdbdata, sourcedbdata):
            w.skip = x.skip = y.skip = z.skip = (
                w.skip or x.skip or y.skip or z.skip
            )

        # ********************************************************************
        # 3. Call the node side of the recipe
        # Create and schedule the compute jobs
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        indata.iterator = outdata.iterator = DataMap.SkipIterator
        parmdbdata.iterator = sourcedbdata.iterator = DataMap.SkipIterator
        jobs = []
        for inp, outp, pdb, sdb in zip(indata, outdata, parmdbdata,
                                       sourcedbdata):
            jobs.append(
                ComputeJob(
                    inp.host, command,
                    arguments=[
                        inp.file,
                        outp.file,
                        pdb.file,
                        sdb.file,
                        self.inputs['parset'],
                        self.inputs['executable'],
                        self.environment,
                        self.inputs['demix_always'],
                        self.inputs['demix_if_needed'],
                        self.inputs['data_start_time'],
                        self.inputs['data_end_time'],
                        self.inputs['nthreads'],
                        self.inputs['clobber']
                    ],
                    resources={"cores": self.inputs['nthreads']}))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
        for job, outp in zip(jobs, outdata):
            if job.results['returncode'] != 0:
                outp.skip = True

#        # *********************************************************************
#        # 4. parse logfile for fully flagged baselines
#        matches = self.logger.searchpatterns["fullyflagged"].results
#        self.logger.searchpatterns.clear()  # finished searching
#        stripchars = "".join(set("Fully flagged baselines: "))
#        baselinecounter = defaultdict(lambda: 0)
#        for match in matches:
#            for pair in (
#                pair.strip(stripchars) for pair in match.getMessage().split(";")
#            ):
#                baselinecounter[pair] += 1
#        self.outputs['fullyflagged'] = baselinecounter.keys()

        # *********************************************************************
        # 4. Check job results, and create output data map file
        if self.error.isSet():
            # Abort if all jobs failed
            if all(job.results['returncode'] != 0 for job in jobs):
                self.logger.error("All jobs failed. Bailing out!")
                return 1
            else:
                self.logger.warn(
                    "Some jobs failed, continuing with succeeded runs")
        self.logger.debug("Writing data map file: %s" % self.inputs['mapfile'])
        outdata.save(self.inputs['mapfile'])
        self.outputs['mapfile'] = self.inputs['mapfile']
        return 0
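
# Illustrative sketch (not part of the recipe): the four data maps above are
# kept in lock-step by OR-ing their skip flags, so that an entry skipped in
# any one map is skipped everywhere. The same idea on plain objects:
def sync_skip_flags(*data_maps):
    # Set skip to True in every map wherever any map has skip set.
    for items in zip(*data_maps):
        skip = any(item.skip for item in items)
        for item in items:
            item.skip = skip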
class vdsreader(BaseRecipe):
    """
    Read a GVDS file and return a list of the MS filenames referenced therein
    together with selected metadata.

    This recipe performs its functionality at the master side of the recipe:

    1. Open the gvds file as a parameterset
    2. Convert all part FileNames to MSs
    3. Parse start and end time and pointing information

    **no command line arguments:**
    """
    inputs = {
        'gvds': ingredient.FileField(
            '-g', '--gvds',
            help="GVDS file to process"
        )
    }
    outputs = {
        'data': ingredient.ListField(help="List of MeasurementSet paths"),
        'start_time': ingredient.StringField(help="Start time of observation"),
        'end_time': ingredient.StringField(help="End time of observation"),
        'pointing': ingredient.DictField(help="Observation pointing direction")
    }

    def go(self):
        self.logger.info("Starting vdsreader run")
        super(vdsreader, self).go()

        # *********************************************************************
        # 1. Open the gvds file as a parameterset
        try:
            gvds = parameterset(self.inputs['gvds'])
        except:
            self.logger.error("Unable to read G(V)DS file")
            raise

        self.logger.info("Building list of measurementsets")

        # **********************************************************************
        # 2. Convert all Part<n>.FileName values to MS names
        ms_names = [
            gvds.getString("Part%d.FileName" % (part_no,))
            for part_no in range(gvds.getInt("NParts"))
        ]
        self.logger.debug(ms_names)
        self.outputs['data'] = ms_names

        # **********************************************************************
        # 3. Parse start and end time and pointing information
        try:
            self.outputs['start_time'] = gvds.getString('StartTime')
            self.outputs['end_time'] = gvds.getString('EndTime')
        except:
            self.logger.warn("Failed to read start/end time from GVDS file")
        try:
            self.outputs['pointing'] = {
                'type': gvds.getStringVector('Extra.FieldDirectionType')[0],
                'dec': gvds.getStringVector('Extra.FieldDirectionDec')[0],
                'ra': gvds.getStringVector('Extra.FieldDirectionRa')[0]
            }
        except:
            self.logger.warn("Failed to read pointing information from GVDS file")
        return 0
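
# Illustrative sketch (not part of the recipe): the GVDS keys vdsreader reads,
# shown as an example parameterset fragment. The values below are made up and
# only illustrate the expected shapes.
EXAMPLE_GVDS = """
NParts                   = 2
Part0.FileName           = /data/L12345/L12345_SB000_uv.MS
Part1.FileName           = /data/L12345/L12345_SB001_uv.MS
StartTime                = 2012-01-01T00:00:00
EndTime                  = 2012-01-01T01:00:00
Extra.FieldDirectionType = [J2000]
Extra.FieldDirectionRa   = [6.123]
Extra.FieldDirectionDec  = [1.026]
"""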
class executable_args(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Basic script for running an executable with arguments.
    Passing a mapfile along so the executable can process MS.
    """
    inputs = {
        'executable': ingredient.ExecField(
            '--executable',
            help="The full path to the relevant executable",
            optional=True
        ),
        'arguments': ingredient.ListField(
            '-a', '--arguments',
            help="List of arguments for the executable. Will be added as "
                 "./exe arg0 arg1...",
            default='',
            optional=True
        ),
        'nthreads': ingredient.IntField(
            '--nthreads',
            default=8,
            help="Number of threads per process"
        ),
        'nodescript': ingredient.StringField(
            '--nodescript',
            help="Name of the node script to execute",
            default='executable_args',
            optional=True
        ),
        'parset': ingredient.FileField(
            '-p', '--parset',
            help="Path to the arguments for this executable. Will be "
                 "converted to --key=value",
            optional=True
        ),
        'inputkey': ingredient.StringField(
            '-i', '--inputkey',
            help="Parset key that the executable will recognize as key for "
                 "the input file",
            default='',
            optional=True
        ),
        'outputkey': ingredient.StringField(
            '-0', '--outputkey',
            help="Parset key that the executable will recognize as key for "
                 "the output file",
            default='',
            optional=True
        ),
        'inputkeys': ingredient.ListField(
            '--inputkeys',
            help="List of parset keys that the executable will recognize as "
                 "keys for input files",
            default=[],
            optional=True
        ),
        'outputkeys': ingredient.ListField(
            '--outputkeys',
            help="List of parset keys that the executable will recognize as "
                 "keys for output files",
            default=[],
            optional=True
        ),
        'mapfiles_in': ingredient.ListField(
            '--mapfiles-in',
            help="List of the input mapfiles containing the names of the "
                 "data to run the recipe on",
            default=[],
            optional=True
        ),
        'mapfiles_as_string': ingredient.ListField(
            '--mapfiles_as_string',
            help="List of the input mapfiles to ignore and just use the name "
                 "string instead.",
            default=[],
            optional=True
        ),
        'mapfiles_out': ingredient.ListField(
            '--mapfiles-out',
            help="List of the output mapfiles containing the names of the "
                 "data produced by the recipe",
            default=[],
            optional=True
        ),
        'mapfile_in': ingredient.StringField(
            '--mapfile-in',
            help="Name of the input mapfile containing the names of the "
                 "MS-files to run the recipe on",
            default='',
            optional=True
        ),
        'mapfile_out': ingredient.StringField(
            '--mapfile-out',
            help="Name of the output mapfile containing the names of the "
                 "MS-files produced by the recipe",
            default='',
            optional=True
        ),
        'skip_infile': ingredient.BoolField(
            '--skip-infile',
            help="Don't give the input file to the executable.",
            default=False,
            optional=True
        ),
        'skip_outfile': ingredient.BoolField(
            '--skip-outfile',
            help="Don't produce an output file",
            default=False,
            optional=True
        ),
        'inplace': ingredient.BoolField(
            '--inplace',
            help="Manipulate input files in place",
            default=False,
            optional=True
        ),
        'outputsuffixes': ingredient.ListField(
            '--outputsuffixes',
            help="Suffixes for the output files",
            default=[]
        ),
        'parsetasfile': ingredient.BoolField(
            '--parsetasfile',
            help="Whether the arguments are passed as a parset file or as "
                 "--opt=var options",
            default=False
        ),
        'args_format': ingredient.StringField(
            '--args_format',
            help="Will change the format of the arguments. Standard "
                 "definitions are not yet settled",
            default='gnu'
        ),
        'args_format_argument': ingredient.StringField(
            '--args_format_argument',
            help="Will change the format of arguments without option fields.",
            default=''
        ),
        'args_format_option': ingredient.StringField(
            '--args_format_option',
            help="Will change the format of option fields.",
            default='-'
        ),
        'args_format_longoption': ingredient.StringField(
            '--args_format_longoption',
            help="Will change the format of long option fields. Typically '--'",
            default='--'
        ),
        'args_format_option_argument': ingredient.StringField(
            '--args_format_option_argument',
            help="Will change the separator between an option and its argument.",
            default='='
        ),
        'max_per_node': ingredient.IntField(
            '--max_per_node',
            help="Sets the number of jobs per node",
            default=0
        ),
        'stepname': ingredient.StringField(
            '--stepname',
            help="Step name for individual naming of results",
            optional=True
        ),
        'environment': ingredient.DictField(
            '--environment',
            help="Update environment variables for this step.",
            optional=True
        ),
        'error_tolerance': ingredient.BoolField(
            '--error_tolerance',
            help="Controls whether the program exits on the first error or "
                 "continues with the MSs that succeeded.",
            default=True,
            optional=True
        )
    }

    outputs = {
        'mapfile': ingredient.FileField(
            help="The full path to a mapfile describing the processed data"
        )
    }

    def go(self):
        if 'executable' in self.inputs:
            executable = self.inputs['executable']

        if self.inputs['nthreads']:
            self.environment["OMP_NUM_THREADS"] = str(self.inputs['nthreads'])

        if 'environment' in self.inputs:
            self.environment.update(self.inputs['environment'])

        self.logger.info("Starting %s run" % executable)
        super(executable_args, self).go()

        # args format stuff
        args_format = {
            'args_format': self.inputs['args_format'],
            'args_format_argument': self.inputs['args_format_argument'],
            'args_format_option': self.inputs['args_format_option'],
            'args_formatlongoption': self.inputs['args_format_longoption'],
            'args_format_option_argument': self.inputs['args_format_option_argument']
        }
        mapfile_dir = os.path.join(self.config.get("layout", "job_directory"),
                                   "mapfiles")
        work_dir = os.path.join(self.inputs['working_directory'],
                                self.inputs['job_name'])

        # *********************************************************************
        # try loading input/output data file, validate output vs the input
        # location if output locations are provided
        try:
            inputmapfiles = []
            inlist = []
            if self.inputs['mapfile_in']:
                inlist.append(self.inputs['mapfile_in'])

            if self.inputs['mapfiles_in']:
                for item in self.inputs['mapfiles_in']:
                    inlist.append(item)
                self.inputs['mapfile_in'] = self.inputs['mapfiles_in'][0]

            for item in inlist:
                inputmapfiles.append(DataMap.load(item))

        except Exception:
            self.logger.error('Could not load input Mapfile %s' % inlist)
            return 1

        outputmapfiles = []
        if self.inputs['mapfile_out']:
            try:
                outdata = DataMap.load(self.inputs['mapfile_out'])
                outputmapfiles.append(outdata)
            except Exception:
                self.logger.error('Could not load output Mapfile %s' %
                                  self.inputs['mapfile_out'])
                return 1
            # sync skip fields in the mapfiles
            align_data_maps(inputmapfiles[0], outputmapfiles[0])

        elif self.inputs['mapfiles_out']:
            for item in self.inputs['mapfiles_out']:
                outputmapfiles.append(DataMap.load(item))
            self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0]

        else:
            # output will be directed to the working directory if no output
            # mapfile is specified
            outdata = copy.deepcopy(inputmapfiles[0])
            if not self.inputs['inplace']:
                for item in outdata:
                    item.file = os.path.join(
                        self.inputs['working_directory'],
                        self.inputs['job_name'],
                        #os.path.basename(item.file) + '.' + os.path.split(str(executable))[1]
                        os.path.splitext(os.path.basename(item.file))[0] +
                        '.' + self.inputs['stepname']
                    )
                self.inputs['mapfile_out'] = os.path.join(
                    mapfile_dir, self.inputs['stepname'] + '.' + 'mapfile')
                self.inputs['mapfiles_out'].append(self.inputs['mapfile_out'])
            else:
                self.inputs['mapfile_out'] = self.inputs['mapfile_in']
                self.inputs['mapfiles_out'].append(self.inputs['mapfile_out'])
            outputmapfiles.append(outdata)

        if not validate_data_maps(inputmapfiles[0], outputmapfiles[0]):
            self.logger.error(
                "Validation of data mapfiles failed!"
            )
            return 1

        if self.inputs['outputsuffixes']:
            # Handle multiple output files
            for name in self.inputs['outputsuffixes']:
                outputmapfiles.append(copy.deepcopy(inputmapfiles[0]))
                self.inputs['mapfiles_out'].append(
                    os.path.join(mapfile_dir,
                                 self.inputs['stepname'] + name + '.' + 'mapfile'))
                for item in outputmapfiles[-1]:
                    item.file = os.path.join(
                        work_dir,
                        os.path.splitext(os.path.basename(item.file))[0] +
                        '.' + self.inputs['stepname'] + name
                    )
            self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0]

        # prepare arguments
        arglist = self.inputs['arguments']
        parsetdict = {}
        if 'parset' in self.inputs:
            parset = Parset()
            parset.adoptFile(self.inputs['parset'])
            for k in parset.keys:
                parsetdict[k] = str(parset[k])

        # construct multiple input data
        if self.inputs['inputkey'] and not self.inputs['inputkey'] in self.inputs['inputkeys']:
            self.inputs['inputkeys'].insert(0, self.inputs['inputkey'])

        if not self.inputs['outputkeys'] and self.inputs['outputkey']:
            self.inputs['outputkeys'].append(self.inputs['outputkey'])

        if not self.inputs['skip_infile'] and len(self.inputs['inputkeys']) != len(inputmapfiles):
            self.logger.error("Number of input mapfiles %d and input keys %d "
                              "have to match." %
                              (len(inputmapfiles), len(self.inputs['inputkeys'])))
            return 1

        filedict = {}
        if self.inputs['inputkeys'] and not self.inputs['skip_infile']:
            for key, filemap, mapname in zip(self.inputs['inputkeys'],
                                             inputmapfiles, inlist):
                if not mapname in self.inputs['mapfiles_as_string']:
                    filedict[key] = []
                    for inp in filemap:
                        filedict[key].append(inp.file)
                else:
                    if key != mapname:
                        filedict[key] = []
                        for inp in filemap:
                            filedict[key].append(mapname)

        if self.inputs['outputkey']:
            filedict[self.inputs['outputkey']] = []
            for item in outputmapfiles[0]:
                filedict[self.inputs['outputkey']].append(item.file)

        # ********************************************************************
        # Call the node side of the recipe
        # Create and schedule the compute jobs
        #command = "python3 %s" % (self.__file__.replace('master', 'nodes')).replace('executable_args', self.inputs['nodescript'])
        recipe_dir_str = str(self.config.get('DEFAULT', 'recipe_directories'))
        recipe_directories = recipe_dir_str.rstrip(']').lstrip('[').split(',')
        pylist = os.getenv('PYTHONPATH').split(':')
        command = None
        for pl in pylist:
            if os.path.isfile(os.path.join(pl, 'lofarpipe/recipes/nodes/' + self.inputs['nodescript'] + '.py')):
                command = "python3 %s" % os.path.join(pl, 'lofarpipe/recipes/nodes/' + self.inputs['nodescript'] + '.py')
        for pl in recipe_directories:
            if os.path.isfile(os.path.join(pl, 'nodes/' + self.inputs['nodescript'] + '.py')):
                command = "python3 %s" % os.path.join(pl, 'nodes/' + self.inputs['nodescript'] + '.py')

        inputmapfiles[0].iterator = outputmapfiles[0].iterator = DataMap.SkipIterator
        jobs = []
        for i, (outp, inp,) in enumerate(zip(
            outputmapfiles[0], inputmapfiles[0])
        ):
            arglist_copy = copy.deepcopy(arglist)
            parsetdict_copy = copy.deepcopy(parsetdict)

            if filedict:
                for name, value in filedict.items():
                    replaced = False
                    if arglist_copy:
                        for arg in arglist:
                            if name == arg:
                                ind = arglist_copy.index(arg)
                                arglist_copy[ind] = arglist_copy[ind].replace(name, value[i])
                                replaced = True
                    if parsetdict_copy:
                        if name in list(parsetdict_copy.values()):
                            for k, v in parsetdict_copy.items():
                                if v == name:
                                    parsetdict_copy[k] = value[i]
                        else:
                            if not replaced:
                                parsetdict_copy[name] = value[i]

            jobs.append(
                ComputeJob(
                    inp.host, command,
                    arguments=[
                        inp.file,
                        executable,
                        arglist_copy,
                        parsetdict_copy,
                        work_dir,
                        self.inputs['parsetasfile'],
                        args_format,
                        self.environment
                    ],
                    resources={
                        "cores": self.inputs['nthreads']
                    }
                )
            )
        max_per_node = self.inputs['max_per_node']
        self._schedule_jobs(jobs, max_per_node)
        jobresultdict = {}
        resultmap = {}
        for job, outp in zip(jobs, outputmapfiles[0]):
            if job.results['returncode'] != 0:
                outp.skip = True
                if not self.inputs['error_tolerance']:
                    self.logger.error("A job has failed with returncode %d "
                                      "and error_tolerance is not set. "
                                      "Bailing out!" % job.results['returncode'])
                    return 1
            for k, v in list(job.results.items()):
                if not k in jobresultdict:
                    jobresultdict[k] = []
                jobresultdict[k].append(DataProduct(job.host, job.results[k], outp.skip))
                if k == 'break':
                    self.outputs.update({'break': v})

        # temp solution. write all output dict entries to a mapfile
        #mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles")
        #check directory for stand alone mode
        if not os.path.isdir(mapfile_dir):
            try:
                os.mkdir(mapfile_dir)
            except OSError as exc:  # Python >2.5
                if exc.errno == errno.EEXIST and os.path.isdir(mapfile_dir):
                    pass
                else:
                    raise
        for k, v in list(jobresultdict.items()):
            dmap = DataMap(v)
            dmap.save(os.path.join(mapfile_dir,
                                   self.inputs['stepname'] + '.' + k + '.mapfile'))
            resultmap[k + '.mapfile'] = os.path.join(
                mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile')
        self.outputs.update(resultmap)

        # *********************************************************************
        # Check job results, and create output data map file
        if self.error.isSet():
            # Abort if all jobs failed
            if all(job.results['returncode'] != 0 for job in jobs):
                self.logger.error("All jobs failed. Bailing out!")
                return 1
            else:
                self.logger.warn(
                    "Some jobs failed, continuing with succeeded runs"
                )
        mapdict = {}
        for item, name in zip(outputmapfiles, self.inputs['mapfiles_out']):
            self.logger.debug("Writing data map file: %s" % name)
            item.save(name)
            mapdict[os.path.basename(name)] = name

        self.outputs['mapfile'] = self.inputs['mapfile_out']
        if self.inputs['outputsuffixes']:
            self.outputs.update(mapdict)

        return 0
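
# Illustrative sketch (not part of the recipe): per job, the input/output keys
# collected in filedict act as placeholders that are substituted into copies
# of the argument list and the parset dictionary. A simplified version of that
# substitution in isolation (the fall-back that adds unmatched keys to the
# parset dictionary is omitted here); the function name is hypothetical.
import copy

def substitute_files(arglist, parsetdict, filedict, i):
    # Return per-job copies with every placeholder key replaced by the i-th
    # file associated with that key.
    arglist_copy = copy.deepcopy(arglist)
    parsetdict_copy = copy.deepcopy(parsetdict)
    for name, files in filedict.items():
        arglist_copy = [files[i] if arg == name else arg
                        for arg in arglist_copy]
        for key, value in parsetdict_copy.items():
            if value == name:
                parsetdict_copy[key] = files[i]
    return arglist_copy, parsetdict_copy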
class demixing(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Run the demixer on the MSs on the compute nodes.
    """
    inputs = {
        'mapfile': ingredient.StringField(
            '--mapfile',
            help="Name of the output mapfile containing the names of the "
                 "MS-files produced by the demixing recipe"),
        'working_directory': ingredient.StringField(
            '-w', '--working-directory',
            help="Working directory used on output nodes. "
                 "Results will be written here"),
        'initscript': ingredient.FileField(
            '--initscript',
            help="The full path to a (Bourne) shell script which will "
                 "initialise the environment (ie, ``lofarinit.sh``)"),
        'demix_parset_dir': ingredient.DirectoryField(
            '--demix-parset-dir',
            dest='demixdir',
            help="Directory containing the demixing parset-files",
        ),
        'db_host': ingredient.StringField(
            '--db-host',
            dest="db_host",
            help="Database host with optional port (e.g. ldb001)"),
        'skymodel': ingredient.FileField(
            '--skymodel',
            help="File containing the sky model to use",
        ),
        'demix_sources': ingredient.ListField(
            '--demix-sources',
            dest='remove',
            help="List of sources to remove e.g. 'CygA, CasA'; "
                 "will be determined automatically if not specified.",
            default=[]),
        'ms_target': ingredient.StringField(
            '--ms-target',
            dest='target',
            help="Substring in the output MS name that replaces the "
                 "substring 'uv' (default: 'target')",
            default="target"),
        'timestep': ingredient.IntField(
            '--timestep',
            help="Time step for averaging",
            default=10),
        'freqstep': ingredient.IntField(
            '--freqstep',
            help="Frequency step for averaging",
            default=60),
        'half_window': ingredient.IntField(
            '--half-window',
            help="Window size of median filter",
            default=20),
        'threshold': ingredient.FloatField(
            '--threshold',
            help="Solutions above/below threshold*rms are smoothed",
            default=2.5),
        'nproc': ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=1)
    }
    outputs = {'mapfile': ingredient.FileField()}

    def go(self):
        self.logger.info("Starting demixing run")
        super(demixing, self).go()

        job_dir = os.path.join(self.inputs['working_directory'],
                               self.inputs['job_name'])

        # Load file <-> compute node mapping from disk
        # ------------------------------------------------------------------
        args = self.inputs['args']
        self.logger.debug("Loading input-data mapfile: %s" % args[0])
        indata = load_data_map(args[0])
        if len(args) > 1:
            self.logger.debug("Loading output-data mapfile: %s" % args[1])
            outdata = load_data_map(args[1])
            if not validate_data_maps(indata, outdata):
                self.logger.error(
                    "Validation of input/output data mapfiles failed")
                return 1
        else:
            # This is a bit of a kludge. The input MS-filenames are supposed to
            # contain the string "_uv". The demixing node script will produce
            # output MS-files, whose names have the string "_uv" replaced by
            # "_" + self.inputs['ms_target'] + "_sub".
            outdata = [
                (host,
                 os.path.join(
                     job_dir,
                     os.path.basename(infile).replace(
                         '_uv', '_' + self.inputs['ms_target'] + '_sub')))
                for host, infile in indata
            ]

        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for host, infile in indata:
            jobs.append(
                ComputeJob(
                    host, command,
                    arguments=[
                        infile,
                        job_dir,
                        self.inputs['initscript'],
                        self.inputs['demix_sources'],
                        self.inputs['ms_target'],
                        self.config.get('cluster', 'clusterdesc'),
                        self.inputs['timestep'],
                        self.inputs['freqstep'],
                        self.inputs['half_window'],
                        self.inputs['threshold'],
                        self.inputs['demix_parset_dir'],
                        self.inputs['skymodel'],
                        self.inputs['db_host']
                    ]))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])

        if self.error.isSet():
            return 1
        else:
            self.logger.debug("Writing mapfile %s" % self.inputs['mapfile'])
            store_data_map(self.inputs['mapfile'], outdata)
            self.outputs['mapfile'] = self.inputs['mapfile']
            return 0
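
# Illustrative sketch (not part of the recipe): when no output mapfile is
# given, the output MS name is derived from the input name by replacing the
# '_uv' substring; the filename shown in the comment is a made-up example.
import os

def demixed_name(infile, job_dir, ms_target='target'):
    # e.g. 'L12345_SB000_uv.MS' -> '<job_dir>/L12345_SB000_target_sub.MS'
    return os.path.join(
        job_dir,
        os.path.basename(infile).replace('_uv', '_' + ms_target + '_sub'))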