Example #1
class thumbnail_combine(BaseRecipe, RemoteCommandRecipeMixIn):
    inputs = {
        'executable':
        ingredient.ExecField('--executable',
                             default="/usr/bin/montage",
                             help="montage executable"),
        'file_pattern':
        ingredient.StringField(
            '--file-pattern',
            default="*.th.png",
            help="File search pattern (glob)",
        ),
        'input_dir':
        ingredient.StringField('--input-dir',
                               help="Directory containing input files"),
        'output_file':
        ingredient.StringField('--output-file', help="Output filename"),
        'clobber':
        ingredient.BoolField('--clobber',
                             default=False,
                             help="Clobber pre-existing output files"),
        'target_hosts':
        ingredient.ListField('--target-hosts',
                             help="Remote hosts on which to execute")
    }

    def go(self):
        self.logger.info("Starting thumbnail_combine run")
        super(thumbnail_combine, self).go()

        hosts = self.inputs['target_hosts']
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for host in hosts:
            jobs.append(
                ComputeJob(host,
                           command,
                           arguments=[
                               self.inputs['executable'],
                               self.inputs['file_pattern'],
                               self.inputs['input_dir'],
                               self.inputs['output_file'],
                               self.inputs['clobber']
                           ]))
        self._schedule_jobs(jobs)

        if self.error.isSet():
            self.logger.warn("Failed compute job process detected")
            return 1
        else:
            return 0
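A note on the `self.__file__.replace('master', 'nodes')` idiom used to build the node command above: it derives the node-side script from the master recipe's own path. A minimal standalone sketch (with a hypothetical install path; the framework supplies the real one via `self.__file__`) shows what the resulting command looks like:

# Standalone sketch of the master-to-node path convention (hypothetical path).
master_path = "/opt/lofar/lofarpipe/recipes/master/thumbnail_combine.py"

# str.replace swaps every occurrence of 'master', so the idiom relies on the
# word appearing only in the directory name, not elsewhere in the path.
node_path = master_path.replace('master', 'nodes')
command = "python %s" % node_path

print(command)
# python /opt/lofar/lofarpipe/recipes/nodes/thumbnail_combine.py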
Example #2
class copier(MasterNodeInterface):
    """
    The copier recipe copies the paths listed in the source mapfile to the
    node of the matching entry in the target mapfile. Its primary use is to
    collect data on the compute nodes, which are often only specified in the
    mapfiles.
    This script performs one of two operations:
    1. COPY the source path to the parent directory of the path provided in
    the target mapfile, e.g. copy instrument tables next to the measurement
    sets to which they will be applied. To use this operation, set target_dir
    to "" or do not specify it.
    2. COLLECT data from the source nodes to a central path on the nodes
    specified in the target mapfile, e.g. copy instrument tables from the
    node where they were produced to the same node as the measurement sets
    provided in the target mapfile, but place them all in the same directory.
    Provide a target_dir for this operation: all paths that are not absolute
    (not starting with /) will be placed in a directory with this name,
    relative to the working directory.

    **Arguments**

    A mapfile describing the data to be processed.
    """
    inputs = {
        'mapfile_source': ingredient.StringField(
            '--mapfile-source',
            help = "Full path of mapfile of node:path pairs of source dataset"
        ),
        'mapfile_target': ingredient.StringField(
            '--mapfile-target',
            help = "Full path of mapfile of node:path pairs of target location"
        ),
        'allow_rename': ingredient.BoolField(
            '--allow-rename',
            default = True,
            help = "Allow renaming of basename at target location"
        ),
        'allow_move': ingredient.BoolField(
            '--allow-move',
            default = True,
            help = "Allow moving files instead of copying them"
        ),
        'mapfiles_dir': ingredient.StringField(
            '--mapfiles-dir',
            help = "Path of directory, shared by all nodes, which will be used"
                " to write mapfile for master-node communication, "
        ),
        'mapfile': ingredient.StringField(
            '--mapfile',
            help = "full path to mapfile containing copied paths"
        ),
    }

    outputs = {
        'mapfile_target_copied': ingredient.StringField(
            help = "Path to mapfile containing all the successfully copied"
            " target files")
    }

    def __init__(self):
        """
        Constructor sets the python command used to call node scripts
        """
        super(copier, self).__init__(
            "python3 {0}".format(self.__file__.replace('master', 'nodes')))
        self.source_map = DataMap()
        self.target_map = DataMap()

    def _validate_mapfiles(self, allow_rename = False):
        """
        Validation of input source and target map files. They must have equal
        length. Furthermore, if rename is not allowed, test that 'file names'
        are the same.
        """
        # Same length? If not, then fail
        if len(self.source_map) != len(self.target_map):
            self.logger.error("Number of entries in the source and target map"
                " is not the same: \n target \n {0}\n source \n {1}".format(
                            self.target_map, self.source_map))
            return False

        for source, target in zip(self.source_map, self.target_map):
            # skip strict checking of basename equality if rename is allowed
            if not allow_rename:
                target_name = os.path.basename(target.file)
                source_name = os.path.basename(source.file)
                if target_name != source_name:
                    self.logger.error("One of the supplied source target pairs"
                        " contains a different 'filename': {0} != {1}\n"
                        " aborting".format(target_name, source_name))
                    return False

        return True

    def _write_mapfile(self):
        """
        Write an (updated) mapfile.
        """
        self.logger.debug("Writing mapfile: %s" % self.inputs['mapfile'])
        self.target_map.save(self.inputs['mapfile'])
        self.outputs['mapfile_target_copied'] = self.inputs['mapfile']

    def on_failure(self):
        """
        All copier jobs failed. Bailing out.
        """
        self.logger.error("All copier jobs failed. Bailing out!")
        return 1

    def on_error(self):
        """
        Some copier jobs failed. Update the target map, setting 'skip' to True
        for failed runs, and save it.
        """
        self.logger.warn(
            "Some copier jobs failed, continuing with succeeded runs"
        )
        for job, target in zip(self._jobs, self.target_map):
            if job.results['returncode'] != 0:
                target.skip = True
        self._write_mapfile()
        return 0

    def on_succes(self):
        """
        All copier jobs succeeded. Save an updated mapfile.
        """
        self.logger.info("All copier jobs succeeded")
        self._write_mapfile()
        return 0

    def go(self):
        # TODO: Remove dependency on mapfile_dir
        self.logger.info("Starting copier run")
        super(copier, self).go()

        globalfs = self.config.has_option("remote", "globalfs") and self.config.getboolean("remote", "globalfs")

        # Load data from mapfiles
        self.source_map = DataMap.load(self.inputs['mapfile_source'])
        self.target_map = DataMap.load(self.inputs['mapfile_target'])

        # validate data in mapfiles
        if not self._validate_mapfiles(self.inputs['allow_rename']):
            return 1

        # Run the compute nodes with the node specific mapfiles
        for source, target in zip(self.source_map, self.target_map):
            args = [source.host, source.file, target.file, globalfs, self.inputs['allow_move']]
            self.append_job(target.host, args)

        # start the jobs, return the exit status.
        return self.run_jobs()
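The validation in `_validate_mapfiles` boils down to two rules: both maps must have the same length, and, unless renaming is allowed, the basenames of paired entries must match. A standalone sketch of the same logic, using plain `(host, path)` tuples instead of `DataMap` entries, could look like this:

import os

def validate_pairs(source, target, allow_rename=False):
    """Mirror of the copier validation on plain (host, path) tuples."""
    if len(source) != len(target):
        return False        # source and target must pair up one-to-one
    if allow_rename:
        return True         # basenames may differ at the target location
    return all(os.path.basename(src[1]) == os.path.basename(tgt[1])
               for src, tgt in zip(source, target))

# Example: instrument tables to be copied next to their measurement sets.
source = [("node01", "/data/scratch/L12345_SB000.instrument"),
          ("node02", "/data/scratch/L12345_SB001.instrument")]
target = [("node03", "/data/L12345/L12345_SB000.instrument"),
          ("node04", "/data/L12345/L12345_SB001.instrument")]

print(validate_pairs(source, target))        # True
print(validate_pairs(source, target[:1]))    # False: length mismatch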
Example #3
File: dppp.py, Project: mfkiwl/LOFAR
class dppp(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Runs ``NDPPP`` on a number of MeasurementSets. This is used for averaging,
    flagging, and/or demixing of data.

    1. Load input data files
    2. Load parmdb and sourcedb
    3. Call the node side of the recipe
    4. Create a mapfile with the successful node recipe runs

    **Command line arguments**

    1. A mapfile describing the data to be processed.
    2. Optionally, a mapfile with target output locations.

    """
    inputs = {
        'parset':
        ingredient.FileField(
            '-p',
            '--parset',
            help="The full path to a DPPP configuration parset. The ``msin`` "
            "and ``msout`` keys will be added by this recipe"),
        'executable':
        ingredient.ExecField(
            '--executable',
            help="The full path to the relevant DPPP executable"),
        'suffix':
        ingredient.StringField(
            '--suffix',
            default=".dppp",
            help="Added to the input filename to generate the output filename"
        ),
        'working_directory':
        ingredient.StringField(
            '-w',
            '--working-directory',
            help="Working directory used on output nodes. Results will be "
            "written here"),
        'mapfile':
        ingredient.StringField(
            '--mapfile',
            help="Name of the output mapfile containing the names of the "
            "MS-files produced by the DPPP recipe"),
        'parmdb_mapfile':
        ingredient.StringField(
            '--parmdb-mapfile',
            optional=True,
            help="Path to mapfile containing the parmdb files "
            "(used by demixing step only)"),
        'sourcedb_mapfile':
        ingredient.StringField(
            '--sourcedb-mapfile',
            optional=True,
            help="Path to mapfile containing the sourcedb files "
            "(used by demixing step only)"),
        'demix_always':
        ingredient.ListField(
            '--demix-always',
            help="List of sources that must always be demixed "
            "(used by demixing step only)",
            default=[]),
        'demix_if_needed':
        ingredient.ListField(
            '--demix-if-needed',
            help="List of sources that will only be demixed if needed, "
            "based on some heuristics (used by demixing step only)",
            default=[]),
        # NB times are read from vds file as string
        'data_start_time':
        ingredient.StringField(
            '--data-start-time',
            default="",
            help="Start time to be passed to DPPP; used to pad data"),
        'data_end_time':
        ingredient.StringField(
            '--data-end-time',
            default="",
            help="End time to be passed to DPPP; used to pad data"),
        'nproc':
        ingredient.IntField(
            '--nproc',
            default=8,
            help="Maximum number of simultaneous processes per output node"),
        'nthreads':
        ingredient.IntField('--nthreads',
                            default=2,
                            help="Number of threads per (N)DPPP process"),
        'clobber':
        ingredient.BoolField(
            '--clobber',
            default=False,
            help="If ``True``, pre-existing output files will be removed "
            "before processing starts. If ``False``, the pipeline will "
            "abort if files already exist with the appropriate output "
            "filenames")
        # Keys that are present in the original demixing recipe.
        # Don't know yet if we still need them.
        #        'timestep': ingredient.IntField(
        #            '--timestep',
        #            help="Time step for averaging",
        #            default=10
        #        ),
        #        'freqstep': ingredient.IntField(
        #            '--freqstep',
        #            help="Frequency step for averaging",
        #            default=60
        #        ),
        #        'half_window': ingredient.IntField(
        #            '--half-window',
        #            help="Window size of median filter",
        #            default=20
        #        ),
        #        'threshold': ingredient.FloatField(
        #            '--threshold',
        #            help="Solutions above/below threshold*rms are smoothed",
        #            default=2.5
        #        ),
    }

    outputs = {
        'mapfile':
        ingredient.FileField(
            help="The full path to a mapfile describing the processed data"
            #        ),
            #        'fullyflagged': ingredient.ListField(
            #            help="A list of all baselines which were completely flagged in any "
            #                 "of the input MeasurementSets"
        )
    }

    def go(self):
        self.logger.info("Starting DPPP run")
        super(dppp, self).go()

        #        #                Keep track of "Total flagged" messages in the DPPP logs
        #        # ----------------------------------------------------------------------
        #        self.logger.searchpatterns["fullyflagged"] = "Fully flagged baselines"

        # *********************************************************************
        # 1. load input data file, validate output vs the input location if
        #    output locations are provided
        args = self.inputs['args']
        self.logger.debug("Loading input-data mapfile: %s" % args[0])
        indata = DataMap.load(args[0])
        if len(args) > 1:
            self.logger.debug("Loading output-data mapfile: %s" % args[1])
            outdata = DataMap.load(args[1])
        else:
            outdata = copy.deepcopy(indata)
            for item in outdata:
                item.file = os.path.join(
                    self.inputs['working_directory'], self.inputs['job_name'],
                    os.path.basename(item.file) + self.inputs['suffix'])

        # ********************************************************************
        # 2. Load parmdb and sourcedb
        # Load parmdb-mapfile, if one was given.
        if 'parmdb_mapfile' in self.inputs:
            self.logger.debug("Loading parmdb mapfile: %s" %
                              self.inputs['parmdb_mapfile'])
            parmdbdata = DataMap.load(self.inputs['parmdb_mapfile'])
        else:
            parmdbdata = copy.deepcopy(indata)
            for item in parmdbdata:
                item.file = ''

        # Load sourcedb-mapfile, if one was given.
        if 'sourcedb_mapfile' in self.inputs:
            self.logger.debug("Loading sourcedb mapfile: %s" %
                              self.inputs['sourcedb_mapfile'])
            sourcedbdata = DataMap.load(self.inputs['sourcedb_mapfile'])
        else:
            sourcedbdata = copy.deepcopy(indata)
            for item in sourcedbdata:
                item.file = ''

        # Validate all the data maps.
        if not validate_data_maps(indata, outdata, parmdbdata, sourcedbdata):
            self.logger.error("Validation of data mapfiles failed!")
            return 1

        # Update the skip fields of the four maps. If 'skip' is True in any of
        # these maps, then 'skip' must be set to True in all maps.
        for w, x, y, z in zip(indata, outdata, parmdbdata, sourcedbdata):
            w.skip = x.skip = y.skip = z.skip = (w.skip or x.skip or y.skip
                                                 or z.skip)

        # ********************************************************************
        # 3. Call the node side of the recipe
        # Create and schedule the compute jobs
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        indata.iterator = outdata.iterator = DataMap.SkipIterator
        parmdbdata.iterator = sourcedbdata.iterator = DataMap.SkipIterator
        jobs = []
        for inp, outp, pdb, sdb in zip(indata, outdata, parmdbdata,
                                       sourcedbdata):
            jobs.append(
                ComputeJob(inp.host,
                           command,
                           arguments=[
                               inp.file, outp.file, pdb.file, sdb.file,
                               self.inputs['parset'],
                               self.inputs['executable'], self.environment,
                               self.inputs['demix_always'],
                               self.inputs['demix_if_needed'],
                               self.inputs['data_start_time'],
                               self.inputs['data_end_time'],
                               self.inputs['nthreads'], self.inputs['clobber']
                           ],
                           resources={"cores": self.inputs['nthreads']}))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
        for job, outp in zip(jobs, outdata):
            if job.results['returncode'] != 0:
                outp.skip = True


#        # *********************************************************************
#        # 4. parse logfile for fully flagged baselines
#        matches = self.logger.searchpatterns["fullyflagged"].results
#        self.logger.searchpatterns.clear() # finished searching
#        stripchars = "".join(set("Fully flagged baselines: "))
#        baselinecounter = defaultdict(lambda: 0)
#        for match in matches:
#            for pair in (
#                pair.strip(stripchars) for pair in match.getMessage().split(";")
#            ):
#                baselinecounter[pair] += 1
#        self.outputs['fullyflagged'] = baselinecounter.keys()

# *********************************************************************
# 4. Check job results, and create output data map file
        if self.error.isSet():
            # Abort if all jobs failed
            if all(job.results['returncode'] != 0 for job in jobs):
                self.logger.error("All jobs failed. Bailing out!")
                return 1
            else:
                self.logger.warn(
                    "Some jobs failed, continuing with succeeded runs")
        self.logger.debug("Writing data map file: %s" % self.inputs['mapfile'])
        outdata.save(self.inputs['mapfile'])
        self.outputs['mapfile'] = self.inputs['mapfile']
        return 0
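Step 2 above pads missing parmdb/sourcedb maps with empty entries and then propagates the `skip` flag across all four maps, so an item is only processed when every companion entry is usable. A standalone sketch of that alignment, with a tiny stand-in class instead of `DataMap`, is:

class Item(object):
    """Minimal stand-in for a DataMap entry (file plus skip flag)."""
    def __init__(self, file, skip=False):
        self.file = file
        self.skip = skip

def align_skip(*maps):
    """If any map skips an index, mark that index skipped in every map."""
    for items in zip(*maps):
        skip = any(item.skip for item in items)
        for item in items:
            item.skip = skip

indata  = [Item("sb0.MS"), Item("sb1.MS", skip=True), Item("sb2.MS")]
outdata = [Item("sb0.MS.dppp"), Item("sb1.MS.dppp"), Item("sb2.MS.dppp")]
parmdb  = [Item(""), Item(""), Item("", skip=True)]

align_skip(indata, outdata, parmdb)
print([item.skip for item in outdata])    # [False, True, True]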
Example #4
class vdsmaker(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Generate a GVDS file (and, optionally, individual VDS files per subband;
    see the ``unlink`` input parameter) describing a collection of
    MeasurementSets.

    1. Load data from disk, create the output vds paths
    2. Call the vdsmaker node script to generate the vds files
    3. Combine the vds files into a gvds file (master-side operation)

    **Command line arguments**

    A mapfile describing the MeasurementSets to be processed.
    """
    inputs = {
        'gvds':
        ingredient.StringField('-g',
                               '--gvds',
                               help="File name for output GVDS file"),
        'directory':
        ingredient.DirectoryField('--directory',
                                  help="Directory for output GVDS file"),
        'makevds':
        ingredient.ExecField('--makevds',
                             help="Full path to makevds executable"),
        'combinevds':
        ingredient.ExecField('--combinevds',
                             help="Full path to combinevds executable"),
        'unlink':
        ingredient.BoolField('--unlink',
                             help="Unlink VDS files after combining",
                             default=True),
        'nproc':
        ingredient.IntField(
            '--nproc',
            help="Maximum number of simultaneous processes per compute node",
            default=8)
    }

    outputs = {'gvds': ingredient.FileField()}

    def go(self):
        """
        Contains functionality of the vdsmaker
        """
        super(vdsmaker, self).go()
        # **********************************************************************
        # 1. Load data from disk, create output vds names
        args = self.inputs['args']
        self.logger.debug("Loading input-data mapfile: %s" % args[0])
        data = DataMap.load(args[0])

        # Skip items in `data` that have 'skip' set to True
        data.iterator = DataMap.SkipIterator

        # Create output vds names
        vdsnames = [
            os.path.join(self.inputs['directory'],
                         os.path.basename(item.file) + '.vds') for item in data
        ]

        # *********************************************************************
        # 2. Call vdsmaker
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for inp, vdsfile in zip(data, vdsnames):
            jobs.append(
                ComputeJob(inp.host,
                           command,
                           arguments=[
                               inp.file,
                               self.config.get('cluster', 'clusterdesc'),
                               vdsfile, self.inputs['makevds']
                           ]))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])
        vdsnames = [
            vds for vds, job in zip(vdsnames, jobs)
            if job.results['returncode'] == 0
        ]
        if not vdsnames:
            self.logger.error("All makevds processes failed. Bailing out!")
            return 1

        # *********************************************************************
        # 3. Combine VDS files to produce GDS
        failure = False
        self.logger.info("Combining VDS files")
        executable = self.inputs['combinevds']
        gvds_out = self.inputs['gvds']
        # Create the gvds directory for output files, needed for combine
        create_directory(os.path.dirname(gvds_out))

        try:
            command = [executable, gvds_out] + vdsnames
            combineproc = subprocess.Popen(command,
                                           close_fds=True,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            sout, serr = combineproc.communicate()
            log_process_output(executable, sout, serr, self.logger)
            if combineproc.returncode != 0:
                raise subprocess.CalledProcessError(combineproc.returncode,
                                                    command)
            self.outputs['gvds'] = gvds_out
            self.logger.info("Wrote combined VDS file: %s" % gvds_out)
        except subprocess.CalledProcessError as cpe:
            self.logger.exception("combinevds failed with status %d: %s" %
                                  (cpe.returncode, serr))
            failure = True
        except OSError as err:
            self.logger.error("Failed to spawn combinevds (%s)" % str(err))
            failure = True
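The combine step follows a common pattern: spawn the external tool, capture stdout and stderr, and turn a non-zero return code into a `CalledProcessError`. A minimal standalone sketch of that pattern is shown below, using `/bin/echo` as a stand-in for `combinevds`; the real recipe additionally routes the captured output through `log_process_output` and, with `unlink` enabled, removes the per-subband VDS files after combining.

import subprocess

def run_tool(command_list):
    """Run an external command and return (stdout, stderr); raise on failure."""
    proc = subprocess.Popen(command_list,
                            close_fds=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    sout, serr = proc.communicate()
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, command_list)
    return sout, serr

# Stand-in invocation: echo plays the role of combinevds here.
stdout, stderr = run_tool(["/bin/echo", "out.gvds", "sb0.vds", "sb1.vds"])
print(stdout)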
Example #5
class selfcal_awimager(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Master script for the awimager. Collects arguments from the command line
    and pipeline inputs.

    1. Load mapfiles and validate them
    2. Run the awimager node scripts
    3. Retrieve the output and construct an output mapfile of the successful
       runs

    Details regarding the implementation of the imaging step can be found in
    the node recipe.

    **Command line arguments**

    A mapfile containing (node, datafile) pairs. The measurement sets are used
    as input for the awimager executable.
    """
    inputs = {
        'executable': ingredient.ExecField(
            '--executable',
            help = "The full path to the awimager executable"
        ),
        'parset': ingredient.FileField(
            '-p', '--parset',
            help = "The full path to an awimager configuration parset."
        ),
        'working_directory': ingredient.StringField(
            '-w', '--working-directory',
            help = "Working directory used on output nodes. Results location"
        ),
        'output_image': ingredient.StringField(
            '--output-image',
            help = "Path of the image to be created by the awimager"
        ),
        'mapfile': ingredient.StringField(
            '--mapfile',
            help = "Full path for output mapfile. A list of the"
                 " successfully generated images will be written here"
        ),
        'sourcedb_path': ingredient.StringField(
            '--sourcedb-path',
            help = "Full path of sourcedb used to create a mask for known sources"
        ),
        'mask_patch_size': ingredient.FloatField(
            '--mask-patch-size',
            help = "Scale factor for patches in the awimager mask"
        ),
        'autogenerate_parameters': ingredient.BoolField(
            '--autogenerate-parameters',
            default = True,
            help = "Turns on autogeneration of: cellsize, image size, fov."
            " MSSS 'type' functionality"
        ),
        'specify_fov': ingredient.BoolField(
            '--specify-fov',
            default = False,
            help = "Calculated image parameters are relative to fov; this"
            " parameter is active when autogenerate_parameters is False"
        ),
        'fov': ingredient.FloatField(
            '--fov',
            default = 0.0,
            help = "Calculated image parameters are relative to this"
            " field of view in arcsec. This parameter is mandatory when"
            " specify_fov is True"
        ),
        'major_cycle': ingredient.IntField(
            '--major_cycle',
            help = "The number of the current major cycle, used to modify"
            " the parset."
        ),
        'nr_cycles': ingredient.IntField(
            '--nr-cycles',
            help = "The total number of major cycles."
        ),
        'perform_self_cal': ingredient.BoolField(
            '--perform-self-cal',
            default=False,
            help = "Controls the use of the self-calibration functionality"
        )
    }

    outputs = {
        'mapfile': ingredient.StringField(),
    }

    def go(self):
        """
        This member contains all the functionality of the imager_awimager.
        Functionality is all located at the node side of the script.
        """
        super(selfcal_awimager, self).go()
        self.logger.info("Starting imager_awimager run")

        # *********************************************************************
        # 1. collect the inputs and validate
        input_map = DataMap.load(self.inputs['args'][0])
        sourcedb_map = DataMap.load(self.inputs['sourcedb_path'])

        if not validate_data_maps(input_map, sourcedb_map):
            self.logger.error(
                        "the supplied input_ms mapfile and sourcedb mapfile"
                        "are incorrect. Aborting")
            self.logger.error(repr(input_map))
            self.logger.error(repr(sourcedb_map))
            return 1

        # *********************************************************************
        # 2. Start the node side of the awimager recipe
        # Compile the command to be executed on the remote machine
        node_command = "python3 %s" % (self.__file__.replace("master", "nodes"))
        jobs = []

        output_map = copy.deepcopy(input_map)        
        align_data_maps(input_map, output_map, sourcedb_map)

        sourcedb_map.iterator = input_map.iterator = output_map.iterator = \
            DataMap.SkipIterator

        for measurement_item, source_item in zip(input_map, sourcedb_map):
            if measurement_item.skip or source_item.skip:
                jobs.append(None)
                continue
            # both the sourcedb and the measurement set are in a map;
            # unpack both
            host, measurement_path = measurement_item.host, measurement_item.file
            host2, sourcedb_path = source_item.host, source_item.file

            # construct and save the output name
            arguments = [self.inputs['executable'],
                         self.environment,
                         self.inputs['parset'],
                         self.inputs['working_directory'],
                         self.inputs['output_image'],
                         measurement_path,
                         sourcedb_path,
                         self.inputs['mask_patch_size'],
                         self.inputs['autogenerate_parameters'],
                         self.inputs['specify_fov'],
                         self.inputs['fov'],
                         self.inputs['major_cycle'],
                         self.inputs['nr_cycles'],
                         self.inputs['perform_self_cal']
                         ]

            jobs.append(ComputeJob(host, node_command, arguments))
        self._schedule_jobs(jobs)

        # *********************************************************************
        # 3. Check output of the node scripts

        for job, output_item in zip(jobs, output_map):
            # job == None for items that were skipped before scheduling
            if job is None or "image" not in job.results:
                output_item.file = "failed"
                output_item.skip = True
            else:
                output_item.file = job.results["image"]
                output_item.skip = False

        # Check whether any of the runs finished successfully
        successful_runs = False
        for item in output_map:
            if not item.skip:
                successful_runs = True
                break

        if not successful_runs:
            self.logger.error(
                    "None of the started awimager runs finished correctly")
            self.logger.error(
                    "No work left to be done: exiting with error status")
            return 1

        # If partial success
        if self.error.isSet():
            self.logger.error("Failed awimager node run detected. Continuing"
                              " with successful tasks.")

        self._store_data_map(self.inputs['mapfile'], output_map,
                             "mapfile containing produced awimages")

        self.outputs["mapfile"] = self.inputs['mapfile']
        return 0
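The output handling in step 3 reduces to marking each output entry as failed or successful from its job result and refusing to continue only when nothing succeeded. A standalone sketch of that bookkeeping, with plain dictionaries standing in for jobs and map entries, is:

def collect_outputs(jobs, outputs):
    """Mark each output failed/successful from its job; report any success."""
    for job, out in zip(jobs, outputs):
        # job is None for items that were skipped before scheduling
        if job is None or "image" not in job:
            out["file"] = "failed"
            out["skip"] = True
        else:
            out["file"] = job["image"]
            out["skip"] = False
    return any(not out["skip"] for out in outputs)

jobs = [{"image": "/data/img0.restored"}, None, {}]
outputs = [{"file": None, "skip": False} for _ in range(3)]

print(collect_outputs(jobs, outputs))     # True: at least one run succeeded
print([out["file"] for out in outputs])   # ['/data/img0.restored', 'failed', 'failed']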
Example #6
class imager_prepare(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Prepare phase master:

    1. Validate input
    2. Create mapfiles with the input for the work to be performed on the
       individual nodes, based on the structured input mapfile. The input
       mapfile contains a list of measurement sets.
       Each node computes a single subband group but needs this for all
       timeslices.
    3. Call the node scripts with the correct input
    4. Validate performance:
       only output the measurement sets that finished successfully

    **Command line arguments:**

    The only command line argument is the path to a mapfile containing "all"
    the measurement sets needed for creating the sky images, ordered first on
    timeslice, then on subband group, and finally on index in the frequency
    range.

    **Arguments:**
    """

    inputs = {
        'ndppp_exec':
        ingredient.ExecField('--ndppp-exec',
                             help="The full path to the ndppp executable"),
        'parset':
        ingredient.FileField('-p',
                             '--parset',
                             help="The full path to a prepare parset"),
        'working_directory':
        ingredient.StringField(
            '-w',
            '--working-directory',
            help="Working directory used by the nodes: local data"),
        'nthreads':
        ingredient.IntField('--nthreads',
                            default=8,
                            help="Number of threads per process"),
        'target_mapfile':
        ingredient.StringField(
            '--target-mapfile',
            help="Contains the node and path to target files, defines"
            " the number of nodes the script will start on."),
        'slices_per_image':
        ingredient.IntField(
            '--slices-per-image',
            help="The number of (time) slices for each output image"),
        'subbands_per_image':
        ingredient.IntField(
            '--subbands-per-image',
            help="The number of subbands to be collected in each output image"
        ),
        'asciistat_executable':
        ingredient.ExecField('--asciistat-executable',
                             help="full path to the ascii stat executable"),
        'statplot_executable':
        ingredient.ExecField('--statplot-executable',
                             help="The full path to the statplot executable"),
        'msselect_executable':
        ingredient.ExecField('--msselect-executable',
                             help="The full path to the msselect executable "),
        'rficonsole_executable':
        ingredient.ExecField(
            '--rficonsole-executable',
            help="The full path to the rficonsole executable "),
        'do_rficonsole':
        ingredient.BoolField(
            '--do_rficonsole',
            default=True,
            help="toggle the rficonsole step in preprocessing (default True)"),
        'mapfile':
        ingredient.StringField(
            '--mapfile',
            help="Full path of mapfile; contains a list of the "
            "successfully generated and concatenated sub-band groups"),
        'slices_mapfile':
        ingredient.StringField(
            '--slices-mapfile',
            help="Path to mapfile containing the produced subband groups"),
        'ms_per_image_mapfile':
        ingredient.StringField(
            '--ms-per-image-mapfile',
            help="Path to mapfile containing the ms for each produced"
            " image"),
        'processed_ms_dir':
        ingredient.StringField(
            '--processed-ms-dir',
            help="Path to directory for processed measurement sets"),
        'add_beam_tables':
        ingredient.BoolField('--add_beam_tables',
                             default=False,
                             help="Developer option, adds beamtables to ms")
    }

    outputs = {
        'mapfile':
        ingredient.FileField(
            help="Path to a mapfile which contains a list of the"
            " successfully generated and concatenated measurement sets"),
        'slices_mapfile':
        ingredient.FileField(
            help="Path to mapfile containing the produced subband groups"),
        'ms_per_image_mapfile':
        ingredient.FileField(
            help="Path to mapfile containing the used ms for each produced"
            " image")
    }

    def go(self):
        """
        Entry point for recipe: Called by the pipeline framework
        """
        super(imager_prepare, self).go()
        self.logger.info("Starting imager_prepare run")
        job_directory = self.config.get("layout", "job_directory")
        # *********************************************************************
        # input data
        input_map = DataMap.load(self.inputs['args'][0])
        output_map = DataMap.load(self.inputs['target_mapfile'])
        slices_per_image = self.inputs['slices_per_image']
        subbands_per_image = self.inputs['subbands_per_image']
        # Validate input
        if not self._validate_input_map(input_map, output_map,
                                        slices_per_image, subbands_per_image):
            return 1

        # outputs
        output_ms_mapfile_path = self.inputs['mapfile']

        # *********************************************************************
        # schedule the actual work
        # TODO: Refactor this function into: load data, perform work,
        # create output
        node_command = " python %s" % (self.__file__.replace(
            "master", "nodes"))

        jobs = []
        paths_to_image_mapfiles = []
        n_subband_groups = len(output_map)  # needed for subsets in sb list

        globalfs = self.config.has_option(
            "remote", "globalfs") and self.config.getboolean(
                "remote", "globalfs")

        for idx_sb_group, item in enumerate(output_map):
            # create the input files for this node
            self.logger.debug("Creating input data subset for processing"
                              " on: {0}".format(item.host))
            inputs_for_image_map = \
                self._create_input_map_for_sbgroup(
                                slices_per_image, n_subband_groups,
                                subbands_per_image, idx_sb_group, input_map)

            # Save the mapfile
            inputs_for_image_mapfile_path = os.path.join(
                job_directory, "mapfiles",
                "ms_per_image_{0}.map".format(idx_sb_group))

            self._store_data_map(inputs_for_image_mapfile_path,
                                 inputs_for_image_map, "inputmap for location")

            # Skip the current item if its skip flag is set. The skip iterator
            # cannot be used here because enumerate depends on the index in
            # the full map.
            if item.skip:
                # ensure that the mapfile stays aligned with the output map
                paths_to_image_mapfiles.append(tuple([item.host, [], True]))
                continue

            #save the (input) ms, as a list of  mapfiles
            paths_to_image_mapfiles.append(
                tuple([item.host, inputs_for_image_mapfile_path, False]))

            # use unique working directories per job, to prevent interference between jobs on a global fs
            working_dir = os.path.join(
                self.inputs['working_directory'],
                "imager_prepare_{0}".format(idx_sb_group))

            arguments = [
                self.environment, self.inputs['parset'], working_dir,
                self.inputs['processed_ms_dir'], self.inputs['ndppp_exec'],
                item.file, slices_per_image, subbands_per_image,
                inputs_for_image_mapfile_path,
                self.inputs['asciistat_executable'],
                self.inputs['statplot_executable'],
                self.inputs['msselect_executable'],
                self.inputs['rficonsole_executable'],
                self.inputs['do_rficonsole'], self.inputs['add_beam_tables'],
                globalfs
            ]

            jobs.append(
                ComputeJob(item.host,
                           node_command,
                           arguments,
                           resources={"cores": self.inputs['nthreads']}))

        # Hand over the job(s) to the pipeline scheduler
        self._schedule_jobs(jobs)

        # *********************************************************************
        # validate the output, cleanup, return output
        if self.error.isSet():  #if one of the nodes failed
            self.logger.warn("Failed prepare_imager run detected: Generating "
                             "new output_ms_mapfile_path without failed runs:"
                             " {0}".format(output_ms_mapfile_path))

        concat_ms = copy.deepcopy(output_map)
        slices = []
        finished_runs = 0
        #scan the return dict for completed key
        # loop over the potential jobs including the skipped
        # If we have a skipped item, add the item to the slices with skip set
        jobs_idx = 0
        for item in concat_ms:
            # If this is an item that is skipped via the skip parameter in
            # the parset, append a skipped
            if item.skip:
                slices.append(tuple([item.host, [], True]))
                continue

            # we cannot use the skip iterator so we need to manually get the
            # current job from the list
            job = jobs[jobs_idx]

            # only save the slices if the node has completed successfully
            if job.results["returncode"] == 0:
                finished_runs += 1
                slices.append(
                    tuple([item.host, job.results["time_slices"], False]))
            else:
                # Set the dataproduct to skipped!!
                item.skip = True
                slices.append(tuple([item.host, [], True]))
                msg = "Failed run on {0}. NOT Created: {1} ".format(
                    item.host, item.file)
                self.logger.warn(msg)

            # we have a non skipped workitem, increase the job idx
            jobs_idx += 1

        if finished_runs == 0:
            self.logger.error(
                "None of the started compute nodes finished:"
                " the current recipe produced no output, aborting")
            return 1

        # Write the output mapfiles:
        # concat.ms paths:
        self._store_data_map(output_ms_mapfile_path, concat_ms,
                             "mapfile with concat.ms")

        # timeslices
        MultiDataMap(slices).save(self.inputs['slices_mapfile'])
        self.logger.info(
            "Wrote MultiDataMap with produced timeslices: {0}".format(
                self.inputs['slices_mapfile']))

        # map with the actual input MSs
        self._store_data_map(self.inputs["ms_per_image_mapfile"],
                             DataMap(paths_to_image_mapfiles),
                             "mapfile containing (used) input ms per image:")

        # Set the return values
        self.outputs['mapfile'] = output_ms_mapfile_path
        self.outputs['slices_mapfile'] = self.inputs['slices_mapfile']
        self.outputs['ms_per_image_mapfile'] = \
            self.inputs["ms_per_image_mapfile"]
        return 0

    def _create_input_map_for_sbgroup(self, slices_per_image, n_subband_groups,
                                      subbands_per_image, idx_sb_group,
                                      input_mapfile):
        """
        Creates an input mapfile:
        This is a subset of the complete input_mapfile based on the subband 
        details suplied: The input_mapfile is structured: First all subbands for
        a complete timeslice and the the next timeslice. The result value 
        contains all the information needed for a single subbandgroup to be
        computed on a single compute node
        """
        inputs_for_image = []
        # collect the inputs: first step over the time slices
        for idx_slice in range(slices_per_image):
            # calculate the first line for current time slice and subband group
            line_idx_start = idx_slice * \
                (n_subband_groups * subbands_per_image) + \
                (idx_sb_group * subbands_per_image)
            line_idx_end = line_idx_start + subbands_per_image

            #extend inputs with the files for the current time slice
            inputs_for_image.extend(input_mapfile[line_idx_start:line_idx_end])

        return DataMap(inputs_for_image)

    def _validate_input_map(self, input_map, output_map, slices_per_image,
                            subbands_per_image):
        """
        Return False if the inputs supplied are incorrect:
        the number if inputs and  output does not match. 
        Return True if correct.              
        The number of inputs is correct iff.
        len(input_map) == 
        len(output_map) * slices_per_image * subbands_per_image
        """
        # The output_map contains a number of path/node pairs. The final data
        # dataproduct of the prepare phase: The 'input' for each of these pairs
        # is a number of measurement sets: The number of time slices times
        # the number of subbands collected into each of these time slices.
        # The total length of the input map should match this.
        if len(input_map) != len(output_map) * \
                                   (slices_per_image * subbands_per_image):
            self.logger.error(
                "Incorrect number of input ms for supplied parameters:\n\t"
                "len(input_map) = {0}\n\t"
                "len(output_map) * slices_per_image * subbands_per_image = "
                "{1} * {2} * {3} = {4}".format(
                    len(input_map), len(output_map), slices_per_image,
                    subbands_per_image,
                    len(output_map) * slices_per_image * subbands_per_image))
            return False

        return True
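The slicing in `_create_input_map_for_sbgroup` assumes the input mapfile is ordered by timeslice first and subband second, so the entries for subband group `g` in timeslice `s` start at index `s * (n_subband_groups * subbands_per_image) + g * subbands_per_image` and run for `subbands_per_image` entries. A small worked example with toy labels makes the index arithmetic concrete:

def select_sbgroup(entries, slices_per_image, n_groups, subbands_per_image, group):
    """Collect the entries of one subband group across all timeslices."""
    selected = []
    for s in range(slices_per_image):
        start = s * (n_groups * subbands_per_image) + group * subbands_per_image
        selected.extend(entries[start:start + subbands_per_image])
    return selected

# Toy layout: 2 timeslices x 2 groups x 3 subbands = 12 input entries,
# ordered timeslice-major, then subband group, then subband.
entries = ["t%d_g%d_sb%d" % (t, g, sb)
           for t in range(2) for g in range(2) for sb in range(3)]

print(select_sbgroup(entries, slices_per_image=2, n_groups=2,
                     subbands_per_image=3, group=1))
# ['t0_g1_sb0', 't0_g1_sb1', 't0_g1_sb2', 't1_g1_sb0', 't1_g1_sb1', 't1_g1_sb2']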
Example #7
class executable_args(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    Basic script for running an executable with arguments.
    Passing a mapfile along so the executable can process MS.
    """
    inputs = {
        'executable': ingredient.ExecField(
            '--executable',
            help="The full path to the relevant executable",
            optional=True
        ),
        'arguments': ingredient.ListField(
            '-a', '--arguments',
            help="List of arguments for the executable. Will be added as ./exe arg0 arg1...",
            default='',
            optional=True
        ),
        'nthreads': ingredient.IntField(
            '--nthreads',
            default=8,
            help="Number of threads per process"
        ),
        'nodescript': ingredient.StringField(
            '--nodescript',
            help="Name of the node script to execute",
            default='executable_args',
            optional=True
        ),
        'parset': ingredient.FileField(
            '-p', '--parset',
            help="Path to the arguments for this executable. Will be converted to --key=value",
            optional=True
        ),
        'inputkey': ingredient.StringField(
            '-i', '--inputkey',
            help="Parset key that the executable will recognize as key for inputfile",
            default='',
            optional=True
        ),
        'outputkey': ingredient.StringField(
            '-0', '--outputkey',
            help="Parset key that the executable will recognize as key for outputfile",
            default='',
            optional=True
        ),
        'inputkeys': ingredient.ListField(
            '--inputkeys',
            help="List of parset keys that the executable will recognize as key for inputfile",
            default=[],
            optional=True
        ),
        'outputkeys': ingredient.ListField(
            '--outputkeys',
            help="List of parset keys that the executable will recognize as key for outputfile",
            default=[],
            optional=True
        ),
        'mapfiles_in': ingredient.ListField(
            '--mapfiles-in',
            help="List of the input mapfiles containing the names of the "
                 "data to run the recipe on",
            default=[],
            optional=True
        ),
        'mapfiles_as_string': ingredient.ListField(
            '--mapfiles_as_string',
            help="List of the input mapfiles to ignore and just use the name string instead.",
            default=[],
            optional=True
        ),
        'mapfiles_out': ingredient.ListField(
            '--mapfiles-out',
            help="List of the output mapfiles containing the names of the "
                 "data produced by the recipe",
            default=[],
            optional=True
        ),
        'mapfile_in': ingredient.StringField(
            '--mapfile-in',
            help="Name of the input mapfile containing the names of the "
                 "MS-files to run the recipe",
            default='',
            optional=True
        ),
        'mapfile_out': ingredient.StringField(
            '--mapfile-out',
            help="Name of the output mapfile containing the names of the "
                 "MS-files produced by the recipe",
            default='',
            optional=True
        ),
        'skip_infile': ingredient.BoolField(
            '--skip-infile',
            help="Dont give the input file to the executable.",
            default=False,
            optional=True
        ),
        'skip_outfile': ingredient.BoolField(
            '--skip-outfile',
            help="Dont produce an output file",
            default=False,
            optional=True
        ),
        'inplace': ingredient.BoolField(
            '--inplace',
            help="Manipulate input files inplace",
            default=False,
            optional=True
        ),
        'outputsuffixes': ingredient.ListField(
            '--outputsuffixes',
            help="Suffixes for the outputfiles",
            default=[]
        ),
        'parsetasfile': ingredient.BoolField(
            '--parsetasfile',
            help="Will the argument be a parsetfile or --opt=var",
            default=False
        ),
        'args_format': ingredient.StringField(
            '--args_format',
            help="Will change the format of the arguments. Standard definitions are...dont know yet",
            default='gnu'
        ),
        'args_format_argument': ingredient.StringField(
            '--args_format_argument',
            help="Will change the format of the arguments without option fields.",
            default=''
        ),
        'args_format_option': ingredient.StringField(
            '--args_format_option',
            help="Will change the format of option fields.",
            default='-'
        ),
        'args_format_longoption': ingredient.StringField(
            '--args_format_longoption',
            help="Will change the format of long option fields. Typically '--'",
            default='--'
        ),
        'args_format_option_argument': ingredient.StringField(
            '--args_format_option_argument',
            help="Will change the format of the arguments without option fields.",
            default='='
        ),
        'max_per_node': ingredient.IntField(
            '--max_per_node',
            help="Sets the number of jobs per node",
            default=0
        ),
        'stepname': ingredient.StringField(
            '--stepname',
            help="stepname for individual naming of results",
            optional=True
        ),
        'environment': ingredient.DictField(
            '--environment',
            help="Update environment variables for this step.",
            optional=True
        ),
        'error_tolerance': ingredient.BoolField(
            '--error_tolerance',
            help="Controls if the program exits on the first error or continues with succeeded MS.",
            default=True,
            optional=True
        )
    }

    outputs = {
        'mapfile': ingredient.FileField(
            help="The full path to a mapfile describing the processed data"
        )
    }

    def go(self):
        # Default to an empty string to avoid a NameError further down when
        # no executable was supplied
        executable = ''
        if 'executable' in self.inputs:
            executable = self.inputs['executable']

        if self.inputs['nthreads']:
            self.environment["OMP_NUM_THREADS"] = str(self.inputs['nthreads'])

        if 'environment' in self.inputs:
            self.environment.update(self.inputs['environment'])

        self.logger.info("Starting %s run" % executable)
        super(executable_args, self).go()

        # args format stuff
        args_format = {'args_format': self.inputs['args_format'],
                       'args_format_argument': self.inputs['args_format_argument'],
                       'args_format_option': self.inputs['args_format_option'],
                       'args_formatlongoption': self.inputs['args_format_longoption'],
                       'args_format_option_argument': self.inputs['args_format_option_argument']}
        mapfile_dir = os.path.join(self.config.get("layout", "job_directory"), "mapfiles")
        work_dir = os.path.join(self.inputs['working_directory'], self.inputs['job_name'])
        # *********************************************************************
        # try loading input/output data file, validate output vs the input location if
        #    output locations are provided
        try:
            inputmapfiles = []
            inlist = []
            if self.inputs['mapfile_in']:
                inlist.append(self.inputs['mapfile_in'])

            if self.inputs['mapfiles_in']:
                for item in self.inputs['mapfiles_in']:
                    inlist.append(item)
                self.inputs['mapfile_in'] = self.inputs['mapfiles_in'][0]

            for item in inlist:
                inputmapfiles.append(DataMap.load(item))

        except Exception:
            self.logger.error('Could not load input Mapfile %s' % inlist)
            return 1

        outputmapfiles = []
        if self.inputs['mapfile_out']:
            try:
                outdata = DataMap.load(self.inputs['mapfile_out'])
                outputmapfiles.append(outdata)
            except Exception:
                self.logger.error('Could not load output Mapfile %s' % self.inputs['mapfile_out'])
                return 1
            # sync skip fields in the mapfiles
            align_data_maps(inputmapfiles[0], outputmapfiles[0])

        elif self.inputs['mapfiles_out']:
            for item in self.inputs['mapfiles_out']:
                outputmapfiles.append(DataMap.load(item))
            self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0]

        else:
            # output will be directed to the working directory if no output mapfile is specified
            outdata = copy.deepcopy(inputmapfiles[0])
            if not self.inputs['inplace']:
                for item in outdata:
                    item.file = os.path.join(
                        self.inputs['working_directory'],
                        self.inputs['job_name'],
                        #os.path.basename(item.file) + '.' + os.path.split(str(executable))[1]
                        os.path.splitext(os.path.basename(item.file))[0] + '.' + self.inputs['stepname']
                    )
                self.inputs['mapfile_out'] = os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + 'mapfile')
                self.inputs['mapfiles_out'].append(self.inputs['mapfile_out'])
            else:
                self.inputs['mapfile_out'] = self.inputs['mapfile_in']
                self.inputs['mapfiles_out'].append(self.inputs['mapfile_out'])
            outputmapfiles.append(outdata)

        if not validate_data_maps(inputmapfiles[0], outputmapfiles[0]):
            self.logger.error(
                "Validation of data mapfiles failed!"
            )
            return 1

        if self.inputs['outputsuffixes']:
            # Handle multiple outputfiles
            for name in self.inputs['outputsuffixes']:
                outputmapfiles.append(copy.deepcopy(inputmapfiles[0]))
                self.inputs['mapfiles_out'].append(os.path.join(mapfile_dir, self.inputs['stepname'] + name + '.' + 'mapfile'))
                for item in outputmapfiles[-1]:
                    item.file = os.path.join(
                        work_dir,
                        os.path.splitext(os.path.basename(item.file))[0] + '.' + self.inputs['stepname'] + name
                    )
            self.inputs['mapfile_out'] = self.inputs['mapfiles_out'][0]

        # prepare arguments
        arglist = self.inputs['arguments']
        parsetdict = {}
        if 'parset' in self.inputs:
            parset = Parset()
            parset.adoptFile(self.inputs['parset'])
            for k in parset.keys:
                parsetdict[k] = str(parset[k])

        # construct multiple input data
        if self.inputs['inputkey'] and self.inputs['inputkey'] not in self.inputs['inputkeys']:
            self.inputs['inputkeys'].insert(0, self.inputs['inputkey'])

        if not self.inputs['outputkeys'] and self.inputs['outputkey']:
            self.inputs['outputkeys'].append(self.inputs['outputkey'])

        if not self.inputs['skip_infile'] and len(self.inputs['inputkeys']) != len(inputmapfiles):
            self.logger.error("Number of input mapfiles %d and input keys %d have to match." %
                              (len(inputmapfiles), len(self.inputs['inputkeys'])))
            return 1

        filedict = {}
        if self.inputs['inputkeys'] and not self.inputs['skip_infile']:
            for key, filemap, mapname in zip(self.inputs['inputkeys'], inputmapfiles, inlist):
                if mapname not in self.inputs['mapfiles_as_string']:
                    filedict[key] = []
                    for inp in filemap:
                        filedict[key].append(inp.file)
                else:
                    if key != mapname:
                        filedict[key] = []
                        for inp in filemap:
                            filedict[key].append(mapname)

        if self.inputs['outputkey']:
            filedict[self.inputs['outputkey']] = []
            for item in outputmapfiles[0]:
                filedict[self.inputs['outputkey']].append(item.file)

        # ********************************************************************
        # Call the node side of the recipe
        # Create and schedule the compute jobs
        #command = "python3 %s" % (self.__file__.replace('master', 'nodes')).replace('executable_args', self.inputs['nodescript'])
        recipe_dir_str = str(self.config.get('DEFAULT', 'recipe_directories'))
        recipe_directories = recipe_dir_str.rstrip(']').lstrip('[').split(',')
        pylist = os.getenv('PYTHONPATH').split(':')
        command = None
        for pl in pylist:
            if os.path.isfile(os.path.join(pl,'lofarpipe/recipes/nodes/'+self.inputs['nodescript']+'.py')):
                command = "python3 %s" % os.path.join(pl,'lofarpipe/recipes/nodes/'+self.inputs['nodescript']+'.py')
        for pl in recipe_directories:
            if os.path.isfile(os.path.join(pl,'nodes/'+self.inputs['nodescript']+'.py')):
                command = "python3 %s" % os.path.join(pl,'nodes/'+self.inputs['nodescript']+'.py')

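        # Use SkipIterator so entries flagged as 'skip' in the data maps are
        # not turned into compute jobs.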
        inputmapfiles[0].iterator = outputmapfiles[0].iterator = DataMap.SkipIterator
        jobs = []
        for i, (outp, inp,) in enumerate(zip(
            outputmapfiles[0], inputmapfiles[0])
        ):
            arglist_copy = copy.deepcopy(arglist)
            parsetdict_copy = copy.deepcopy(parsetdict)

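            # Substitute this job's file names for any matching placeholder
            # keys in the argument list and/or the parset.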
            if filedict:
                for name, value in filedict.items():
                    replaced = False
                    if arglist_copy:
                        for arg in arglist:
                            if name == arg:
                                ind = arglist_copy.index(arg)
                                arglist_copy[ind] = arglist_copy[ind].replace(name, value[i])
                                replaced = True
                    if parsetdict_copy:
                        if name in list(parsetdict_copy.values()):
                            for k, v in parsetdict_copy.items():
                                if v == name:
                                    parsetdict_copy[k] = value[i]
                        else:
                            if not replaced:
                                parsetdict_copy[name] = value[i]

            jobs.append(
                ComputeJob(
                    inp.host, command,
                    arguments=[
                        inp.file,
                        executable,
                        arglist_copy,
                        parsetdict_copy,
                        work_dir,
                        self.inputs['parsetasfile'],
                        args_format,
                        self.environment
                    ],
                    resources={
                        "cores": self.inputs['nthreads']
                    }
                )
            )
        max_per_node = self.inputs['max_per_node']
        self._schedule_jobs(jobs, max_per_node)
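        # Gather per-job results; a non-zero returncode marks the corresponding
        # output map entry as 'skip'.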
        jobresultdict = {}
        resultmap = {}
        for job, outp in zip(jobs, outputmapfiles[0]):
            if job.results['returncode'] != 0:
                outp.skip = True
                if not self.inputs['error_tolerance']:
                    self.logger.error("A job has failed with returncode %d and error_tolerance is not set. Bailing out!" % job.results['returncode'])
                    return 1
            for k, v in list(job.results.items()):
                if k not in jobresultdict:
                    jobresultdict[k] = []
                jobresultdict[k].append(DataProduct(job.host, job.results[k], outp.skip))
                if k == 'break':
                    self.outputs.update({'break': v})

        # Temporary solution: write all output dict entries to mapfiles.
        # Make sure the mapfile directory exists (needed in stand-alone mode).
        if not os.path.isdir(mapfile_dir):
            try:
                os.mkdir(mapfile_dir)
            except OSError as exc:
                if exc.errno == errno.EEXIST and os.path.isdir(mapfile_dir):
                    pass
                else:
                    raise
        for k, v in list(jobresultdict.items()):
            dmap = DataMap(v)
            dmap.save(os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile'))
            resultmap[k + '.mapfile'] = os.path.join(mapfile_dir, self.inputs['stepname'] + '.' + k + '.mapfile')
        self.outputs.update(resultmap)
        # *********************************************************************
        # Check job results, and create output data map file
        if self.error.isSet():
            # Abort if all jobs failed
            if all(job.results['returncode'] != 0 for job in jobs):
                self.logger.error("All jobs failed. Bailing out!")
                return 1
            else:
                self.logger.warn(
                    "Some jobs failed, continuing with succeeded runs"
                )
        mapdict = {}
        for item, name in zip(outputmapfiles, self.inputs['mapfiles_out']):
            self.logger.debug("Writing data map file: %s" % name)
            item.save(name)
            mapdict[os.path.basename(name)] = name

        self.outputs['mapfile'] = self.inputs['mapfile_out']
        if self.inputs['outputsuffixes']:
            self.outputs.update(mapdict)

        return 0
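For reference, the recipes above exchange data through lofarpipe mapfiles: on-disk lists of (host, file, skip) entries handled by the DataMap and DataProduct classes used in the code. Below is a minimal sketch of writing and reading such a mapfile by hand; the lofarpipe.support.data_map import path is assumed, and the hosts and paths are hypothetical.

# Minimal sketch; import path assumed, hosts/paths are made up.
from lofarpipe.support.data_map import DataMap, DataProduct

entries = [
    DataProduct('node01', '/data/scratch/L12345_SB000.MS', False),  # skip=False
    DataProduct('node02', '/data/scratch/L12345_SB001.MS', False),
]
datamap = DataMap(entries)
datamap.save('/data/scratch/pipeline/mapfiles/input.mapfile')

# A recipe can then load the map again and iterate over host/file pairs:
for item in DataMap.load('/data/scratch/pipeline/mapfiles/input.mapfile'):
    print(item.host, item.file, item.skip)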
Example no. 8
0
class rficonsole(BaseRecipe, RemoteCommandRecipeMixIn):
    """
    The rficonsole recipe runs the rficonsole executable (flagger) across one
    or more MeasurementSets.

    **Arguments**

    A mapfile describing the data to be processed.
    """
    inputs = {
        'executable':
        ingredient.ExecField('--executable',
                             default="/opt/LofIm/daily/lofar/bin/rficonsole",
                             help="Full path to rficonsole executable"),
        'strategy':
        ingredient.FileField('--strategy',
                             help="Full path to RFI strategy file",
                             optional=True),
        'indirect_read':
        ingredient.BoolField(
            '--indirect-read',
            default=False,
            help="Indirect baseline reader: re-write MS for efficiency"),
        'skip_flagged':
        ingredient.BoolField(
            '--skip-flagged',
            default=True,
            help="Ignore any MeasurementSet which has been flagged completely"
        ),
        'working_dir':
        ingredient.StringField(
            '--working-dir',
            default='/tmp',
            help=
            "Temporary rficonsole products are stored under this root on each of the remote machines. This directory should therefore be writable on each machine, but need not be shared across hosts"
        ),
        'nthreads':
        ingredient.IntField('--nthreads',
                            default=8,
                            help="Number of threads per rficonsole process"),
        'nproc':
        ingredient.IntField(
            '--nproc',
            default=1,
            help="Maximum number of simultaneous processes per node"),
        'nmeasurementsets':
        ingredient.IntField(
            '--nmeasurementsets',
            optional=True,
            help=
            "Maximum number of MeasurementSets processed by a single rficonsole process"
        ),
    }

    def go(self):
        self.logger.info("Starting rficonsole run")
        super(rficonsole, self).go()

        #                           Load file <-> compute node mapping from disk
        # ----------------------------------------------------------------------
        self.logger.debug("Loading map from %s" % self.inputs['args'])
        data = load_data_map(self.inputs['args'][0])

        #        Jobs being dispatched to each host are arranged in a dict. Each
        #          entry in the dict is a list of lists of filenames to process.
        # ----------------------------------------------------------------------
        hostlist = defaultdict(lambda: [[]])
        for host, filename in data:
            if ('nmeasurementsets' in self.inputs and len(
                    hostlist[host][-1]) >= self.inputs['nmeasurementsets']):
                hostlist[host].append([filename])
            else:
                hostlist[host][-1].append(filename)

        if 'strategy' in self.inputs:
            strategy = self.inputs['strategy']
        else:
            strategy = None

        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobs = []
        for host, file_lists in hostlist.items():
            for file_list in file_lists:
                jobs.append(
                    ComputeJob(
                        host,
                        command,
                        arguments=[
                            self.inputs['executable'], self.inputs['nthreads'],
                            strategy, self.inputs['indirect_read'],
                            self.inputs['skip_flagged'],
                            self.inputs['working_dir']
                        ] + file_list,
                        resources={"cores": self.inputs['nthreads']}))
        self._schedule_jobs(jobs, max_per_node=self.inputs['nproc'])

        if self.error.isSet():
            self.logger.warn("Failed rficonsole process detected")
            return 1
        else:
            return 0
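The hostlist construction above batches MeasurementSets per host so that no single rficonsole invocation handles more than nmeasurementsets files. The same grouping logic in isolation, as a small runnable sketch with hypothetical host and file names:

from collections import defaultdict

def batch_per_host(pairs, nmax=None):
    # Group (host, filename) pairs into per-host batches of at most nmax files,
    # mirroring the hostlist construction in the recipe above.
    hostlist = defaultdict(lambda: [[]])
    for host, filename in pairs:
        if nmax is not None and len(hostlist[host][-1]) >= nmax:
            hostlist[host].append([filename])
        else:
            hostlist[host][-1].append(filename)
    return hostlist

pairs = [('node01', 'SB000.MS'), ('node01', 'SB001.MS'),
         ('node01', 'SB002.MS'), ('node02', 'SB003.MS')]
print(dict(batch_per_host(pairs, nmax=2)))
# {'node01': [['SB000.MS', 'SB001.MS'], ['SB002.MS']], 'node02': [['SB003.MS']]}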