def _schedule_jobs(self, jobs, max_per_node=None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob`
        to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to
        :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    # External cluster stuff
    try:
        method = self.config.get('remote', 'method')
    except:
        method = None

    redistribute_hosts = False
    # JURECA SLURM
    if method == 'slurm_srun':
        nodeliststr = []
        hargs = ['srun', 'hostname']
        proc = Popen(hargs, False, stdout=PIPE, stderr=None)
        tup = proc.communicate()
        nodeliststr = tup[0].rstrip('\n').split('\n')
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # Hertfordshire cluster
    elif method == 'pbs_ssh':
        # special case - get the list of nodes from the pbs job
        nodeliststr = []
        try:
            filename = os.environ['PBS_NODEFILE']
        except KeyError:
            self.logger.error('PBS_NODEFILE not found.')
            raise PipelineQuit()
        with open(filename, 'r') as file:
            for line in file:
                node_name = line.split()[0]
                if node_name not in nodeliststr:
                    nodeliststr.append(node_name)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # get hostlist from slurm, but start jobs via ssh
    elif method == 'slurm_ssh':
        try:
            hostlist = os.environ['SLURM_JOB_NODELIST']
        except KeyError:
            self.logger.error('SLURM_JOB_NODELIST not found. '
                              'You must have a slurm reservation!')
            raise PipelineQuit()
        nodeliststr = expand_slurm_hostlist(hostlist)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # generic case, node-names in an env-variable
    elif method == 'ssh_generic':
        nodeliststr = []
        try:
            env_name = self.config.get('remote', 'nodelist_variable')
        except:
            env_name = 'PIPELINE_NODES'
        try:
            nodes = os.environ[env_name]
        except KeyError:
            self.logger.error('Env-variable "' + env_name + '" not found.')
            raise PipelineQuit()
        nodeliststr = [node.strip() for node in nodes.strip('[] ').split(',')]
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # re-distribute the hosts if requested
    if redistribute_hosts:
        # equal distribution
        total = len(jobs)
        # when nodes crash? length of slurm_nodelist and env slurm_nnodes dont match anymore
        nnodes = len(nodeliststr)
        # round robin
        nodelist = []
        for i in range(total):
            nodelist.append(nodeliststr[i % nnodes])
        for i, job in enumerate(jobs):
            job.host = nodelist[i]

    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(
                    target=job.dispatch,
                    args=(
                        self.logger, self.config, limiter, job_id,
                        jobhost, jobport, self.error, killswitch
                    )
                )
            )
        threadwatcher(threadpool, self.logger, killswitch)

    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        # fixme the name of node_durations is not logical
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute(
                    "returncode", str(-1))

            ## If there is 'node level' resource logging available
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement
                child_node_duration.appendChild(return_node)

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help="XML return data.")
    self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

    return jobpool
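# Usage sketch (an assumption for illustration, not code from the pipeline):
# a master recipe builds one ComputeJob per host/dataset pair and hands the
# collection to _schedule_jobs(), which blocks until every job has finished
# and returns the job_id -> ComputeJob mapping. The helper name, the
# node_command string and the max_per_node value below are placeholders.
def _example_run_node_scripts(recipe, node_command, datasets):
    """Sketch: dispatch node_command once per (host, measurement set) pair."""
    from lofarpipe.support.remotecommand import ComputeJob

    jobs = [
        ComputeJob(host, node_command, arguments=[ms_path])
        for host, ms_path in datasets
    ]
    jobpool = recipe._schedule_jobs(jobs, max_per_node=4)

    # A job that never reported a returncode is counted as failed (mirrors
    # the "returncode" handling in the XML summary above).
    return [job_id for job_id, job in jobpool.items()
            if job.results.get("returncode", 1) != 0]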
def _schedule_jobs(self, jobs, max_per_node=None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob`
        to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to
        :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(
                    target=job.dispatch,
                    args=(
                        self.logger, self.config, limiter, job_id,
                        jobhost, jobport, self.error, killswitch
                    )
                )
            )
        threadwatcher(threadpool, self.logger, killswitch)

    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute(
                    "returncode", str(-1))

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: if it is not read, an
    # additional output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help="XML return data.")
    self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

    return jobpool
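# Sketch of how a caller might read the "return_xml" output back (assumption
# for illustration; the pipeline itself only stores the string). The XML
# written above is a flat <nodes> element with one <job> child per job that
# reported a duration, carrying job_id, duration and returncode attributes.
def _example_summarise_return_xml(return_xml):
    """Sketch: extract (job_id, duration, returncode) tuples from return_xml."""
    import xml.dom.minidom as xml

    nodes = xml.parseString(return_xml).documentElement
    summary = []
    for job in nodes.getElementsByTagName("job"):
        summary.append((
            int(job.getAttribute("job_id")),
            float(job.getAttribute("duration")),
            int(job.getAttribute("returncode")),
        ))
    return summary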
def go(self):
    self.logger.info("Starting BBS run")
    super(bbs, self).go()

    # Generate source and parameter databases for all input data
    # ----------------------------------------------------------------------
    inputs = LOFARinput(self.inputs)
    inputs['args'] = self.inputs['args']
    inputs['executable'] = self.inputs['parmdbm']
    inputs['working_directory'] = self.config.get(
        "DEFAULT", "default_working_directory")
    inputs['mapfile'] = self.task_definitions.get('parmdb', 'mapfile')
    inputs['suffix'] = ".instrument"
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('parmdb', inputs, outputs):
        self.logger.warn("parmdb reports failure")
        return 1

    inputs['args'] = self.inputs['args']
    inputs['executable'] = self.inputs['makesourcedb']
    inputs['skymodel'] = self.inputs['skymodel']
    inputs['mapfile'] = self.task_definitions.get('sourcedb', 'mapfile')
    inputs['suffix'] = ".sky"
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('sourcedb', inputs, outputs):
        self.logger.warn("sourcedb reports failure")
        return 1

    # Build a GVDS file describing all the data to be processed
    # ----------------------------------------------------------------------
    self.logger.debug("Building VDS file describing all data for BBS")
    vds_file = os.path.join(self.config.get("layout", "job_directory"),
                            "vds", "bbs.gvds")
    inputs = LOFARinput(self.inputs)
    inputs['args'] = self.inputs['args']
    inputs['gvds'] = vds_file
    inputs['unlink'] = False
    inputs['makevds'] = self.inputs['makevds']
    inputs['combinevds'] = self.inputs['combinevds']
    inputs['nproc'] = self.inputs['nproc']
    inputs['directory'] = os.path.dirname(vds_file)
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('vdsmaker', inputs, outputs):
        self.logger.warn("vdsmaker reports failure")
        return 1
    self.logger.debug("BBS GVDS is %s" % (vds_file,))

    # Iterate over groups of subbands divided up for convenient cluster
    # processing -- i.e., no more than nproc subbands per compute node
    # ----------------------------------------------------------------------
    for to_process in gvds_iterator(vds_file, int(self.inputs["nproc"])):
        # to_process is a list of (host, filename, vds) tuples
        # ------------------------------------------------------------------
        hosts, ms_names, vds_files = map(list, zip(*to_process))

        # The BBS session database should be cleared for our key
        # ------------------------------------------------------------------
        self.logger.debug("Cleaning BBS database for key %s"
                          % (self.inputs["key"]))
        with closing(
            psycopg2.connect(
                host=self.inputs["db_host"],
                user=self.inputs["db_user"],
                database=self.inputs["db_name"])) as db_connection:
            db_connection.set_isolation_level(
                psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            with closing(db_connection.cursor()) as db_cursor:
                db_cursor.execute(
                    "DELETE FROM blackboard.session WHERE key=%s",
                    (self.inputs["key"],))

        # BBS GlobalControl requires a GVDS file describing all the data
        # to be processed. We assemble that from the separate parts
        # already available on disk.
        # ------------------------------------------------------------------
        self.logger.debug("Building VDS file describing data for BBS run")
        vds_dir = tempfile.mkdtemp(suffix=".%s" % (os.path.basename(__file__),))
        vds_file = os.path.join(vds_dir, "bbs.gvds")
        combine_command = [self.inputs['combinevds'], vds_file] + vds_files
        combineproc = utilities.spawn_process(combine_command, self.logger)
        sout, serr = combineproc.communicate()
        log_process_output(self.inputs['combinevds'], sout, serr, self.logger)
        if combineproc.returncode != 0:
            # report the combinevds invocation that failed
            raise subprocess.CalledProcessError(combineproc.returncode,
                                                combine_command)

        # Construct a parset for BBS GlobalControl by patching the GVDS
        # file and database information into the supplied template
        # ------------------------------------------------------------------
        self.logger.debug("Building parset for BBS control")
        bbs_parset = utilities.patch_parset(
            self.inputs['parset'],
            {
                'Observation': vds_file,
                'BBDB.Key': self.inputs['key'],
                'BBDB.Name': self.inputs['db_name'],
                'BBDB.User': self.inputs['db_user'],
                'BBDB.Host': self.inputs['db_host'],
                # 'BBDB.Port': self.inputs['db_name'],
            })
        self.logger.debug("BBS control parset is %s" % (bbs_parset,))

        try:
            # When one of our processes fails, we set the killswitch.
            # Everything else will then come crashing down, rather than
            # hanging about forever.
            # --------------------------------------------------------------
            self.killswitch = threading.Event()
            self.killswitch.clear()
            # Wrap Event.set in a lambda so the handler accepts the
            # (signum, frame) arguments it is called with.
            signal.signal(signal.SIGTERM,
                          lambda signum, frame: self.killswitch.set())

            # GlobalControl runs in its own thread
            # --------------------------------------------------------------
            run_flag = threading.Event()
            run_flag.clear()
            bbs_control = threading.Thread(target=self._run_bbs_control,
                                           args=(bbs_parset, run_flag))
            bbs_control.start()
            run_flag.wait()    # Wait for control to start before proceeding

            # We run BBS KernelControl on each compute node by directly
            # invoking the node script using SSH
            # Note that we use a job_server to send out job details and
            # collect logging information, so we define a bunch of
            # ComputeJobs. However, we need more control than the generic
            # ComputeJob.dispatch method supplies, so we'll control them
            # with our own threads.
            # --------------------------------------------------------------
            command = "python %s" % (self.__file__.replace('master', 'nodes'))
            env = {
                "LOFARROOT": utilities.read_initscript(
                    self.logger, self.inputs['initscript'])["LOFARROOT"],
                "PYTHONPATH": self.config.get('deploy', 'engine_ppath'),
                "LD_LIBRARY_PATH": self.config.get('deploy', 'engine_lpath')
            }
            jobpool = {}
            bbs_kernels = []
            with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
                self.logger.debug("Job server at %s:%d" % (jobhost, jobport))
                for job_id, details in enumerate(to_process):
                    host, file, vds = details
                    jobpool[job_id] = ComputeJob(
                        host, command,
                        arguments=[
                            self.inputs['kernel_exec'],
                            self.inputs['initscript'],
                            file,
                            self.inputs['key'],
                            self.inputs['db_name'],
                            self.inputs['db_user'],
                            self.inputs['db_host']
                        ])
                    bbs_kernels.append(
                        threading.Thread(target=self._run_bbs_kernel,
                                         args=(host, command, env, job_id,
                                               jobhost, str(jobport))))
                self.logger.info("Starting %d threads" % len(bbs_kernels))
                for thread in bbs_kernels:
                    thread.start()
                self.logger.debug("Waiting for all kernels to complete")
                for thread in bbs_kernels:
                    thread.join()

            # When GlobalControl finishes, our work here is done
            # ----------------------------------------------------------
            self.logger.info("Waiting for GlobalControl thread")
            bbs_control.join()
        finally:
            os.unlink(bbs_parset)
            shutil.rmtree(vds_dir)
            if self.killswitch.isSet():
                # If killswitch is set, then one of our processes failed so
                # the whole run is invalid
                # ----------------------------------------------------------
                return 1

    return 0
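# Minimal sketch of the behaviour this recipe relies on from
# utilities.patch_parset() (an illustration under assumptions, not the LOFAR
# implementation): apply a dict of key overrides to a key=value parset
# template, write the result to a temporary file and return that file's path,
# which the caller is expected to unlink in its finally: block.
def _example_patch_parset(template_path, overrides):
    """Sketch: write a copy of template_path with `overrides` applied."""
    import os
    import tempfile

    # Keep every key=value line from the template that we are not overriding.
    keep = []
    with open(template_path, 'r') as template:
        for line in template:
            key = line.split('=', 1)[0].strip()
            if key not in overrides:
                keep.append(line.rstrip('\n'))

    fd, patched_path = tempfile.mkstemp(suffix=".parset")
    with os.fdopen(fd, 'w') as patched:
        for line in keep:
            patched.write(line + '\n')
        for key, value in overrides.items():
            patched.write("%s=%s\n" % (key, value))
    return patched_path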
def go(self):
    self.logger.info("Starting BBS run")
    super(new_bbs, self).go()

    # Check for relevant input parameters in the parset-file
    # ---------------------------------------------------------------------
    self.logger.debug("Reading parset from %s" % self.inputs['parset'])
    self.parset = parameterset(self.inputs['parset'])

    self._set_input('db_host', 'BBDB.Host')
    self._set_input('db_user', 'BBDB.User')
    self._set_input('db_name', 'BBDB.Name')
    self._set_input('db_key', 'BBDB.Key')
    #self.logger.debug("self.inputs = %s" % self.inputs)

    # Clean the blackboard database
    # ---------------------------------------------------------------------
    self.logger.info(
        "Cleaning BBS database for key '%s'" % (self.inputs['db_key'])
    )
    command = ["psql",
               "-h", self.inputs['db_host'],
               "-U", self.inputs['db_user'],
               "-d", self.inputs['db_name'],
               "-c", "DELETE FROM blackboard.session WHERE key='%s';" %
                     self.inputs['db_key']
               ]
    self.logger.debug(command)
    if subprocess.call(command) != 0:
        self.logger.warning(
            "Failed to clean BBS database for key '%s'" %
            self.inputs['db_key']
        )

    # Create a bbs_map describing the file mapping on disk
    # ---------------------------------------------------------------------
    if not self._make_bbs_map():
        return 1

    # Produce a GVDS file, describing the data that must be processed.
    gvds_file = self.run_task(
        "vdsmaker",
        self.inputs['data_mapfile'],
        gvds=self.inputs['gvds']
    )['gvds']

    # Construct a parset for BBS GlobalControl by patching the GVDS
    # file and database information into the supplied template
    # ---------------------------------------------------------------------
    self.logger.debug("Building parset for BBS control")
    # Create a location for parsets
    job_directory = self.config.get("layout", "job_directory")
    parset_directory = os.path.join(job_directory, "parsets")
    create_directory(parset_directory)

    # Patch the parset, copy the result to the target location, and remove
    # the tempfile.
    try:
        bbs_parset = utilities.patch_parset(
            self.parset,
            {
                'Observation': gvds_file,
                'BBDB.Key': self.inputs['db_key'],
                'BBDB.Name': self.inputs['db_name'],
                'BBDB.User': self.inputs['db_user'],
                'BBDB.Host': self.inputs['db_host'],
                #'BBDB.Port': self.inputs['db_name'],
            }
        )
        bbs_parset_path = os.path.join(parset_directory,
                                       "bbs_control.parset")
        shutil.copyfile(bbs_parset, bbs_parset_path)
        self.logger.debug("BBS control parset is %s" % (bbs_parset_path,))
    finally:
        # Always remove the file in the tempdir
        os.remove(bbs_parset)

    try:
        # When one of our processes fails, we set the killswitch.
        # Everything else will then come crashing down, rather than
        # hanging about forever.
        # --------------------------------------------------------------
        self.killswitch = threading.Event()
        self.killswitch.clear()
        # Wrap Event.set in a lambda so the handler accepts the
        # (signum, frame) arguments it is called with.
        signal.signal(signal.SIGTERM,
                      lambda signum, frame: self.killswitch.set())

        # GlobalControl runs in its own thread
        # --------------------------------------------------------------
        run_flag = threading.Event()
        run_flag.clear()
        bbs_control = threading.Thread(
            target=self._run_bbs_control,
            args=(bbs_parset, run_flag)
        )
        bbs_control.start()
        run_flag.wait()    # Wait for control to start before proceeding

        # We run BBS KernelControl on each compute node by directly
        # invoking the node script using SSH
        # Note that we use a job_server to send out job details and
        # collect logging information, so we define a bunch of
        # ComputeJobs. However, we need more control than the generic
        # ComputeJob.dispatch method supplies, so we'll control them
        # with our own threads.
        # --------------------------------------------------------------
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobpool = {}
        bbs_kernels = []
        with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
            self.logger.debug("Job server at %s:%d" % (jobhost, jobport))
            for job_id, details in enumerate(self.bbs_map):
                host, files = details
                jobpool[job_id] = ComputeJob(
                    host, command,
                    arguments=[
                        self.inputs['kernel_exec'],
                        files,
                        self.inputs['db_key'],
                        self.inputs['db_name'],
                        self.inputs['db_user'],
                        self.inputs['db_host']
                    ]
                )
                bbs_kernels.append(
                    threading.Thread(
                        target=self._run_bbs_kernel,
                        args=(host, command, job_id, jobhost, str(jobport))
                    )
                )
            self.logger.info("Starting %d threads" % len(bbs_kernels))
            for thread in bbs_kernels:
                thread.start()
            self.logger.debug("Waiting for all kernels to complete")
            for thread in bbs_kernels:
                thread.join()

        # When GlobalControl finishes, our work here is done
        # ----------------------------------------------------------
        self.logger.info("Waiting for GlobalControl thread")
        bbs_control.join()
    finally:
        os.unlink(bbs_parset)
        if self.killswitch.isSet():
            # If killswitch is set, then one of our processes failed so
            # the whole run is invalid
            # ----------------------------------------------------------
            return 1

    self.outputs['mapfile'] = self.inputs['data_mapfile']
    return 0
def _schedule_jobs(self, jobs, max_per_node=None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob`
        to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to
        :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    # External cluster stuff
    try:
        method = self.config.get('remote', 'method')
    except:
        method = None

    redistribute_hosts = False
    # JURECA SLURM
    if method == 'slurm_srun':
        nodeliststr = []
        hargs = ['srun', 'hostname']
        proc = Popen(hargs, False, stdout=PIPE, stderr=None)
        tup = communicate_returning_strings(proc)
        nodeliststr = tup[0].rstrip('\n').split('\n')
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # Hertfordshire cluster
    elif method == 'pbs_ssh':
        # special case - get the list of nodes from the pbs job
        nodeliststr = []
        try:
            filename = os.environ['PBS_NODEFILE']
        except KeyError:
            self.logger.error('PBS_NODEFILE not found.')
            raise PipelineQuit()
        with open(filename, 'r') as file:
            for line in file:
                node_name = line.split()[0]
                if node_name not in nodeliststr:
                    nodeliststr.append(node_name)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # get hostlist from slurm, but start jobs via ssh
    elif method == 'slurm_ssh':
        try:
            hostlist = os.environ['SLURM_JOB_NODELIST']
        except KeyError:
            self.logger.error(
                'SLURM_JOB_NODELIST not found. You must have a slurm reservation!'
            )
            raise PipelineQuit()
        nodeliststr = expand_slurm_hostlist(hostlist)
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # generic case, node-names in an env-variable
    elif method == 'ssh_generic':
        nodeliststr = []
        try:
            env_name = self.config.get('remote', 'nodelist_variable')
        except:
            env_name = 'PIPELINE_NODES'
        try:
            nodes = os.environ[env_name]
        except KeyError:
            self.logger.error('Env-variable "' + env_name + '" not found.')
            raise PipelineQuit()
        nodeliststr = [
            node.strip() for node in nodes.strip('[] ').split(',')
        ]
        # remove duplicates. order not important
        nodeliststr = list(set(nodeliststr))
        # set flag to re-distribute the hosts for the jobs
        redistribute_hosts = True

    # re-distribute the hosts if requested
    if redistribute_hosts:
        # equal distribution
        total = len(jobs)
        # when nodes crash? length of slurm_nodelist and env slurm_nnodes dont match anymore
        nnodes = len(nodeliststr)
        # round robin
        nodelist = []
        for i in range(total):
            nodelist.append(nodeliststr[i % nnodes])
        for i, job in enumerate(jobs):
            job.host = nodelist[i]

    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(target=job.dispatch,
                                 args=(self.logger, self.config, limiter,
                                       job_id, jobhost, jobport, self.error,
                                       killswitch)))
        threadwatcher(threadpool, self.logger, killswitch)

    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        # fixme the name of node_durations is not logical
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute("returncode", str(-1))

            ## If there is 'node level' resource logging available
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement
                child_node_duration.appendChild(return_node)

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help="XML return data.")
    self.outputs["return_xml"] = node_durations.toxml(
        encoding="ascii").decode('ascii')

    return jobpool
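# Sketch of the hostlist expansion that expand_slurm_hostlist() is assumed to
# perform above (illustration only; the pipeline's own helper may cover more
# of the SLURM syntax). SLURM reports reservations in a compressed form such
# as "jrc[0001-0003,0007],login01"; the scheduler needs one plain hostname
# per entry.
def _example_expand_slurm_hostlist(hostlist):
    """Sketch: expand 'prefix[a-b,c]' style SLURM hostlists into hostnames."""
    import re

    hosts = []
    # Split on commas that are not inside a [...] group.
    for part in re.findall(r'[^,\[]+(?:\[[^\]]*\])?', hostlist):
        match = re.match(r'([^\[]+)\[([^\]]+)\]$', part)
        if not match:
            hosts.append(part)
            continue
        prefix, ranges = match.groups()
        for item in ranges.split(','):
            if '-' in item:
                lo, hi = item.split('-')
                width = len(lo)
                hosts.extend("%s%0*d" % (prefix, width, n)
                             for n in range(int(lo), int(hi) + 1))
            else:
                hosts.append(prefix + item)
    return hosts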
def _schedule_jobs(self, jobs, max_per_node=None):
    """
    Schedule a series of compute jobs. Blocks until completion.

    :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob`
        to be scheduled
    :param max_per_node: maximum number of simultaneous jobs on any given node
    :type max_per_node: integer or none
    :rtype: dict mapping integer job id to
        :class:`~lofarpipe.support.remotecommand.ComputeJob`
    """
    threadpool = []
    jobpool = {}
    if not max_per_node and self.config.has_option('remote', 'max_per_node'):
        max_per_node = self.config.getint('remote', 'max_per_node')
    limiter = ProcessLimiter(max_per_node)
    killswitch = threading.Event()

    if max_per_node:
        self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

    with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
        self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
        for job_id, job in enumerate(jobs):
            jobpool[job_id] = job
            threadpool.append(
                threading.Thread(target=job.dispatch,
                                 args=(self.logger, self.config, limiter,
                                       job_id, jobhost, jobport, self.error,
                                       killswitch)))
        threadwatcher(threadpool, self.logger, killswitch)

    if killswitch.isSet():
        raise PipelineQuit()

    # Add information regarding specific nodes to an xml node.
    self.logger.debug("Adding node_logging_information")
    local_document = xml.Document()
    node_durations = local_document.createElement("nodes")
    for job_id, job in enumerate(jobs):
        # Test if the duration is there
        # fixme the name of node_durations is not logical
        if "job_duration" in job.results:
            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (Not there on error)
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute("returncode", str(-1))

            ## If there is 'node level' resource logging available
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement
                child_node_duration.appendChild(return_node)

    # manually add the result xml as an ingredient output.
    # this allows backward compatible logging: If not read an additional
    # output does not matter
    self.outputs._fields["return_xml"] = ingredient.StringField(
        help="XML return data.")
    self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

    return jobpool
def go(self):
    self.logger.info("Starting BBS run")
    super(bbs, self).go()

    # Generate source and parameter databases for all input data
    # ----------------------------------------------------------------------
    inputs = LOFARinput(self.inputs)
    inputs['args'] = self.inputs['args']
    inputs['executable'] = self.inputs['parmdbm']
    inputs['working_directory'] = self.config.get(
        "DEFAULT", "default_working_directory"
    )
    inputs['mapfile'] = self.task_definitions.get('parmdb', 'mapfile')
    inputs['suffix'] = ".instrument"
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('parmdb', inputs, outputs):
        self.logger.warn("parmdb reports failure")
        return 1

    inputs['args'] = self.inputs['args']
    inputs['executable'] = self.inputs['makesourcedb']
    inputs['skymodel'] = self.inputs['skymodel']
    inputs['mapfile'] = self.task_definitions.get('sourcedb', 'mapfile')
    inputs['suffix'] = ".sky"
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('sourcedb', inputs, outputs):
        self.logger.warn("sourcedb reports failure")
        return 1

    # Build a GVDS file describing all the data to be processed
    # ----------------------------------------------------------------------
    self.logger.debug("Building VDS file describing all data for BBS")
    vds_file = os.path.join(
        self.config.get("layout", "job_directory"),
        "vds",
        "bbs.gvds"
    )
    inputs = LOFARinput(self.inputs)
    inputs['args'] = self.inputs['args']
    inputs['gvds'] = vds_file
    inputs['unlink'] = False
    inputs['makevds'] = self.inputs['makevds']
    inputs['combinevds'] = self.inputs['combinevds']
    inputs['nproc'] = self.inputs['nproc']
    inputs['directory'] = os.path.dirname(vds_file)
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('vdsmaker', inputs, outputs):
        self.logger.warn("vdsmaker reports failure")
        return 1
    self.logger.debug("BBS GVDS is %s" % (vds_file,))

    # Iterate over groups of subbands divided up for convenient cluster
    # processing -- i.e., no more than nproc subbands per compute node
    # ----------------------------------------------------------------------
    for to_process in gvds_iterator(vds_file, int(self.inputs["nproc"])):
        # to_process is a list of (host, filename, vds) tuples
        # ------------------------------------------------------------------
        hosts, ms_names, vds_files = map(list, zip(*to_process))

        # The BBS session database should be cleared for our key
        # ------------------------------------------------------------------
        self.logger.debug(
            "Cleaning BBS database for key %s" % (self.inputs["key"])
        )
        with closing(
            psycopg2.connect(
                host=self.inputs["db_host"],
                user=self.inputs["db_user"],
                database=self.inputs["db_name"]
            )
        ) as db_connection:
            db_connection.set_isolation_level(
                psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT
            )
            with closing(db_connection.cursor()) as db_cursor:
                db_cursor.execute(
                    "DELETE FROM blackboard.session WHERE key=%s",
                    (self.inputs["key"],)
                )

        # BBS GlobalControl requires a GVDS file describing all the data
        # to be processed. We assemble that from the separate parts
        # already available on disk.
        # ------------------------------------------------------------------
        self.logger.debug("Building VDS file describing data for BBS run")
        vds_dir = tempfile.mkdtemp(suffix=".%s" % (os.path.basename(__file__),))
        vds_file = os.path.join(vds_dir, "bbs.gvds")
        combine_command = [self.inputs['combinevds'], vds_file] + vds_files
        combineproc = utilities.spawn_process(combine_command, self.logger)
        sout, serr = combineproc.communicate()
        log_process_output(self.inputs['combinevds'], sout, serr, self.logger)
        if combineproc.returncode != 0:
            # report the combinevds invocation that failed
            raise subprocess.CalledProcessError(
                combineproc.returncode, combine_command
            )

        # Construct a parset for BBS GlobalControl by patching the GVDS
        # file and database information into the supplied template
        # ------------------------------------------------------------------
        self.logger.debug("Building parset for BBS control")
        bbs_parset = utilities.patch_parset(
            self.inputs['parset'],
            {
                'Observation': vds_file,
                'BBDB.Key': self.inputs['key'],
                'BBDB.Name': self.inputs['db_name'],
                'BBDB.User': self.inputs['db_user'],
                'BBDB.Host': self.inputs['db_host'],
                # 'BBDB.Port': self.inputs['db_name'],
            }
        )
        self.logger.debug("BBS control parset is %s" % (bbs_parset,))

        try:
            # When one of our processes fails, we set the killswitch.
            # Everything else will then come crashing down, rather than
            # hanging about forever.
            # --------------------------------------------------------------
            self.killswitch = threading.Event()
            self.killswitch.clear()
            # Wrap Event.set in a lambda so the handler accepts the
            # (signum, frame) arguments it is called with.
            signal.signal(signal.SIGTERM,
                          lambda signum, frame: self.killswitch.set())

            # GlobalControl runs in its own thread
            # --------------------------------------------------------------
            run_flag = threading.Event()
            run_flag.clear()
            bbs_control = threading.Thread(
                target=self._run_bbs_control,
                args=(bbs_parset, run_flag)
            )
            bbs_control.start()
            run_flag.wait()    # Wait for control to start before proceeding

            # We run BBS KernelControl on each compute node by directly
            # invoking the node script using SSH
            # Note that we use a job_server to send out job details and
            # collect logging information, so we define a bunch of
            # ComputeJobs. However, we need more control than the generic
            # ComputeJob.dispatch method supplies, so we'll control them
            # with our own threads.
            # --------------------------------------------------------------
            command = "python %s" % (self.__file__.replace('master', 'nodes'))
            env = {
                "LOFARROOT": utilities.read_initscript(
                    self.logger, self.inputs['initscript'])["LOFARROOT"],
                "PYTHONPATH": self.config.get('deploy', 'engine_ppath'),
                "LD_LIBRARY_PATH": self.config.get('deploy', 'engine_lpath')
            }
            jobpool = {}
            bbs_kernels = []
            with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
                self.logger.debug("Job server at %s:%d" % (jobhost, jobport))
                for job_id, details in enumerate(to_process):
                    host, file, vds = details
                    jobpool[job_id] = ComputeJob(
                        host, command,
                        arguments=[
                            self.inputs['kernel_exec'],
                            self.inputs['initscript'],
                            file,
                            self.inputs['key'],
                            self.inputs['db_name'],
                            self.inputs['db_user'],
                            self.inputs['db_host']
                        ]
                    )
                    bbs_kernels.append(
                        threading.Thread(
                            target=self._run_bbs_kernel,
                            args=(host, command, env, job_id,
                                  jobhost, str(jobport))
                        )
                    )
                self.logger.info("Starting %d threads" % len(bbs_kernels))
                for thread in bbs_kernels:
                    thread.start()
                self.logger.debug("Waiting for all kernels to complete")
                for thread in bbs_kernels:
                    thread.join()

            # When GlobalControl finishes, our work here is done
            # ----------------------------------------------------------
            self.logger.info("Waiting for GlobalControl thread")
            bbs_control.join()
        finally:
            os.unlink(bbs_parset)
            shutil.rmtree(vds_dir)
            if self.killswitch.isSet():
                # If killswitch is set, then one of our processes failed so
                # the whole run is invalid
                # ----------------------------------------------------------
                return 1

    return 0