def _run_bbs_control(self, bbs_parset, run_flag):
    """
    Run BBS Global Control and wait for it to finish.
    Return its return code.
    """
    env = utilities.read_initscript(self.logger, self.inputs['initscript'])
    self.logger.info("Running BBS GlobalControl")
    working_dir = tempfile.mkdtemp(suffix=".%s" % (os.path.basename(__file__),))
    with CatchLog4CPlus(
        working_dir,
        self.logger.name + ".GlobalControl",
        os.path.basename(self.inputs['control_exec'])
    ):
        with utilities.log_time(self.logger):
            try:
                bbs_control_process = utilities.spawn_process(
                    [self.inputs['control_exec'], bbs_parset, "0"],
                    self.logger,
                    cwd=working_dir,
                    env=env
                )
                # _monitor_process() needs a convenient kill() method.
                bbs_control_process.kill = lambda: os.kill(
                    bbs_control_process.pid, signal.SIGKILL)
            except OSError, e:
                self.logger.error("Failed to spawn BBS Control (%s)" % str(e))
                self.killswitch.set()
                return 1
            finally:
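# ----------------------------------------------------------------------------
# The kill() attribute attached above is consumed by _monitor_process(), which
# is not included in this listing. The following is a minimal, hypothetical
# sketch of such a monitor, assuming only a process object with poll() and
# kill() plus the killswitch Event used throughout this recipe; it is not the
# framework's actual implementation.
# ----------------------------------------------------------------------------
def _monitor_process(self, process, name="Monitored process"):
    """
    Poll ``process`` until it exits, killing it if the killswitch is set.
    Return the process's exit code (sketch; names assumed).
    """
    import time
    while True:
        returncode = process.poll()
        if returncode is not None:
            # The process finished of its own accord.
            if returncode != 0:
                self.logger.warn("%s returned %d" % (name, returncode))
                self.killswitch.set()
            return returncode
        if self.killswitch.isSet():
            # Some other component failed; take this process down too.
            self.logger.warn("Killswitch set; killing %s" % name)
            process.kill()
            return process.wait()
        time.sleep(1)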
def run(self, executable, initscript, infile, key, db_name, db_user, db_host):
    # executable: path to KernelControl executable
    # initscript: path to lofarinit.sh
    # infile:     MeasurementSet for processing
    # key, db_name, db_user, db_host: database connection parameters
    # ----------------------------------------------------------------------
    with log_time(self.logger):
        if os.path.exists(infile):
            self.logger.info("Processing %s" % (infile))
        else:
            self.logger.error("Dataset %s does not exist" % (infile))
            return 1

        # Build a configuration parset specifying database parameters
        # for the kernel
        # ------------------------------------------------------------------
        self.logger.debug("Setting up kernel parset")
        filesystem = "%s:%s" % (os.uname()[1], get_mountpoint(infile))
        fd, parset_filename = mkstemp()
        kernel_parset = Parset()
        for param_key, param_value in {
            "ObservationPart.Filesystem": filesystem,
            "ObservationPart.Path": infile,
            "BBDB.Key": key,
            "BBDB.Name": db_name,
            "BBDB.User": db_user,
            "BBDB.Host": db_host,
            "ParmLog": "",
            "ParmLoglevel": "",
            "ParmDB.Sky": infile + ".sky",
            "ParmDB.Instrument": infile + ".instrument"
        }.iteritems():
            kernel_parset.add(param_key, param_value)
        kernel_parset.writeFile(parset_filename)
        os.close(fd)
        self.logger.debug("Parset written to %s" % (parset_filename,))

        # Run the kernel
        # Catch & log output from the kernel logger and stdout
        # ------------------------------------------------------------------
        working_dir = mkdtemp()
        env = read_initscript(self.logger, initscript)
        try:
            cmd = [executable, parset_filename, "0"]
            self.logger.debug("Executing BBS kernel")
            with CatchLog4CPlus(
                working_dir,
                self.logger.name + "." + os.path.basename(infile),
                os.path.basename(executable),
            ):
                bbs_kernel_process = Popen(
                    cmd, stdout=PIPE, stderr=PIPE, cwd=working_dir
                )
                sout, serr = bbs_kernel_process.communicate()
            log_process_output("BBS kernel", sout, serr, self.logger)
            if bbs_kernel_process.returncode != 0:
                raise CalledProcessError(
                    bbs_kernel_process.returncode, executable
                )
        except CalledProcessError, e:
            self.logger.error(str(e))
            return 1
        finally:
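# ----------------------------------------------------------------------------
# get_mountpoint(), used above to build the "host:mountpoint" filesystem
# string, is provided by the pipeline utilities and is not reproduced in this
# listing. A minimal sketch of the usual approach -- walk up the path until a
# mount point is found -- is given below for illustration; this is assumed
# behaviour, not the library's exact code.
# ----------------------------------------------------------------------------
def get_mountpoint(path):
    """
    Return the mount point of the filesystem containing ``path`` (sketch).
    """
    import os
    path = os.path.abspath(path)
    while not os.path.ismount(path):
        path = os.path.dirname(path)
    return path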
def go(self):
    self.logger.info("Starting BBS run")
    super(bbs, self).go()

    # Generate source and parameter databases for all input data
    # ----------------------------------------------------------------------
    inputs = LOFARinput(self.inputs)
    inputs['args'] = self.inputs['args']
    inputs['executable'] = self.inputs['parmdbm']
    inputs['working_directory'] = self.config.get(
        "DEFAULT", "default_working_directory"
    )
    inputs['mapfile'] = self.task_definitions.get('parmdb', 'mapfile')
    inputs['suffix'] = ".instrument"
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('parmdb', inputs, outputs):
        self.logger.warn("parmdb reports failure")
        return 1

    inputs['args'] = self.inputs['args']
    inputs['executable'] = self.inputs['makesourcedb']
    inputs['skymodel'] = self.inputs['skymodel']
    inputs['mapfile'] = self.task_definitions.get('sourcedb', 'mapfile')
    inputs['suffix'] = ".sky"
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('sourcedb', inputs, outputs):
        self.logger.warn("sourcedb reports failure")
        return 1

    # Build a GVDS file describing all the data to be processed
    # ----------------------------------------------------------------------
    self.logger.debug("Building VDS file describing all data for BBS")
    vds_file = os.path.join(
        self.config.get("layout", "job_directory"), "vds", "bbs.gvds"
    )
    inputs = LOFARinput(self.inputs)
    inputs['args'] = self.inputs['args']
    inputs['gvds'] = vds_file
    inputs['unlink'] = False
    inputs['makevds'] = self.inputs['makevds']
    inputs['combinevds'] = self.inputs['combinevds']
    inputs['nproc'] = self.inputs['nproc']
    inputs['directory'] = os.path.dirname(vds_file)
    outputs = LOFARoutput(self.inputs)
    if self.cook_recipe('vdsmaker', inputs, outputs):
        self.logger.warn("vdsmaker reports failure")
        return 1
    self.logger.debug("BBS GVDS is %s" % (vds_file,))

    # Iterate over groups of subbands divided up for convenient cluster
    # processing -- i.e., no more than nproc subbands per compute node
    # ----------------------------------------------------------------------
    for to_process in gvds_iterator(vds_file, int(self.inputs["nproc"])):
        # to_process is a list of (host, filename, vds) tuples
        # ------------------------------------------------------------------
        hosts, ms_names, vds_files = map(list, zip(*to_process))

        # The BBS session database should be cleared for our key
        # ------------------------------------------------------------------
        self.logger.debug(
            "Cleaning BBS database for key %s" % (self.inputs["key"])
        )
        with closing(
            psycopg2.connect(
                host=self.inputs["db_host"],
                user=self.inputs["db_user"],
                database=self.inputs["db_name"]
            )
        ) as db_connection:
            db_connection.set_isolation_level(
                psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT
            )
            with closing(db_connection.cursor()) as db_cursor:
                db_cursor.execute(
                    "DELETE FROM blackboard.session WHERE key=%s",
                    (self.inputs["key"],)
                )

        # BBS GlobalControl requires a GVDS file describing all the data
        # to be processed. We assemble that from the separate parts
        # already available on disk.
        # ------------------------------------------------------------------
        self.logger.debug("Building VDS file describing data for BBS run")
        vds_dir = tempfile.mkdtemp(suffix=".%s" % (os.path.basename(__file__),))
        vds_file = os.path.join(vds_dir, "bbs.gvds")
        combineproc = utilities.spawn_process(
            [self.inputs['combinevds'], vds_file] + vds_files,
            self.logger
        )
        sout, serr = combineproc.communicate()
        log_process_output(self.inputs['combinevds'], sout, serr, self.logger)
        if combineproc.returncode != 0:
            raise subprocess.CalledProcessError(
                combineproc.returncode, self.inputs['combinevds']
            )

        # Construct a parset for BBS GlobalControl by patching the GVDS
        # file and database information into the supplied template
        # ------------------------------------------------------------------
        self.logger.debug("Building parset for BBS control")
        bbs_parset = utilities.patch_parset(
            self.inputs['parset'],
            {
                'Observation': vds_file,
                'BBDB.Key': self.inputs['key'],
                'BBDB.Name': self.inputs['db_name'],
                'BBDB.User': self.inputs['db_user'],
                'BBDB.Host': self.inputs['db_host'],
                # 'BBDB.Port': self.inputs['db_name'],
            }
        )
        self.logger.debug("BBS control parset is %s" % (bbs_parset,))

        try:
            # When one of our processes fails, we set the killswitch.
            # Everything else will then come crashing down, rather than
            # hanging about forever.
            # --------------------------------------------------------------
            self.killswitch = threading.Event()
            self.killswitch.clear()
            # Signal handlers are called with (signum, frame), so wrap the
            # argument-less Event.set() in a lambda.
            signal.signal(
                signal.SIGTERM, lambda signum, frame: self.killswitch.set()
            )

            # GlobalControl runs in its own thread
            # --------------------------------------------------------------
            run_flag = threading.Event()
            run_flag.clear()
            bbs_control = threading.Thread(
                target=self._run_bbs_control,
                args=(bbs_parset, run_flag)
            )
            bbs_control.start()
            run_flag.wait()    # Wait for control to start before proceeding

            # We run BBS KernelControl on each compute node by directly
            # invoking the node script using SSH.
            # Note that we use a job_server to send out job details and
            # collect logging information, so we define a bunch of
            # ComputeJobs. However, we need more control than the generic
            # ComputeJob.dispatch method supplies, so we'll control them
            # with our own threads.
            # --------------------------------------------------------------
            command = "python %s" % (self.__file__.replace('master', 'nodes'))
            env = {
                "LOFARROOT": utilities.read_initscript(
                    self.logger, self.inputs['initscript']
                )["LOFARROOT"],
                "PYTHONPATH": self.config.get('deploy', 'engine_ppath'),
                "LD_LIBRARY_PATH": self.config.get('deploy', 'engine_lpath')
            }
            jobpool = {}
            bbs_kernels = []
            with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
                self.logger.debug("Job server at %s:%d" % (jobhost, jobport))
                for job_id, details in enumerate(to_process):
                    host, file, vds = details
                    jobpool[job_id] = ComputeJob(
                        host, command,
                        arguments=[
                            self.inputs['kernel_exec'],
                            self.inputs['initscript'],
                            file,
                            self.inputs['key'],
                            self.inputs['db_name'],
                            self.inputs['db_user'],
                            self.inputs['db_host']
                        ]
                    )
                    bbs_kernels.append(
                        threading.Thread(
                            target=self._run_bbs_kernel,
                            args=(host, command, env, job_id,
                                  jobhost, str(jobport))
                        )
                    )
                self.logger.info("Starting %d threads" % len(bbs_kernels))
                [thread.start() for thread in bbs_kernels]
                self.logger.debug("Waiting for all kernels to complete")
                [thread.join() for thread in bbs_kernels]

            # When GlobalControl finishes, our work here is done
            # ----------------------------------------------------------
            self.logger.info("Waiting for GlobalControl thread")
            bbs_control.join()
        finally:
            os.unlink(bbs_parset)
            shutil.rmtree(vds_dir)
            if self.killswitch.isSet():
                # If killswitch is set, then one of our processes failed so
                # the whole run is invalid
                # ----------------------------------------------------------
                return 1

    return 0
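# ----------------------------------------------------------------------------
# The kernel threads started in go() target self._run_bbs_kernel(), which is
# not part of this listing. The sketch below is hypothetical: it illustrates
# what each thread is expected to do -- launch KernelControl's node script on
# a compute node with the prepared environment and trip the killswitch on
# failure -- by shelling out to ssh directly. The real recipe dispatches
# through the framework's remote-command and job-server machinery instead.
# ----------------------------------------------------------------------------
def _run_bbs_kernel(self, host, command, env, *arguments):
    """
    Run the BBS node script on ``host`` and wait for it to finish.
    Set the killswitch if it exits with a non-zero status (sketch).
    """
    from subprocess import Popen, PIPE
    # Prefix the remote command with VAR=value assignments; note that values
    # containing spaces would need quoting in a real implementation.
    remote_env = " ".join("%s=%s" % (k, v) for k, v in env.iteritems())
    remote_cmd = "%s %s %s" % (
        remote_env, command, " ".join(str(a) for a in arguments)
    )
    self.logger.debug("Dispatching to %s: %s" % (host, remote_cmd))
    proc = Popen(["ssh", host, remote_cmd], stdout=PIPE, stderr=PIPE)
    sout, serr = proc.communicate()
    log_process_output("BBS kernel on %s" % host, sout, serr, self.logger)
    if proc.returncode != 0:
        self.logger.error(
            "BBS kernel on %s failed (%d)" % (host, proc.returncode)
        )
        self.killswitch.set()
    return proc.returncode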
def run(self, infile, working_dir, initscript, remove, target, clusterdesc,
        timestep, freqstep, half_window, threshold, demixdir, skymodel,
        db_host):
    with log_time(self.logger):
        if os.path.exists(infile):
            self.logger.info("Started processing %s" % infile)
        else:
            self.logger.error("Dataset %s does not exist" % infile)
            return 1

        self.logger.debug("infile = %s", infile)
        self.logger.debug("working_dir = %s", working_dir)
        self.logger.debug("initscript = %s", initscript)
        self.logger.debug("remove = %s", remove)
        self.logger.debug("target = %s", target)
        self.logger.debug("clusterdesc = %s", clusterdesc)
        self.logger.debug("timestep = %d", timestep)
        self.logger.debug("freqstep = %d", freqstep)
        self.logger.debug("half_window = %d", half_window)
        self.logger.debug("threshold = %f", threshold)
        self.logger.debug("demixdir = %s", demixdir)
        self.logger.debug("skymodel = %s", skymodel)
        self.logger.debug("db_host = %s", db_host)

        # Initialise environment
        self.environment = read_initscript(self.logger, initscript)

        # Create working directory, if it does not yet exist.
        if not os.path.exists(working_dir):
            os.makedirs(working_dir)

        # The output file names are based on the input filename, however
        # they must be created in ``working_dir``.
        filename = os.path.split(infile)[1]
        outfile = os.path.join(working_dir, filename)
        key = os.path.join(working_dir, 'key_' + filename)
        mixingtable = os.path.join(working_dir, 'mixing_' + filename)
        basename = outfile.replace('_uv.MS', '') + '_'

        # If needed, run NDPPP to preflag input file out to demix.MS
        t = pt.table(infile)
        shp = t.getcell("DATA", 0).shape
        t = 0    # Release the table object.
        mstarget = outfile.replace('uv', target)
        if os.system('rm -f -r ' + mstarget) != 0:
            return 1
        if (shp[0] == 64 or shp[0] == 128 or shp[0] == 256):
            f = open(basename + 'NDPPP_dmx.parset', 'w')
            f.write('msin = %s\n' % infile)
            f.write('msin.autoweight = True\n')
            f.write('msin.startchan = nchan/32\n')
            f.write('msin.nchan = 30*nchan/32\n')
            f.write('msout = %s\n' % mstarget)
            f.write('steps=[preflag]\n')
            f.write('preflag.type=preflagger\n')
            f.write('preflag.corrtype=auto\n')
            f.close()
            self.logger.info("Starting NDPPP demix ...")
            if not self._execute(['NDPPP', basename + 'NDPPP_dmx.parset']):
                return 1
        else:
            if infile == mstarget:
                self.logger.error("MS-file %s already exists" % mstarget)
                return 1
            else:
                self.logger.info("Copying MS-file: %s --> %s" %
                                 (infile, mstarget))
                if os.system('cp -r ' + infile + ' ' + mstarget) != 0:
                    return 1

        # Use heuristics to get a list of A-team sources that may need
        # to be removed. If the user specified a list of candidate A-team
        # sources to remove, then determine the intersection of both lists.
        # Otherwise just use the list obtained from heuristics.
        ateam_list = getAteamList(
            infile,
            outerDistance=2.e4,
            elLimit=5.,
            verbose=self.logger.isEnabledFor(logging.DEBUG)
        )
        self.logger.debug("getAteamList returned: %s" % ateam_list)
        if remove:
            remove = list(set(remove).intersection(ateam_list))
        else:
            remove = ateam_list
        self.logger.info("Removing %d target(s) from %s: %s" %
                         (len(remove), mstarget, ', '.join(remove)))

        spc.shiftphasecenter(mstarget, remove, freqstep, timestep)

        # For each source to remove, and the target, do a freq/time squash
        # with NDPPP.
        removeplustarget = numpy.append(remove, target)
        avgoutnames = []
        for rem in removeplustarget:
            if os.system('rm -f ' + basename + 'dmx_avg.parset') != 0:
                return 1
            f = open(basename + 'dmx_avg.parset', 'w')
            msin = outfile.replace('uv', rem)
            f.write('msin = %s\n' % msin)
            msout = msin.replace('.MS', '_avg.MS')
            f.write('msout = %s\n' % msout)
            f.write('steps=[avg]\n')
            f.write('avg.type = averager\n')
            f.write('avg.timestep = %d\n' % timestep)
            f.write('avg.freqstep = %d\n' % freqstep)
            f.close()
            self.logger.debug("Squashing %s to %s" % (msin, msout))
            if os.system('rm -f -r ' + msout) != 0:
                return 1
            if not self._execute(['NDPPP', basename + 'dmx_avg.parset']):
                return 1

            # Form avg output names.
            msin = outfile.replace('uv', rem)
            msout = msin.replace('.MS', '_avg.MS')
            avgoutnames.append(msout)
            msdem = msin.replace('.MS', '_avg_dem.MS')
            if os.system('rm -f -r ' + msdem) != 0:
                return 1

        self.logger.info("Starting the demixing algorithm")
        dmx.demixing(mstarget, mixingtable, avgoutnames, freqstep,
                     timestep, 4)
        self.logger.info("Finished the demixing algorithm")

        #
        # Run BBS on the demixed measurement sets
        #
        self.logger.info("Starting BBS run on demixed measurement sets")
        for i in remove:
            self.logger.info("Processing %s ..." % i)
            msin = outfile.replace('uv', i)
            msout = msin.replace('.MS', '_avg_dem.MS')
            vds_file = basename + i + '.vds'
            gds_file = basename + i + '.gds'
            self.logger.info("Creating vds & gds files...")
            if os.system('rm -f ' + vds_file + ' ' + gds_file) != 0:
                return 1
            if not self._execute(['makevds', clusterdesc, msout, vds_file]):
                return 1
            if not self._execute(['combinevds', gds_file, vds_file]):
                return 1

            self.logger.info("Starting first calibration run")
            command = [
                'calibrate', '-f', '--key', key,
                '--cluster-desc', clusterdesc,
                '--db', db_host, '--db-user', 'postgres',
                gds_file, os.path.join(demixdir, 'bbs_' + i + '.parset'),
                skymodel, working_dir
            ]
            if not self._execute(command):
                return 1

            self.logger.info("Generating smoothed instrument model")
            input_parmdb = os.path.join(msout, 'instrument')
            output_parmdb = os.path.join(msout, 'instrument_smoothed')
            # smoothparmdb indirectly creates a subprocess, so we must
            # make sure that the correct environment is set-up here.
            env = os.environ
            os.environ = self.environment
            smdx.smoothparmdb(input_parmdb, output_parmdb, half_window,
                              threshold)
            os.environ = env

            self.logger.info("Starting second calibration run, "
                             "using smoothed instrument model")
            command = [
                'calibrate', '--clean', '--skip-sky-db',
                '--skip-instrument-db', '--instrument-name',
                'instrument_smoothed', '--key', key,
                '--cluster-desc', clusterdesc,
                '--db', db_host, '--db-user', 'postgres',
                gds_file,
                os.path.join(demixdir, 'bbs_' + i + '_smoothcal.parset'),
                skymodel, working_dir
            ]
            if not self._execute(command):
                return 1

        # Form the list of input files and subtract.
self.logger.info("Subtracting removed sources from the data ...") demfiles = [ outfile.replace('uv', rem + '_avg_dem') for rem in remove ] sfa.subtract_from_averaged(mstarget.replace('.MS', '_avg.MS'), mixingtable, demfiles, mstarget.replace('.MS', '_sub.MS')) # We're done. return 0