def _run_rficonsole(self, rficonsole_executable, time_slice_dir,
                    time_slices):
    """
    _run_rficonsole runs the rficonsole application on the supplied
    timeslices in time_slices.
    """
    # Loop over all measurement sets
    rfi_temp_dir = os.path.join(time_slice_dir, "rfi_temp_dir")
    create_directory(rfi_temp_dir)

    try:
        rfi_console_proc_group = SubProcessGroup(self.logger)
        for time_slice in time_slices:
            # Each rficonsole needs its own working space for temp files
            temp_slice_path = os.path.join(rfi_temp_dir,
                                           os.path.basename(time_slice))
            create_directory(temp_slice_path)

            # Construct the rficonsole command
            self.logger.info(time_slice)
            command = [rficonsole_executable, "-indirect-read",
                       time_slice]
            self.logger.info("executing rficonsole command: {0}".format(
                " ".join(command)))

            # Add the command to the process group
            rfi_console_proc_group.run(command, cwd=temp_slice_path)

        # Wait for all to finish
        if rfi_console_proc_group.wait_for_finish() != None:
            raise Exception("an rfi_console_proc_group run failed!")
    finally:
        shutil.rmtree(rfi_temp_dir)
def test_start_without_jobs(self):
    process_group = SubProcessGroup(polling_interval=10)

    start_time = time.time()
    process_group.wait_for_finish()
    end_time = time.time()

    # The wait should complete without a polling interval
    self.assertTrue((end_time - start_time) < 10)
def test_start_without_jobs(self):
    process_group = SubProcessGroup(polling_interval=1)

    start_time = time.time()
    process_group.wait_for_finish()
    end_time = time.time()

    # The wait should complete without a polling interval
    self.assertTrue((end_time - start_time) < 1)
def add_beam_tables(self, time_slices_path_list):
    beamtable_proc_group = SubProcessGroup(self.logger)
    for ms_path in time_slices_path_list:
        self.logger.debug("makebeamtables start")
        cmd_string = "makebeamtables ms={0} overwrite=true".format(ms_path)
        self.logger.debug(cmd_string)
        beamtable_proc_group.run(cmd_string)

    if beamtable_proc_group.wait_for_finish() != None:
        raise Exception("a makebeamtables run failed!")

    self.logger.debug("makebeamtables finished")
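# Note on calling conventions: add_beam_tables above passes run() a single
# command string, while other snippets in this collection pass an argument
# list (optionally with a cwd). A minimal illustrative sketch of both forms,
# assuming SubProcessGroup is imported as in the surrounding code; the paths
# and the logger below are made up for illustration only.
import logging

logger = logging.getLogger(__name__)
proc_group = SubProcessGroup(logger)

# Form 1: a single command string, as used by add_beam_tables
proc_group.run("makebeamtables ms=/data/L123_SB000.MS overwrite=true")

# Form 2: an argument list plus a working directory, as used for rficonsole
proc_group.run(["rficonsole", "-indirect-read", "/data/L123_SB000.MS"],
               cwd="/tmp/rfi_workdir")

if proc_group.wait_for_finish() != None:
    raise Exception("a subprocess run failed!")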
def dispatch(self, logger, config, limiter, id, jobhost, jobport,
             error, killswitch):
    """
    Dispatch this job to the relevant compute node.

    Note that error is an instance of threading.Event, which will be set
    if the remote job fails for some reason.
    """
    self.id = id
    limiter[self.host].acquire()
    # Start the timer after we acquire the lock!
    time_info_start = time.time()
    try:
        if killswitch.isSet():
            logger.debug("Shutdown in progress: not starting remote job")
            self.results['returncode'] = 1
            error.set()
            return 1

        environment = {
            "PATH": os.environ.get('PATH'),
            "PYTHONPATH": os.environ.get('PYTHONPATH'),
            "LD_LIBRARY_PATH": os.environ.get('LD_LIBRARY_PATH'),
            "LOFARROOT": os.environ.get('LOFARROOT'),
            "LOFARENV": os.environ.get('LOFARENV', ''),
            "QUEUE_PREFIX": os.environ.get('QUEUE_PREFIX', '')
        }

        if "cores" in self.resources:
            environment["OMP_NUM_THREADS"] = str(self.resources["cores"])

        cmdarray = run_remote_command(
            config,
            logger,
            self.host,
            self.command,
            environment,
            arguments=[id, jobhost, jobport],
            resources=self.resources
        )

        # Run and wait for the process to finish.
        pg = SubProcessGroup(logger=logger, killSwitch=killswitch)
        pg.run(cmdarray)
        job_successful = (pg.wait_for_finish() is None)
    except Exception, e:
        logger.exception("Failed to run remote process %s (%s)" %
                         (self.command, str(e)))
        self.results['returncode'] = 1
        error.set()
        return 1
def test_alternating_output(self):
    process_group = SubProcessGroup(polling_interval=1)
    # print a lot of numbers
    cmd = '%s/output_stderr_stdout.sh' % (os.path.dirname(__file__) or ".",)

    start_time = time.time()
    # Start it multiple times
    for idx in range(2):
        process_group.run(cmd)
    process_group.wait_for_finish()
    end_time = time.time()

    self.assertTrue((end_time - start_time) < 1)
def run(self, bbs_executable, parset, ms_list_path, parmdb_list_path,
        sky_list_path):
    """
    imager_bbs functionality. Called by the framework, performing all the
    work.
    """
    self.logger.debug("Starting imager_bbs Node")
    # *********************************************************************
    # 1. Load mapfiles
    # Read the mapfiles into data maps: the master recipe added the single
    # path to a mapfile, which allows usage of the default data methods
    # (load_data_map)
    # TODO: Datamap
    ms_map = MultiDataMap.load(ms_list_path)
    parmdb_map = MultiDataMap.load(parmdb_list_path)
    sky_list = MultiDataMap.load(sky_list_path)
    source_db = sky_list[0].file[0]  # the sourcedb is the first file entry

    try:
        bbs_process_group = SubProcessGroup(self.logger,
                                            self.resourceMonitor)
        # *****************************************************************
        # 2. Start the bbs executable with data
        for (measurement_set, parmdm) in zip(ms_map[0].file,
                                             parmdb_map[0].file):
            command = [
                bbs_executable,
                "--sourcedb={0}".format(source_db),
                "--parmdb={0}".format(parmdm),
                measurement_set, parset]
            self.logger.info("Executing bbs command: {0}".format(
                " ".join(command)))

            bbs_process_group.run(command)

        # *****************************************************************
        # 3. Check the status of the processes
        if bbs_process_group.wait_for_finish() != None:
            self.logger.error("Failed bbs run detected. Aborting")
            return 1    # If bbs failed we need to abort: the concat
                        # is now corrupt

    except OSError, exception:
        self.logger.error("Failed to execute bbs: {0}".format(str(
            exception)))
        return 1
def test_limit_number_of_proc(self):
    process_group = SubProcessGroup(polling_interval=1)
    cmd = "sleep 2"  # each command waits for 2 seconds

    start_time = time.time()
    # Quickly start a large number of commands
    for idx in range(10):
        process_group.run(cmd)

    # If there were no serialization the test would take about 5 seconds;
    # with serialization it will take at least 10 seconds. Use 8 seconds
    # to have some safety margin against rounding errors.
    process_group.wait_for_finish()
    end_time = time.time()

    self.assertTrue((end_time - start_time) > 3)
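# A minimal sketch of how the serialization this test depends on could be
# made explicit, assuming the max_concurrent_processes constructor argument
# (used in a later snippet) caps the number of simultaneously running
# commands; the cap of 2 and the timings are illustrative only.
process_group = SubProcessGroup(polling_interval=1,
                                max_concurrent_processes=2)  # assumed semantics
for _ in range(10):
    process_group.run("sleep 2")
# With at most 2 concurrent sleeps, ten 2-second sleeps should need
# roughly 10 seconds of wall-clock time instead of about 2.
process_group.wait_for_finish()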
def run(self, bbs_executable, parset, ms_list_path, parmdb_list_path,
        sky_list_path):
    """
    imager_bbs functionality. Called by the framework, performing all the
    work.
    """
    self.logger.debug("Starting imager_bbs Node")
    # *********************************************************************
    # 1. Load mapfiles
    # Read the mapfiles into data maps: the master recipe added the single
    # path to a mapfile, which allows usage of the default data methods
    # (load_data_map)
    # TODO: Datamap
    ms_map = MultiDataMap.load(ms_list_path)
    parmdb_map = MultiDataMap.load(parmdb_list_path)
    sky_list = MultiDataMap.load(sky_list_path)
    source_db = sky_list[0].file[0]  # the sourcedb is the first file entry

    try:
        bbs_process_group = SubProcessGroup(self.logger,
                                            self.resourceMonitor)
        # *****************************************************************
        # 2. Start the bbs executable with data
        for (measurement_set, parmdm) in zip(ms_map[0].file,
                                             parmdb_map[0].file):
            command = [
                bbs_executable,
                "--sourcedb={0}".format(source_db),
                "--parmdb={0}".format(parmdm),
                measurement_set, parset
            ]
            self.logger.info("Executing bbs command: {0}".format(
                " ".join(command)))

            bbs_process_group.run(command)

        # *****************************************************************
        # 3. Check the status of the processes
        if bbs_process_group.wait_for_finish() != None:
            self.logger.error("Failed bbs run detected. Aborting")
            return 1    # If bbs failed we need to abort: the concat
                        # is now corrupt

    except OSError as exception:
        self.logger.error("Failed to execute bbs: {0}".format(
            str(exception)))
        return 1

    return 0
def dispatch(self, logger, config, limiter, id, jobhost, jobport,
             error, killswitch):
    """
    Dispatch this job to the relevant compute node.

    Note that error is an instance of threading.Event, which will be set
    if the remote job fails for some reason.
    """
    self.id = id
    limiter[self.host].acquire()
    # Start the timer after we acquire the lock!
    time_info_start = time.time()
    try:
        if killswitch.isSet():
            logger.debug("Shutdown in progress: not starting remote job")
            self.results['returncode'] = 1
            error.set()
            return 1

        cmdarray = run_remote_command(
            config,
            logger,
            self.host,
            self.command,
            {
                "PATH": os.environ.get('PATH'),
                "PYTHONPATH": os.environ.get('PYTHONPATH'),
                "LD_LIBRARY_PATH": os.environ.get('LD_LIBRARY_PATH'),
                "LOFARROOT": os.environ.get('LOFARROOT'),
                "LOFARENV": os.environ.get('LOFARENV', ''),
                "QUEUE_PREFIX": os.environ.get('QUEUE_PREFIX', '')
            },
            arguments=[id, jobhost, jobport],
            resources=self.resources)

        # Run and wait for the process to finish.
        pg = SubProcessGroup(logger=logger, killSwitch=killswitch)
        pg.run(cmdarray)
        job_successful = (pg.wait_for_finish() is None)
    except Exception, e:
        logger.exception("Failed to run remote process %s (%s)" %
                         (self.command, str(e)))
        self.results['returncode'] = 1
        error.set()
        return 1
def test_fd_bigger_than_1024(self):
    process_group = SubProcessGroup(polling_interval=1,
                                    max_concurrent_processes=1000)
    cmd = "sleep 2"
    # Each process uses 2 fds, so we only need 513 processes to ensure
    # fds > 1024
    for idx in range(513):
        process_group.run(cmd)
    process_group.wait_for_finish()
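# The 513-process count above presumably targets the classic select()
# limitation: select() cannot watch file descriptors at or above FD_SETSIZE
# (typically 1024), and each child keeps a stdout and a stderr pipe open.
# Below is an illustrative sketch -- not the SubProcessGroup implementation --
# of draining pipe output with select.poll(), which has no such limit.
import os
import select

def read_ready_output(processes, timeout_ms=1000):
    """Return the chunks currently readable from the stdout/stderr pipes of
    the given subprocess.Popen objects, using poll() instead of select()."""
    poller = select.poll()
    pipes = {}
    for proc in processes:
        for pipe in (proc.stdout, proc.stderr):
            if pipe is not None:
                poller.register(pipe, select.POLLIN)
                pipes[pipe.fileno()] = pipe

    chunks = []
    for fd, _event in poller.poll(timeout_ms):
        chunks.append(os.read(fd, 4096))  # read only what is available
    return chunks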
def run_rficonsole(rficonsole_executable, temp_dir, input_ms_list,
                   logger, resourceMonitor):
    """
    run_rficonsole runs the rficonsole application on the supplied
    timeslices in input_ms_list.
    This functionality has also been implemented in BBS.
    """
    # Loop over all measurement sets
    rfi_temp_dir = os.path.join(temp_dir, "rfi_temp_dir")
    create_directory(rfi_temp_dir)

    try:
        rfi_console_proc_group = SubProcessGroup(logger=logger,
                                                 usageStats=resourceMonitor)
        for time_slice in input_ms_list:
            # Each rficonsole needs its own working space for temp files
            temp_slice_path = os.path.join(rfi_temp_dir,
                                           os.path.basename(time_slice))
            create_directory(temp_slice_path)

            # Construct the rficonsole command
            logger.info(time_slice)
            command = [rficonsole_executable, "-indirect-read",
                       time_slice]
            logger.info("executing rficonsole command: {0}".format(
                " ".join(command)))

            # Add the command to the process group
            rfi_console_proc_group.run(command, cwd=temp_slice_path)

        # Wait for all to finish
        if rfi_console_proc_group.wait_for_finish() != None:
            raise Exception("an rfi_console_proc_group run failed!")
    finally:
        shutil.rmtree(rfi_temp_dir)
def run_rficonsole(rficonsole_executable, temp_dir, input_ms_list,
                   logger, resourceMonitor):
    """
    run_rficonsole runs the rficonsole application on the supplied
    timeslices in input_ms_list.
    This functionality has also been implemented in BBS.
    """
    # Loop over all measurement sets
    rfi_temp_dir = os.path.join(temp_dir, "rfi_temp_dir")
    create_directory(rfi_temp_dir)

    try:
        rfi_console_proc_group = SubProcessGroup(logger=logger,
                                                 usageStats=resourceMonitor)
        for time_slice in input_ms_list:
            # Each rficonsole needs its own working space for temp files
            temp_slice_path = os.path.join(rfi_temp_dir,
                                           os.path.basename(time_slice))
            create_directory(temp_slice_path)

            # Construct the rficonsole command
            logger.info(time_slice)
            command = [rficonsole_executable, "-indirect-read",
                       time_slice]
            logger.info("executing rficonsole command: {0}".format(
                " ".join(command)))

            # Add the command to the process group
            rfi_console_proc_group.run(command, cwd=temp_slice_path)

        # Wait for all to finish
        if rfi_console_proc_group.wait_for_finish() != None:
            raise Exception("an rfi_console_proc_group run failed!")
    finally:
        shutil.rmtree(rfi_temp_dir)
def _run_rficonsole(self, rficonsole_executable, time_slice_dir,
                    time_slices):
    """
    _run_rficonsole runs the rficonsole application on the supplied
    timeslices in time_slices.
    """
    # Loop over all measurement sets
    rfi_temp_dir = os.path.join(time_slice_dir, "rfi_temp_dir")
    create_directory(rfi_temp_dir)

    try:
        rfi_console_proc_group = SubProcessGroup(self.logger)
        for time_slice in time_slices:
            # Each rficonsole needs its own working space for temp files
            temp_slice_path = os.path.join(rfi_temp_dir,
                                           os.path.basename(time_slice))
            create_directory(temp_slice_path)

            # Construct the rficonsole command
            self.logger.info(time_slice)
            command = [rficonsole_executable, "-indirect-read",
                       time_slice]
            self.logger.info("executing rficonsole command: {0}".format(
                " ".join(command)))

            # Add the command to the process group
            rfi_console_proc_group.run(command, cwd=temp_slice_path)

        # Wait for all to finish
        if rfi_console_proc_group.wait_for_finish() != None:
            raise Exception("an rfi_console_proc_group run failed!")
    finally:
        shutil.rmtree(rfi_temp_dir)
def test_alternating_output(self):
    process_group = SubProcessGroup(polling_interval=10)
    # print a lot of numbers
    cmd = '%s/output_stderr_stdout.sh' % (os.path.dirname(__file__) or ".",)

    start_time = time.time()
    # Start it multiple times
    for idx in range(2):
        process_group.run(cmd)
    process_group.wait_for_finish()
    end_time = time.time()

    self.assertTrue((end_time - start_time) < 10)
        if os.path.exists(meta_dir) and os.path.exists(concat_ms):
            self.logger.info("Copy meta information to output measurementset")

            # Clear possible old data; this allows a rerun of the pipeline
            # if needed.
            if os.path.exists(meta_dir_target):
                shutil.rmtree(meta_dir_target)
            shutil.copytree(meta_dir, meta_dir_target)

        # *****************************************************************
        # 4. Copy the measurement set to the output directory
        # use msselect to copy all the data in the measurement sets
        cmd_string = "{0} in={1} out={2} baseline=* deep=True".format(
            msselect_executable, concat_ms, correlated_output_location)

        msselect_proc_group = SubProcessGroup(self.logger)
        msselect_proc_group.run(cmd_string)

        if msselect_proc_group.wait_for_finish() != None:
            self.logger.error("failed copy of measurement set to output dir")
            raise Exception("an MSselect run failed!")

        self.outputs["hdf5"] = "succes"
        self.outputs["image"] = output_image
        self.outputs["correlated"] = correlated_output_location

        return 0


if __name__ == "__main__":
def dispatch(self, logger, config, limiter, id, jobhost, jobport,
             error, killswitch):
    """
    Dispatch this job to the relevant compute node.

    Note that error is an instance of threading.Event, which will be set
    if the remote job fails for some reason.
    """
    self.id = id
    limiter[self.host].acquire()
    # Start the timer after we acquire the lock!
    time_info_start = time.time()
    try:
        if killswitch.isSet():
            logger.debug("Shutdown in progress: not starting remote job")
            self.results['returncode'] = 1
            error.set()
            return 1

        environment = {
            "PATH": os.environ.get('PATH'),
            "PYTHONPATH": os.environ.get('PYTHONPATH'),
            "LD_LIBRARY_PATH": os.environ.get('LD_LIBRARY_PATH'),
            "LOFARROOT": os.environ.get('LOFARROOT'),
            "LOFARENV": os.environ.get('LOFARENV', ''),
            "QUEUE_PREFIX": os.environ.get('QUEUE_PREFIX', '')
        }

        if "cores" in self.resources:
            environment["OMP_NUM_THREADS"] = str(self.resources["cores"])

        cmdarray = run_remote_command(config, logger, self.host,
                                      self.command, environment,
                                      arguments=[id, jobhost, jobport],
                                      resources=self.resources)

        # Run and wait for the process to finish.
        pg = SubProcessGroup(logger=logger, killSwitch=killswitch)
        pg.run(cmdarray)
        job_successful = (pg.wait_for_finish() is None)
    except Exception as e:
        logger.exception("Failed to run remote process %s (%s)" %
                         (self.command, str(e)))
        self.results['returncode'] = 1
        error.set()
        return 1
    finally:
        limiter[self.host].release()

    if not job_successful:
        logger.error("Remote process %s %s failed on %s" %
                     (self.command, self.arguments, self.host))
        error.set()

    # Add the duration of the job after the node has returned.
    time_info_end = time.time()
    self.results["job_duration"] = str(time_info_end - time_info_start)
    self.results['returncode'] = 0 if job_successful else 1

    logger.debug(
        "compute.dispatch results job {0}: {1}: {2}, {3}: {4} ".format(
            self.id, "job_duration", self.results["job_duration"],
            "returncode", self.results["returncode"]))

    return self.results["returncode"]
def _filter_bad_stations(self, time_slice_path_list,
                         asciistat_executable, statplot_executable,
                         msselect_executable):
    """
    A collection of scripts for finding and filtering bad stations:

    1. First, a number of statistics on the spread of the data is
       collected using the asciistat_executable.
    2. Secondly, these statistics are consumed by the statplot_executable,
       which produces a set of bad stations.
    3. In the final step the bad stations are removed from the dataset
       using msselect.

    REF: http://www.lofar.org/wiki/lib/exe/fetch.php?media=msss:pandeymartinez-week9-v1p2.pdf
    """
    # Run asciistat to collect statistics about the ms
    self.logger.info("Filtering bad stations")
    self.logger.debug("Collecting statistical properties of input data")
    asciistat_output = []
    asciistat_proc_group = SubProcessGroup(self.logger)
    for ms in time_slice_path_list:
        output_dir = ms + ".filter_temp"
        create_directory(output_dir)
        asciistat_output.append((ms, output_dir))

        cmd_string = "{0} -i {1} -r {2}".format(asciistat_executable,
                                                ms, output_dir)
        asciistat_proc_group.run(cmd_string)

    if asciistat_proc_group.wait_for_finish() != None:
        raise Exception("an ASCIIStats run failed!")

    # Determine the stations to remove
    self.logger.debug("Select bad stations depending on collected stats")
    asciiplot_output = []
    asciiplot_proc_group = SubProcessGroup(self.logger)
    for (ms, output_dir) in asciistat_output:
        ms_stats = os.path.join(
            output_dir, os.path.split(ms)[1] + ".stats")

        cmd_string = "{0} -i {1} -o {2}".format(statplot_executable,
                                                ms_stats, ms_stats)
        asciiplot_output.append((ms, ms_stats))
        asciiplot_proc_group.run(cmd_string)

    if asciiplot_proc_group.wait_for_finish() != None:
        raise Exception("an ASCIIplot run failed!")

    # Remove the bad stations
    self.logger.debug("Use msselect to remove bad stations")
    msselect_output = {}
    msselect_proc_group = SubProcessGroup(self.logger)

    for ms, ms_stats in asciiplot_output:
        # Parse the .tab file containing the bad stations
        station_to_filter = []
        file_pointer = open(ms_stats + ".tab")

        for line in file_pointer.readlines():
            # Skip header lines
            if line[0] == "#":
                continue

            entries = line.split()
            # If the current station is bad (the last entry on the line)
            if entries[-1] == "True":
                # add the name of the station
                station_to_filter.append(entries[1])

        # If this measurement set does not contain baselines to skip, do
        # not filter and provide the original ms as output
        if len(station_to_filter) == 0:
            msselect_output[ms] = ms
            continue

        ms_output_path = ms + ".filtered"
        msselect_output[ms] = ms_output_path

        # Use msselect to remove the stations from the ms
        msselect_baseline = "!{0}".format(",".join(station_to_filter))
        cmd_string = "{0} in={1} out={2} baseline={3} deep={4}".format(
            msselect_executable, ms, ms_output_path,
            msselect_baseline, "False")
        msselect_proc_group.run(cmd_string)

    if msselect_proc_group.wait_for_finish() != None:
        raise Exception("an MSselect run failed!")

    filtered_list_of_ms = []
    # The order of the inputs needs to be preserved when producing the
    # filtered output!
    for input_ms in time_slice_path_list:
        filtered_list_of_ms.append(msselect_output[input_ms])

    return filtered_list_of_ms
def _filter_bad_stations(self, time_slice_path_list,
                         asciistat_executable, statplot_executable,
                         msselect_executable):
    """
    A collection of scripts for finding and filtering bad stations:

    1. First, a number of statistics on the spread of the data is
       collected using the asciistat_executable.
    2. Secondly, these statistics are consumed by the statplot_executable,
       which produces a set of bad stations.
    3. In the final step the bad stations are removed from the dataset
       using msselect.

    REF: http://www.lofar.org/wiki/lib/exe/fetch.php?media=msss:pandeymartinez-week9-v1p2.pdf
    """
    # Run asciistat to collect statistics about the ms
    self.logger.info("Filtering bad stations")
    self.logger.debug("Collecting statistical properties of input data")
    asciistat_output = []
    asciistat_proc_group = SubProcessGroup(self.logger)
    for ms in time_slice_path_list:
        output_dir = ms + ".filter_temp"
        create_directory(output_dir)
        asciistat_output.append((ms, output_dir))

        cmd_string = "{0} -i {1} -r {2}".format(asciistat_executable,
                                                ms, output_dir)
        asciistat_proc_group.run(cmd_string)

    if asciistat_proc_group.wait_for_finish() != None:
        raise Exception("an ASCIIStats run failed!")

    # Determine the stations to remove
    self.logger.debug("Select bad stations depending on collected stats")
    asciiplot_output = []
    asciiplot_proc_group = SubProcessGroup(self.logger)
    for (ms, output_dir) in asciistat_output:
        ms_stats = os.path.join(output_dir,
                                os.path.split(ms)[1] + ".stats")

        cmd_string = "{0} -i {1} -o {2}".format(statplot_executable,
                                                ms_stats, ms_stats)
        asciiplot_output.append((ms, ms_stats))
        asciiplot_proc_group.run(cmd_string)

    if asciiplot_proc_group.wait_for_finish() != None:
        raise Exception("an ASCIIplot run failed!")

    # Remove the bad stations
    self.logger.debug("Use msselect to remove bad stations")
    msselect_output = {}
    msselect_proc_group = SubProcessGroup(self.logger)

    for ms, ms_stats in asciiplot_output:
        # Parse the .tab file containing the bad stations
        station_to_filter = []
        file_pointer = open(ms_stats + ".tab")

        for line in file_pointer.readlines():
            # Skip header lines
            if line[0] == "#":
                continue

            entries = line.split()
            # If the current station is bad (the last entry on the line)
            if entries[-1] == "True":
                # add the name of the station
                station_to_filter.append(entries[1])

        # If this measurement set does not contain baselines to skip, do
        # not filter and provide the original ms as output
        if len(station_to_filter) == 0:
            msselect_output[ms] = ms
            continue

        ms_output_path = ms + ".filtered"
        msselect_output[ms] = ms_output_path

        # Use msselect to remove the stations from the ms
        msselect_baseline = "!{0}".format(",".join(station_to_filter))
        cmd_string = "{0} in={1} out={2} baseline={3} deep={4}".format(
            msselect_executable, ms, ms_output_path,
            msselect_baseline, "False")
        msselect_proc_group.run(cmd_string)

    if msselect_proc_group.wait_for_finish() != None:
        raise Exception("an MSselect run failed!")

    filtered_list_of_ms = []
    # The order of the inputs needs to be preserved when producing the
    # filtered output!
    for input_ms in time_slice_path_list:
        filtered_list_of_ms.append(msselect_output[input_ms])

    return filtered_list_of_ms
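# To make the .tab parsing above concrete: a hypothetical example line in the
# shape the parser expects. The real statplot column layout is not shown in
# the source; all the parser relies on is that '#' lines are headers, column 1
# holds the station name, and the last column holds the bad-station flag.
example_line = "1  RS509HBA  3.877  True\n"
entries = example_line.split()
assert entries[-1] == "True"     # last column: station is flagged as bad
assert entries[1] == "RS509HBA"  # column 1: station name to filter out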
        self.logger.info(
            "Copy meta information to output measurementset")

        # Clear possible old data; this allows a rerun of the pipeline
        # if needed.
        if os.path.exists(meta_dir_target):
            shutil.rmtree(meta_dir_target)
        shutil.copytree(meta_dir, meta_dir_target)

        # *****************************************************************
        # 4. Copy the measurement set to the output directory
        # use msselect to copy all the data in the measurement sets
        cmd_string = "{0} in={1} out={2} baseline=* deep=True".format(
            msselect_executable, concat_ms, correlated_output_location)

        msselect_proc_group = SubProcessGroup(self.logger)
        msselect_proc_group.run(cmd_string)

        if msselect_proc_group.wait_for_finish() != None:
            self.logger.error("failed copy of measurement set to output dir")
            raise Exception("an MSselect run failed!")

        self.outputs["hdf5"] = "succes"
        self.outputs["image"] = output_image
        self.outputs["correlated"] = correlated_output_location

        return 0


if __name__ == "__main__":
    _JOBID, _JOBHOST, _JOBPORT = sys.argv[1:4]
def run(self, bbs_executable, parset, ms_list_path, parmdb_list_path,
        sky_list_path, concat_ms_path, major_cycle):
    """
    selfcal_bbs functionality. Called by the framework, performing all the
    work.
    """
    self.logger.debug("Starting selfcal_bbs Node")
    # *********************************************************************
    # 1. Load mapfiles
    # Read the mapfiles into data maps: the master recipe added the single
    # path to a mapfile, which allows usage of the default data methods
    # (load_data_map)
    # TODO: Datamap
    ms_map = MultiDataMap.load(ms_list_path)
    parmdb_map = MultiDataMap.load(parmdb_list_path)
    sky_list = MultiDataMap.load(sky_list_path)
    source_db = sky_list[0].file[0]  # the sourcedb is the first file entry

    try:
        bbs_process_group = SubProcessGroup(self.logger,
                                            self.resourceMonitor)
        # *****************************************************************
        # 2. Start the bbs executable with data
        # The data is located in multimaps; we need the first entry.
        # TODO: This is not 'nice' usage of the multimap
        for (measurement_set, parmdm) in zip(ms_map[0].file,
                                             parmdb_map[0].file):
            command = [
                bbs_executable,
                "--sourcedb={0}".format(source_db),
                "--parmdb={0}".format(parmdm),
                measurement_set, parset
            ]
            self.logger.info("Executing bbs command: {0}".format(
                " ".join(command)))
            bbs_process_group.run(command)

        # *****************************************************************
        # 3. Check the status of the processes
        if bbs_process_group.wait_for_finish() != None:
            self.logger.error("Failed bbs run detected. Aborting")
            return 1

    except OSError as exception:
        self.logger.error("Failed to execute bbs: {0}".format(
            str(exception)))
        return 1

    # *********************************************************************
    # 4. Concat in time, after bbs calibration, of your MSs using
    # msconcat (pyrap.tables module) (added by N.Vilchez)
    # This step has to be performed at this location, because the bbs run
    # might add additional columns not present in the original ms, and
    # therefore not produced in the concat done in the prepare phase.
    # redmine issue #6021
    pt.msconcat(ms_map[0].file, concat_ms_path, concatTime=True)

    # *********************************************************************
    # 5. Copy the time slices directory to a new one.
    # This is done for debugging purposes: the copy is not used for
    # anything; the actual selfcal steps are done in place.
    # (added by N.Vilchez)
    # The save location is created relative to the concat.ms; we could also
    # use the self.scratch_directory from the toplevel recipe, but this
    # would need an additional ingredient.
    # This is a 'debugging' step and should never ever cause a failure of
    # the pipeline.
    try:
        working_dir = os.path.dirname(concat_ms_path)
        time_slice_dir = os.path.join(working_dir, 'time_slices')
        time_slice_copy_dir = os.path.join(
            working_dir, 'time_slices_cycle_{0}'.format(major_cycle))

        cmd = "cp -r {0} {1}".format(time_slice_dir, time_slice_copy_dir)
        os.system(cmd)
    except:
        self.logger.warn(
            "Debug copy of temporary files failed: continuing operations")
        pass  # do nothing

    return 0