def run_mpifxcorr(rootname, np, mpi=None, machine_file=None, mpifxcorr=None,
                  input_file=None, timeout=None):
    """Run mpifxcorr on rootname.input via MPI and wait for it to finish.

    rootname     -- experiment root; used to derive the .machine and .input
                    paths when machine_file/input_file are not given
    np           -- number of MPI processes to start
    mpi          -- mpirun executable (default observation.mpi)
    machine_file -- MPI machine file (default rootname + '.machine')
    mpifxcorr    -- mpifxcorr executable (default observation.mpifxcorr)
    input_file   -- mpifxcorr .input file (default rootname + '.input')
    timeout      -- pexpect timeout in seconds
                    (default observation.mpifxcorr_timeout)

    NOTE(review): this definition is shadowed by a later run_mpifxcorr in
    this module; only the last definition takes effect in Python.
    """
    # bug fix: only derive the paths from rootname when the caller did not
    # supply them (the original unconditionally overwrote both arguments)
    if machine_file is None:
        machine_file = os.path.abspath(rootname + '.machine')
    if input_file is None:
        input_file = os.path.abspath(rootname + '.input')
    if mpi is None:
        mpi = observation.mpi
    if mpifxcorr is None:
        # bug fix: the original assigned observation.mpifxcorr to a local
        # 'correlator', leaving mpifxcorr as None in the command below
        mpifxcorr = observation.mpifxcorr
    if timeout is None:
        timeout = observation.mpifxcorr_timeout
    check_outfile(input_file)
    import difxlog as log
    command = mpi + ' -nolocal -np ' + str(np) + ' -machinefile ' +\
        machine_file + ' ' + mpifxcorr + ' ' + input_file
    exec_time = get_parameter('EXECUTE TIME (SEC)', input_file)
    # bug fix: honour the computed timeout instead of a hard-coded 120 s.
    # NOTE(review): spawn() documents its third argument as a handler
    # *class*, but spawn_func is a plain function -- confirm the intended
    # handler (the later run_mpifxcorr passes mpifxcorrSpawnClass here).
    spawn(command, defreg, spawn_func, (time(), exec_time), timeout=timeout)
    log.info('Correlator Finished')
def spawn(command, reg=defreg, reclass=spawnClass, classobj=spawn_class_obj,
          timeout=None):
    """Spawn *command* under pexpect and drive it with a handler object.

    command  -- the command line to spawn
    reg      -- regexp (or list of regexps) passed to pexpect's expect
    reclass  -- handler class; its __init__ takes (classobj, child) and its
                run method takes the match index i, returning 0 to keep
                looping and a nonzero value once the child returns EOF
    classobj -- opaque object handed through to reclass.__init__
    timeout  -- seconds per expect call
                (default observation.spawn_timeout)

    Returns the first nonzero value produced by the handler's run method.
    """
    import difxlog as log
    if timeout is None:
        timeout = observation.spawn_timeout
    log.info('spawning ' + command)
    log.info('timeout ' + str(timeout) + 's')
    child = pspawn(command)
    handler = reclass(classobj, child)
    # keep pumping expect matches through the handler until it signals EOF
    while True:
        matched = child.expect(reg, timeout)
        result = handler.run(matched)
        if result != 0:
            break
    return result
def spawn_func(i, child, funcobj):
    """Handle one pexpect match from mpifxcorr output.

    i       -- index of the regexp that matched (0 = progress output,
               1 = EOF)
    child   -- the pexpect child process
    funcobj -- (start_time, exec_time) tuple captured at spawn time

    Returns 0 to keep the expect loop running, 1 on EOF.
    """
    if i == 0:
        start_time, exec_time = funcobj
        log.debug(child.before)
        match = retimestep.match(child.before)
        if match:
            timestep = int(match.group(2))
            # percentage of wall-clock ("realtime") speed since the start
            realtime = int(100. * timestep / (time() - start_time))
            message = ''.join(('Timestep ', match.group(2), ' / ',
                               exec_time, '. ',
                               str(timestep * 100 / int(exec_time)),
                               '% completed in ', str(realtime),
                               '% realtime.'))
            log.info(message)
        return 0
    # Could add some other regexps for detecting errors etc.
    # returning 2 or something like that
    if i == 1:
        return 1
def run(self, i):
    """Handle one pexpect match for mpifxcorr output.

    i -- index of the regexp that matched (0 = a progress line,
         1 = EOF / child exit).

    Returns 0 to keep the caller's expect loop running, 1 on EOF.
    Updates self.last_time / self.last_timestep as a side effect.
    """
    if i == 0:
        log.debug(self.child.before)
        # retimestep presumably matches mpifxcorr's per-timestep progress
        # output, with group(2) holding the timestep count -- TODO confirm
        a = retimestep.match(self.child.before)
        if a:
            this_time = time()
            this_timestep = float(a.group(2))
            if self.last_timestep == -1:
                # first progress line seen: no rate information yet
                realtime = 0
                time_elapsed = 0
                current_realtime = 0
                time_remaining = 0
            else:
                # average correlated-seconds per wall-clock second so far
                realtime = (this_timestep / (this_time - self.start_time))
                if realtime == 0:
                    # guard the division in time_remaining below
                    realtime = 1
                time_elapsed = this_time - self.start_time
                # instantaneous rate: one integration period per interval
                # between the last two progress lines
                current_realtime = self.int_time / (this_time - self.last_time)
                time_remaining = (self.exec_time - this_timestep) / realtime
            # df2hhms converts a fraction of a day into (hours, minutes,
            # seconds); the +0.5 rounds to the nearest second
            time_elapsed_str = "%3d:%02d:%02.0f" % df2hhms(
                (time_elapsed + 0.5) / 86400.)
            time_remaining_str = "%3d:%02d:%02.0f" % df2hhms(
                (time_remaining + 0.5) / 86400.)
            fullstring = "Completed %9.2f/%9.2fs(%2.0f%%)|Elapsed %s|Remaining %s|%3.0f%%" %\
                (this_timestep, self.exec_time,
                 100.0 * this_timestep / self.exec_time,
                 time_elapsed_str, time_remaining_str,
                 100.0 * current_realtime + 0.5)
            log.info(fullstring)
            # remember this progress line for the next rate calculation
            self.last_time = this_time
            self.last_timestep = this_timestep
        return 0
    # Could add some other regexps for detecting errors etc.
    # returning 2 or something like that
    if i == 1:
        return 1
def check_threads(machine_file, input_file):
    """Work out how many MPI processes will run and on which hosts.

    machine_file -- one host per line: the manager first, then one host per
                    datastream, then the core hosts
    input_file   -- mpifxcorr .input file; read for ACTIVE DATASTREAMS and
                    CORE CONF FILENAME

    Returns a dict with keys n_processes, manager, datastreams, cores and
    threads.  Raises RuntimeError on a faulty .threads file.

    NOTE(review): this definition is shadowed by a later check_threads in
    this module; only the last definition takes effect in Python.
    """
    # for our datastream there is simply one process per line in the machine
    # file so we could simply use that to work out the number of processes.
    # however this is more thorough.
    log.info('checking how processes are assigned')
    # one process for the master (FXManager) ...
    n_processes = 1
    # ... plus one per active datastream
    active_datastreams = int(get_parameter('ACTIVE DATASTREAMS', input_file))
    n_processes += active_datastreams
    thread_file = get_parameter('CORE CONF FILENAME', input_file)
    # fix: close both files deterministically (the original leaked them)
    mf = open(machine_file, 'r')
    try:
        tf = open(thread_file, 'r')
        try:
            # first machine file line names the manager host
            manager = next(mf).strip()
            # the next active_datastreams lines are the datastream hosts
            datastreams = []
            for i in range(active_datastreams):
                datastreams.append(next(mf).strip())
            # the first threads file line gives the core count; each
            # following line is the thread count for one core host
            cores = []
            threads = []
            ncores = int(next(tf).split(':')[1].strip())
            for i in range(ncores):
                try:
                    threads.append(int(next(tf)))
                except StopIteration:
                    # fix: portable raise form instead of py2-only
                    # "raise RuntimeError, msg"
                    raise RuntimeError('faulty .threads file')
                try:
                    cores.append(next(mf).strip())
                except StopIteration:
                    log.warning(
                        'warning, mismatch in number of cores and number of machines in machine file'
                    )
        finally:
            tf.close()
    finally:
        mf.close()
    n_processes += sum(threads)
    log.info('machine file ' + machine_file)
    log.info('thread file ' + thread_file)
    log.info('n_processes ' + str(n_processes))
    log.info('manager ' + manager)
    for i in range(active_datastreams):
        log.info('datastream ' + str(i + 1).rjust(4) + ' ' + datastreams[i])
    # fix: iterate over the cores actually read, so a short machine file
    # (warned about above) no longer raises IndexError here
    for i in range(len(cores)):
        log.info('core ' + str(i + 1).rjust(4) + ' ' + cores[i].rjust(2) +
                 ' ' + str(threads[i]).rjust(2) + ' thread(s)')
    return {
        'n_processes': n_processes,
        'manager': manager,
        'datastreams': datastreams,
        'cores': cores,
        'threads': threads
    }
def check_threads(machine_file, input_file):
    """Parse the machine file, assign the FXManager and datastream
    processes to hosts, and return the total number of available slots.

    machine_file -- OpenMPI-style machine file: "host [slots=N] [max_slots=N]"
                    per line, '#' starts a comment
    input_file   -- mpifxcorr .input file; read for ACTIVE DATASTREAMS and
                    CORE CONF FILENAME

    Returns np, the total number of slots across all hosts.
    Raises RuntimeError when there are not enough slots for the manager
    plus all datastreams.
    """
    log.info('checking how processes are assigned')
    active_datastreams = int(get_parameter('ACTIVE DATASTREAMS', input_file))
    thread_file = get_parameter('CORE CONF FILENAME', input_file)
    # the thread file is opened only to check it is readable; it is not
    # parsed here (NOTE(review): confirm that is intentional).
    # fix: close it instead of leaking the handle.
    tf = open(thread_file, 'r')
    tf.close()

    # parse the machine file into a list of {'host': ..., option: int} dicts
    mftable = []
    mf = open(machine_file, 'r')
    try:
        for line in mf:
            # strip comments, then whitespace, and skip empty lines
            # (bug fix: the original compared the raw line against '' so a
            # blank line -- still containing '\n' -- fell through and
            # crashed with IndexError on fields[0])
            line = line.split('#')[0].strip()
            if not line:
                continue
            fields = line.split()
            mfline = {'host': fields[0]}
            for entry in fields[1:]:
                key, value = entry.split('=')
                mfline[key] = int(value)
            mftable.append(mfline)
    finally:
        # fix: close the machine file (the original leaked it)
        mf.close()

    # remove unused nodes and add up the total number of processes
    np = 0
    nodes = []
    for host in mftable:
        # slot precedence: max_slots, then slots, then the OpenMPI default
        # of 1 ('has_key' replaced by dict.get, same precedence)
        nslots = host.get('max_slots', host.get('slots', 1))
        np += nslots
        if nslots > 0:
            nodes.append({'host': host['host'], 'np': nslots,
                          'ncores': nslots, 'processes': []})
        # fix: leftover debug print statements replaced with log.debug
        log.debug(host['host'] + ' ' + str(nslots) + ' ' + str(np))
    log.debug(str(nodes))

    # note which machine will be the manager
    # note which machines will be the datastreams
    if not np > active_datastreams:
        # fix: portable raise form instead of py2-only "raise E, msg"
        raise RuntimeError(
            "Not enough available slots to run FXManager and datastreams")
    manager = None
    datastreams = []
    datastreams_assigned = 0
    while datastreams_assigned < active_datastreams:
        for node in nodes:
            if not manager:
                # runs only once, on the very first node
                manager = node['host']
                node['ncores'] -= 1
                node['processes'].append('FXManager')
            elif datastreams_assigned < active_datastreams:
                datastreams_assigned += 1
                datastreams.append(node['host'])
                node['ncores'] -= 1
                node['processes'].append('datastream ' +
                                         str(datastreams_assigned))

    log.info('Machine file ' + machine_file)
    log.info('Thread file ' + thread_file)
    log.info('np ' + str(np))
    log.info('Hosts:')
    for node in nodes:
        log.info("    " + node['host'] + ':')
        for process in node['processes']:
            log.info("        " + process)
        if node['ncores'] > 0:
            log.info("        + " + str(node['ncores']) + " core(s)")
    log.info('')
    return np
def run_mpifxcorr(rootname, np, mpi=None, machine_file=None, mpifxcorr=None,
                  input_file=None, timeout=None):
    """Run mpifxcorr under MPI and monitor its progress until completion.

    rootname     -- experiment root; used to derive the .machine and .input
                    paths when machine_file/input_file are not given
    np           -- number of MPI processes to start
    mpi          -- mpirun executable (default observation.mpi)
    machine_file -- MPI machine file (default rootname + '.machine')
    mpifxcorr    -- mpifxcorr executable (default observation.mpifxcorr)
    input_file   -- mpifxcorr .input file (default rootname + '.input')
    timeout      -- pexpect timeout in seconds
                    (default observation.mpifxcorr_timeout)
    """
    # bug fix: machine_file/input_file defaulted to None but the derivation
    # from rootname was commented out, so omitting either argument crashed
    # on string concatenation below; derive them only when not supplied
    if machine_file is None:
        machine_file = os.path.abspath(rootname + '.machine')
    if input_file is None:
        input_file = os.path.abspath(rootname + '.input')
    if mpi is None:
        mpi = observation.mpi
    if mpifxcorr is None:
        mpifxcorr = observation.mpifxcorr
    if timeout is None:
        timeout = observation.mpifxcorr_timeout
    check_outfile(input_file)
    import difxlog as log
    # add ' -x LD_LIBRARY_PATH' here if the environment must be forwarded
    command = mpi + ' -np ' + str(np) +\
        ' -machinefile ' + machine_file + ' ' +\
        ' -byslot ' +\
        mpifxcorr + ' ' + input_file
    exec_time = get_parameter('EXECUTE TIME (SEC)', input_file)
    int_time = float(get_parameter('INT TIME (SEC)', input_file))
    # log the correlation parameters that determine speed and memory use
    log.info('Int time = ' + str(int_time))
    log.info('num channels = ' + get_parameter('NUM CHANNELS', input_file))
    log.info('Vis Buffer Length = ' +
             get_parameter('VIS BUFFER LENGTH', input_file))
    log.info('Blocks Per Send = ' +
             get_parameter('BLOCKS PER SEND', input_file))
    log.info('Data Buffer Factor = ' +
             get_parameter('DATA BUFFER FACTOR', input_file))
    log.info('num Data Segments = ' +
             get_parameter('NUM DATA SEGMENTS', input_file))
    spawn(command, defreg, mpifxcorrSpawnClass, (time(), exec_time, int_time),
          timeout=timeout)
    log.info('Correlator Finished')