def trig_cluster_setup(job, category, trigfile, outdir):
    """
    Set up a CondorDAGNode for a trigger clustering job.
    """
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('trig-file', trigfile)
    node.add_var_opt('output-dir', os.path.abspath(outdir))
    return node

def create_dag_node(self):
    """
    Return a CondorDAGNode that represents this entire DAG.
    """
    dir, fname = os.path.split(self.get_dag_path())
    job = pipeline.CondorDAGManJob(fname, dir)
    node = pipeline.CondorDAGNode(job)
    return node

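# Illustrative sketch (not part of the original module): the node returned by
# create_dag_node() wraps this DAG in a condor_dagman job, so the whole
# workflow can be nested inside an outer CondorDAG.  The names below
# (analysis_dag, outer_log, outer_name) are assumptions for the example;
# analysis_dag is assumed to be an instance of the class that defines
# create_dag_node().
def _example_nest_dag(analysis_dag, outer_log, outer_name):
    outer_dag = pipeline.CondorDAG(outer_log)
    outer_dag.set_dag_file(outer_name)
    # the sub-DAG appears as a single node in the outer workflow
    subdag_node = analysis_dag.create_dag_node()
    outer_dag.add_node(subdag_node)
    return outer_dag
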
def injcombiner_setup(job, category, outdir, fmcache, injpattern, inclination):
    """
    Set up a CondorDAGNode for an injection combiner job.
    """
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('inj-cache', fmcache)
    node.add_var_opt('output-dir', outdir)
    node.add_var_opt('inj-string', injpattern)
    node.add_var_opt('max-inclination', inclination)
    return node

def onoff_efficiency_setup(job, category, outdir, segdir, offsource, onsource,\
                           vetodir=None):
    """
    Set up a CondorDAGNode for an on/off-source efficiency job.
    """
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('output-path', outdir)
    node.add_var_opt('offsource-file', offsource)
    node.add_var_opt('onsource-file', onsource)
    node.add_var_opt('segment-dir', segdir)
    if vetodir:
        node.add_var_opt('veto-directory', vetodir)
    return node

def horizon_distance_setup(job, category, ifotag, grb, onoffcache,\
                           grbdir, outdir):
    """
    Set up a CondorDAGNode for a horizon distance job.
    """
    # setup node
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('ifo-tag', ifotag)
    node.add_var_opt('grb-xml', "%s/grb%s.xml" % (os.path.abspath(grbdir), grb))
    node.add_var_opt('cache', os.path.abspath(onoffcache))
    node.add_var_opt('output-dir', os.path.abspath(outdir))
    return node

def sbv_setup(job, category, trigfile, grb, outdir, grbdir, vetodir=None,\
              injfile=None):
    """
    Set up a CondorDAGNode for a signal-based veto (sbv) job.
    """
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('trig-file', trigfile)
    node.add_var_opt('grb-name', grb)
    node.add_var_opt('segment-dir', grbdir)
    node.add_var_opt('output-path', os.path.abspath(outdir))
    if vetodir:
        node.add_var_opt('veto-directory', vetodir)
    if injfile:
        node.add_var_arg('--inj-file %s ' % injfile)
    return node

def injection_efficiency_setup(job, category, outdir, segdir, offsource,\
                               onsource, injrun, cp, found, missed,\
                               vetodir=None):
    """
    Set up a CondorDAGNode for an injection efficiency job.
    """
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('output-path', outdir)
    node.add_var_opt('upper-inj-dist', cp.getfloat(injrun, 'max-distance') / 1000.)
    node.add_var_opt('lower-inj-dist', cp.getfloat(injrun, 'min-distance') / 1000.)
    node.add_var_opt('offsource-file', offsource)
    node.add_var_opt('onsource-file', onsource)
    node.add_var_opt('found-file', found)
    node.add_var_opt('missed-file', missed)
    node.add_var_opt('segment-dir', segdir)
    if vetodir:
        node.add_var_opt('veto-directory', vetodir)
    return node

def trig_combiner_setup(job, category, ifotag, usertag, grb, onoffcache,\
                        grbdir, numtrials, outdir, timeslidecache=None,\
                        slidetag=None):
    """
    Set up a CondorDAGNode for a trigger combiner job.
    """
    # setup node
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('ifo-tag', ifotag)
    node.add_var_opt('user-tag', usertag)
    if slidetag:
        node.add_var_opt('slide-tag', slidetag)
    node.add_var_opt('grb-name', grb)
    node.add_var_opt('segment-dir', os.path.abspath(grbdir))
    if onoffcache:
        node.add_var_opt('cache', os.path.abspath(onoffcache))
    node.add_var_opt('num-trials', numtrials)
    node.add_var_opt('output-dir', os.path.abspath(outdir))
    if timeslidecache:
        node.add_var_opt('slide-cache', timeslidecache)
    return node

def injfind_setup(job, category, injdir, injrun, ifotag, grb,\
                  datastart, dataduration):
    """
    Set up a CondorDAGNode for an injection-finding job, building the
    injection and trigger cache files it needs.
    """
    # construct cache file
    injcachefile = '%s/HL-INJECTION_GRB%s_%s-%s-%s.cache'\
                   % (injdir, grb, injrun, datastart, dataduration)
    # if cache does not exist, make one
    if not os.path.isfile(injcachefile):
        injfiles = glob.glob('%s/HL-INJECTION_GRB%s_%s_*-%s-%s.xml'\
                             % (injdir, grb, injrun, datastart, dataduration))
        injcache = lal.Cache.from_urls(injfiles)
        injcache.tofile(open(injcachefile, 'w'))
    # construct trigger cache file
    trigcachefile = '%s/%s-INSPIRAL_HIPE_GRB%s_%s-%s-%s.cache'\
                    % (injdir, ifotag, grb, injrun, datastart, dataduration)
    # initialise node
    node = pipeline.CondorDAGNode(job)
    node.set_category(category)
    node.add_var_opt('cache', trigcachefile)
    node.add_var_opt('inj-cache', injcachefile)
    return node

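# Illustrative usage sketch (not part of the original module): how two of the
# helpers above might be wired into a workflow.  The job objects, the DAG, and
# the argument values are assumptions made for the example only.
def _example_grb_workflow(dag, trig_combiner_job, trig_cluster_job,
                          ifotag, usertag, grb, onoffcache, grbdir,
                          numtrials, trigfile, outdir):
    combiner_node = trig_combiner_setup(trig_combiner_job, 'trig_combiner',
                                        ifotag, usertag, grb, onoffcache,
                                        grbdir, numtrials, outdir)
    cluster_node = trig_cluster_setup(trig_cluster_job, 'trig_cluster',
                                      trigfile, outdir)
    # cluster only after the combiner has produced its triggers
    cluster_node.add_parent(combiner_node)
    dag.add_node(combiner_node)
    dag.add_node(cluster_node)
    return combiner_node, cluster_node
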
#    tsnode.set_end(analysis_seg[1])
#    tsnode.add_var_opt('ifo',ifo)
#    tsnode.add_file_arg(output_name)
#
#    tsnode.add_parent(lwadd)
#    dag.add_node(tsnode)

# sicluster works in-place, so copy unclustered triggers to
# new files for 30 ms and 16 sec clustering
clustered_30ms_name = output_name.replace('UNCLUSTERED',
                                          '30MILLISEC_CLUSTERED')
clustered_16s_name = output_name.replace('UNCLUSTERED',
                                         '16SEC_CLUSTERED')

for cname in [clustered_30ms_name, clustered_16s_name]:
    cpnode = pipeline.CondorDAGNode(cp_job)
    cpnode.add_file_arg(output_name)
    cpnode.add_file_arg(cname)
    cpnode.add_parent(lwadd)
    dag.add_node(cpnode)

    if cname == clustered_16s_name:
        sinode = inspiral.InspiralAnalysisNode(si_job_coarse)
    else:
        sinode = inspiral.InspiralAnalysisNode(si_job_fine)
    sinode.add_file_arg(cname)
    sinode.add_parent(cpnode)
    dag.add_node(sinode)

# write the dag
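# (assumed completion of the step above -- the standard glue.pipeline closing
#  calls used by the other scripts in this collection)
#
#   dag.write_sub_files()
#   dag.write_dag()
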
#position
myJob.add_condor_cmd('transfer_input_files', '$(macroFileList)')
myJob.add_condor_cmd('when_to_transfer_output', 'on_exit')
myJob.add_condor_cmd('initialdir', outputResultsPath)
buildDir(outputResultsPath)

#Setup dag nodes
#Loop over files to process
if not cp.has_section('pylibraryfiles'):
    print "NO [pylibraryfiles] section!\n"
    os.abort()
else:
    libraryFile = os.path.expanduser(cp.get('pylibraryfiles', 'pyutilfile'))
if not os.path.exists(str(libraryFile)):
    print "ERROR: Library file not found."
    os.abort()

for thisFile in listOfFiles:
    myFile = thisFile.strip("\n")
    thisNode = pipeline.CondorDAGNode(myJob)
    thisNode.add_macro('macroFileList', str(myFile) + "," + libraryFile)
    if tsUniverse == 'local':
        thisNode.add_macro('macroFileToProcess', myFile)
    else:
        thisNode.add_macro('macroFileToProcess', os.path.basename(myFile))
    myDag.add_node(thisNode)

#Write out the files that constitute the dag
myDag.write_sub_files()
myDag.write_dag()
mkdir(log_dir)  # Make a directory to hold log files of jobs

###
### Configuration 0: Fit job
###
if opts.workflow == 'single' or opts.workflow == 'fit':
    if opts.workflow == 'fit':
        cip_args += ' --fit-save-gp my_fit.pkl'
    single_job, single_job_name = write_CIP_sub(
        tag='CIP', log_dir=log_dir, arg_str=cip_args,
        request_memory=opts.request_memory)
    single_job.write_sub_file()

    cip_node = pipeline.CondorDAGNode(single_job)
    cip_node.add_macro("macroevent", 0)
    cip_node.set_category("CIP")
    dag.add_node(cip_node)

if opts.workflow == 'posterior' or opts.workflow == 'fit+posterior':
    if opts.workflow == 'fit+posterior':
        cip_args_fit = cip_args + ' --fit-save-gp my_fit.pkl'
        # ensure output filenames are unique if multiple runs are performed
        cip_args_fit += ' --fname-output-integral integral_fit'
        cip_args_fit += ' --fname-output-samples integral_fit'
    fit_job, fit_job_name = write_CIP_sub(
        tag='CIP_fit', log_dir=log_dir, arg_str=cip_args_fit,
        request_memory=opts.request_memory)
    fit_job.write_sub_file()
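# Illustrative continuation (an assumption, mirroring the 'single' branch
# above; not necessarily the original script): the fit job would then be
# wrapped in a DAG node in the same way.
#
#   fit_node = pipeline.CondorDAGNode(fit_job)
#   fit_node.add_macro("macroevent", 0)
#   fit_node.set_category("CIP")
#   dag.add_node(fit_node)
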
# (keyword arguments continuing a job-construction call that begins before
# this excerpt)
        adapt_floor_level=opts.adapt_floor_level,
        adapt_weight_exponent=opts.adapt_weight_exponent,
        skymap_file=opts.skymap_file,
        write_eff_lambda=opts.write_eff_lambda,
        write_deff_lambda=opts.write_deff_lambda)
ile_job_type.write_sub_file()

if use_bayespe_postproc:
    if not os.path.exists(opts.web_output):
        os.makedirs(opts.web_output)
    bpp_plot_job_type, bpp_plot_job_name = dagutils.write_bayes_pe_postproc_sub(
        tag="bayes_pp_plot",
        log_dir=opts.log_directory,
        web_dir=opts.web_output)
    bpp_plot_job_type.write_sub_file()
    bpp_plot_node = pipeline.CondorDAGNode(bpp_plot_job_type)
    bpp_plot_node.set_category("PLOT")
    bpp_plot_node.set_pre_script(dagutils.which("bayes_pe_preprocess"))
    ppdag.add_node(bpp_plot_node)

#
# Make the posterior plot here since we need to make it the child of every sql
# node in the DAG
#
if use_ile_postproc:
    pos_plot_job_type, pos_plot_job_name = dagutils.write_posterior_plot_sub(
        tag="pos_plot", log_dir=opts.log_directory)
    pos_plot_job_type.write_sub_file()
    pos_plot_node = pipeline.CondorDAGNode(pos_plot_job_type)
    pos_plot_node.set_pre_script(dagutils.which("coalesce.sh"))
    pos_plot_node.set_category("PLOT")
if opts.injections:
    injfile = os.path.abspath(opts.injections)
else:
    injfile = os.path.join(rundir, 'priorsamples.xml')
    approx = prior_cp.get('engine', 'approx')
    prior2injexe = prior_cp.get('condor', 'pos_to_sim_burst')
    prior2injjob = pipeline.CondorDAGJob('vanilla', prior2injexe)
    if main_cp.has_option('analysis', 'accounting_group'):
        prior2injjob.add_condor_cmd('accounting_group',
                                    main_cp.get('analysis', 'accounting_group'))
    prior2injjob.set_sub_file(convertsub)
    prior2injjob.set_stderr_file(converterr)
    prior2injjob.set_stdout_file(convertout)
    prior2injjob.add_condor_cmd('getenv', 'True')
    prior2injnode = pipeline.CondorDAGNode(prior2injjob)
    prior2injnode.add_var_opt('output', injfile)
    prior2injnode.add_var_opt('num-of-injs', str(opts.trials))
    prior2injnode.add_var_opt('approx', approx)
    prior2injnode.add_var_arg(priorfile)
    prior2injnode.add_parent(priordagnode)

# Create the pipeline based on the injections
#main_cp.set('input','injection-file',injfile)
main_cp.set('input', 'gps-start-time', str(trig_time - 1000))
main_cp.set('input', 'gps-end-time', str(trig_time + 1000))
maindag = pipe_utils.LALInferencePipelineDAG(main_cp)
maindag.set_dag_file(os.path.join(maindir, 'lalinference_pipeline'))
maindagjob = pipeline.CondorDAGManJob(maindag.get_dag_file(), dir=maindir)
maindagnode = pipeline.CondorDAGManNode(maindagjob)
maindag.config.set('input', 'burst-injection-file', injfile)
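# Illustrative sketch (assumption, not the original script): the conversion
# node and the main analysis sub-DAG would typically be chained into a single
# outer DAG so that condor_dagman enforces the ordering.  `uberdag` is a
# hypothetical outer pipeline.CondorDAG instance.
#
#   maindagnode.add_parent(prior2injnode)
#   uberdag.add_node(prior2injnode)
#   uberdag.add_node(maindagnode)
#   uberdag.write_sub_files()
#   uberdag.write_dag()
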
#print "Computing times:"
#print gps_range
#print gps_stride_per_job
for i in range(gps_range[0], gps_range[1], gps_stride_per_job):
    times.append((i, i + gps_stride_per_job))
#print times

# now times contains the start and end for each job in the dag
for i in times:
    #cmds:
    #time1 = i.split(' ')[-2]
    time1 = i[0]
    #time2 = i.split(' ')[-1]
    time2 = i[1]
    ifo = interferometer
    #node = subFile.create_node()
    node = pipeline.CondorDAGNode(subFile)
    node.add_var_arg(ifo)
    node.add_var_arg(str(time1))
    node.add_var_arg(str(time2))
    dag.add_node(node)

print "Writing dag file:"
print dag.get_dag_file()
dag.write_dag()
#print "Writing sub file:"
#print dag.get_sub_file()
#print dag.get_jobs()
dag.write_sub_files()

print "Executable and DAG created, please run dag by submitting:"
print "condor_submit_dag " + run_dir + 's6publish.dag'
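# For reference, a minimal sketch of how the `subFile` job used above might be
# configured (an assumption -- the real definition lives earlier in this
# script; `publish_exe` is a hypothetical executable path):
#
#   subFile = pipeline.CondorDAGJob('vanilla', publish_exe)
#   subFile.set_sub_file(os.path.join(run_dir, 's6publish.sub'))
#   subFile.set_stdout_file(os.path.join(run_dir, 'logs', 'publish-$(cluster).out'))
#   subFile.set_stderr_file(os.path.join(run_dir, 'logs', 'publish-$(cluster).err'))
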
def main(args=None):
    parser = create_parser()
    args = parser.parse_args(args=args)

    # apply verbosity to logger
    args.verbose = max(5 - args.verbose, 0)
    logger.setLevel(args.verbose * 10)

    # validate command line arguments
    if args.ifo is None:
        parser.error("Cannot determine IFO prefix from system, "
                     "please pass --ifo on the command line")
    if args.executable is None:
        parser.error("Cannot find omicron on path, please pass "
                     "--executable on the command line")

    # validate processing options
    if all((args.skip_root_merge, args.skip_hdf5_merge,
            args.skip_ligolw_add, args.skip_gzip, not args.archive)):
        args.skip_postprocessing = True
    if args.archive:
        argsd = vars(args)
        for arg in ['skip-root-merge', 'skip-hdf5-merge',
                    'skip-ligolw-add', 'skip-gzip']:
            if argsd[arg.replace('-', '_')]:
                parser.error("Cannot use --%s with --archive" % arg)

    # check conflicts
    if args.gps is None and args.cache_file is not None:
        parser.error("Cannot use --cache-file in 'online' mode, "
                     "please use --cache-file with --gps")

    # extract key variables
    ifo = args.ifo
    group = args.group
    online = args.gps is None

    # format file-tag as underscore-delimited upper-case string
    filetag = args.file_tag
    if filetag:
        filetag = re.sub(r'[:_\s-]', '_', filetag).rstrip('_').strip('_')
        if const.OMICRON_FILETAG.lower() in filetag.lower():
            afiletag = filetag
        else:
            afiletag = '%s_%s' % (filetag, const.OMICRON_FILETAG.upper())
        filetag = '_%s' % filetag
    else:
        filetag = ''
        afiletag = const.OMICRON_FILETAG.upper()

    logger.info("--- Welcome to the Omicron processor ---")

    # set up containers to keep track of files that we create here
    tempfiles = []
    keepfiles = []

    # check rescue against --dagman-option force
    if args.rescue and args.dagman_option.count('force') > 1:
        parser.error('--rescue is incompatible with --dagman-option force')
    elif args.rescue:
        args.dagman_option.pop(0)
        logger.info(
            "Running in RESCUE mode - the workflow will be "
            "re-generated in memory without any files being written",
        )

    # set omicron version for future use
    omicronv = utils.get_omicron_version(args.executable)
    const.OMICRON_VERSION = str(omicronv)
    os.environ.setdefault('OMICRON_VERSION', str(omicronv))
    logger.debug('Omicron version: %s' % omicronv)

    # -- parse configuration file and get parameters --------------------------

    cp = configparser.ConfigParser()
    cp.read(args.config_file)

    # validate
    if not cp.has_section(group):
        raise configparser.NoSectionError(group)

    # get params
    channels = cp.get(group, 'channels').strip('\n').rstrip('\n').split('\n')
    try:  # allow two-column 'channel samplerate' format
        channels, crates = zip(*[c.split(' ', 1) for c in channels])
    except ValueError:
        crates = []
    else:
        crates = set(crates)
    logger.debug("%d channels read" % len(channels))
    for i in range(len(channels) - 1, -1, -1):  # remove excluded channels
        c = channels[i]
        if c in args.exclude_channel:
            channels.pop(i)
            logger.debug(" removed %r" % c)
    logger.debug("%d channels to process" % len(channels))
    cp.set(group, 'channels', '\n'.join(channels))
    frametype = cp.get(group, 'frametype')
    logger.debug("frametype = %s" % frametype)
    chunkdur = cp.getint(group, 'chunk-duration')
    logger.debug("chunkdur = %s" % chunkdur)
    segdur = cp.getint(group, 'segment-duration')
    logger.debug("segdur = %s" % segdur)
    overlap = cp.getint(group, 'overlap-duration')
    logger.debug("overlap = %s" % overlap)
    padding = int(overlap / 2)
    logger.debug("padding = %s" % padding)
    try:
        frange = tuple(map(float, cp.get(group, 'frequency-range').split()))
    except configparser.NoOptionError as e:
        try:
            flow = cp.getfloat(group, 'flow')
            fhigh = cp.getfloat(group, 'fhigh')
        except configparser.NoOptionError:
            raise e
        frange = (flow, fhigh)
    logger.debug('frequencyrange = [%s, %s)' % tuple(frange))
    try:
        sampling = cp.getfloat(group, 'sample-frequency')
    except configparser.NoOptionError:
        if len(crates) == 1:
            sampling = float(crates[0])
        elif len(crates) > 1:
            raise ValueError(
                "No sample-frequency parameter given, and multiple "
                "sample frequencies parsed from channels list, "
                "cannot continue",
            )
        else:
            sampling = None
    if sampling:
        logger.debug('samplingfrequency = %s' % sampling)

    # get state channel
    try:
        statechannel = cp.get(group, 'state-channel')
    except configparser.NoOptionError:
        statechannel = None
    else:
        try:
            statebits = list(map(
                float,
                cp.get(group, 'state-bits').split(','),
            ))
        except configparser.NoOptionError:
            statebits = [0]
        try:
            stateft = cp.get(group, 'state-frametype')
        except configparser.NoOptionError as e:
            e.args = ('%s, this must be specified if state-channel is given'
                      % str(e),)
            raise

    # get state flag (if given)
    try:
        stateflag = cp.get(group, 'state-flag')
    except configparser.NoOptionError:
        stateflag = None
    else:
        logger.debug("State flag = %s" % stateflag)
        if not statechannel:  # map state flag to state channel
            try:
                statechannel, statebits, stateft = (
                    segments.STATE_CHANNEL[stateflag])
            except KeyError as e:
                if online or args.no_segdb:  # only raise if channel required
                    e.args = ('Cannot map state flag %r to channel'
                              % stateflag,)
                    raise
                else:
                    pass
    if statechannel:
        logger.debug("State channel = %s" % statechannel)
        logger.debug("State bits = %s" % ', '.join(map(str, statebits)))
        logger.debug("State frametype = %s" % stateft)

    # parse padding for state segments
    if statechannel or stateflag:
        try:
            statepad = cp.get(group, 'state-padding')
        except configparser.NoOptionError:
            statepad = (0, 0)
        else:
            try:
                p = int(statepad)
            except ValueError:
                statepad = tuple(map(float, statepad.split(',', 1)))
            else:
                statepad = (p, p)
        logger.debug("State padding: %s" % str(statepad))

    rundir = utils.get_output_path(args)

    # convert to omicron parameters format
    oconfig = parameters.OmicronParameters.from_channel_list_config(
        cp, group, version=omicronv)
    # and validate things
    oconfig.validate()

    # -- set directories ------------------------------------------------------

    rundir.mkdir(exist_ok=True, parents=True)
    logger.info("Using run directory\n%s" % rundir)

    cachedir = rundir / "cache"
    condir = rundir / "condor"
    logdir = rundir / "logs"
    pardir = rundir / "parameters"
    trigdir = rundir / "triggers"
    for d in [cachedir, condir, logdir, pardir, trigdir]:
        d.mkdir(exist_ok=True)

    oconfig.set('OUTPUT', 'DIRECTORY', str(trigdir))

    # -- check for an existing process ----------------------------------------

    dagpath = condir / "{}.dag".format(DAG_TAG)

    # check dagman lock file
    running = condor.dag_is_running(dagpath)
    if running:
        msg = "Detected {} already running in {}".format(
            dagpath,
            rundir,
        )
        if not args.reattach:
            raise RuntimeError(msg)
        logger.info("{}, will reattach".format(msg))
    else:
        args.reattach = False

    # check dagman rescue files
    nrescue = len(list(
        condir.glob("{}.rescue[0-9][0-9][0-9]".format(dagpath.name)),
    ))
    if args.rescue and not nrescue:
        raise RuntimeError(
            "--rescue given but no rescue DAG files found for {}".format(
                dagpath,
            ),
        )
    if nrescue and not args.rescue and "force" not in args.dagman_option:
        raise RuntimeError(
            "rescue DAGs found for {} but `--rescue` not given and "
            "`--dagman-option force` not given, cannot continue".format(
                dagpath,
            ),
        )

    newdag = not args.rescue and not args.reattach
    # -- find run segment -----------------------------------------------------

    segfile = str(rundir / "segments.txt")
    keepfiles.append(segfile)

    if newdag and online:
        # get limit of available data (allowing for padding)
        end = data.get_latest_data_gps(ifo, frametype) - padding

        try:  # start from where we got to last time
            start = segments.get_last_run_segment(segfile)[1]
        except IOError:  # otherwise start with a sensible amount of data
            if args.use_dev_shm:  # process one chunk
                logger.debug("No online segment record, starting with "
                             "%s seconds" % chunkdur)
                start = end - chunkdur + padding
            else:  # process the last 4000 seconds (arbitrarily)
                logger.debug("No online segment record, starting with "
                             "4000 seconds")
                start = end - 4000
        else:
            logger.debug("Online segment record recovered")
    elif online:
        start, end = segments.get_last_run_segment(segfile)
    else:
        start, end = args.gps

    duration = end - start
    datastart = start - padding
    dataend = end + padding
    dataduration = dataend - datastart

    logger.info("Processing segment determined as")
    logger.info(" %d %d" % (datastart, dataend))
    logger.info("Duration = %d seconds" % dataduration)

    span = (start, end)

    # -- find segments and frame files ----------------------------------------

    # minimum allowed duration is one full chunk
    minduration = 1 * chunkdur

    # validate span is long enough
    if dataduration < minduration and online:
        logger.info("Segment is too short (%d < %d), please try again later"
                    % (duration, minduration))
        clean_exit(0, tempfiles)
    elif dataduration < minduration:
        raise ValueError(
            "Segment [%d, %d) is too short (%d < %d), please "
            "extend the segment, or shorten the timing parameters."
            % (start, end, duration, chunkdur - padding * 2),
        )

    # -- find run segments

    # get segments from state vector
    if (online and statechannel) or (statechannel and not stateflag) or (
            statechannel and args.no_segdb):
        logger.info("Finding segments for relevant state...")
        if statebits == "guardian":  # use guardian
            segs = segments.get_guardian_segments(
                statechannel,
                stateft,
                datastart,
                dataend,
                pad=statepad,
            )
        else:
            segs = segments.get_state_segments(
                statechannel,
                stateft,
                datastart,
                dataend,
                bits=statebits,
                pad=statepad,
            )
    # get segments from segment database
    elif stateflag:
        logger.info("Querying segments for relevant state...")
        segs = segments.query_state_segments(stateflag, datastart, dataend,
                                             pad=statepad)
    # get segments from frame availability
    else:
        segs = segments.get_frame_segments(ifo, frametype, datastart, dataend)

    # print frame segments recovered
    if len(segs):
        logger.info("State/frame segments recovered as")
        for seg in segs:
            logger.info(" %d %d [%d]" % (seg[0], seg[1], abs(seg)))
        logger.info("Duration = %d seconds" % abs(segs))

    # if running online, we want to avoid processing up to the extent of
    # available data, so that the next run doesn't get left with a segment that
    # is too short to process
    # There are a few reasons this might be
    #   - the interferometer loses lock a short time after the end of this run
    #   - a restart/other problem means that a frame is missing a short time
    #     after the end of this run
    # so, work out whether we need to truncate:
    try:
        lastseg = segs[-1]
    except IndexError:
        truncate = False
    else:
        truncate = online and newdag and lastseg[1] == dataend

    # if final segment is shorter than two chunks, remove it entirely
    # so that it gets processed next time (when it will either be a closed
    # segment, or long enough to process safely)
    if truncate and abs(lastseg) < chunkdur * 2:
        logger.info(
            "The final segment is too short, but ends at the limit of "
            "available data, presumably this is an active segment. It "
            "will be removed so that it can be processed properly later",
        )
        segs = type(segs)(segs[:-1])
        dataend = lastseg[0]
    # otherwise, we remove the final chunk (so that the next run has at
    # least that on which to operate), then truncate to an integer number
    # of chunks (so that PSD estimation operates on a consistent amount
    # of data)
    elif truncate:
        logger.info("The final segment touches the limit of available data, "
                    "the end chunk will be removed to guarantee that the next "
                    "online run has enough data over which to operate")
        t, e = lastseg
        e -= chunkdur + padding  # remove one chunk
        # now truncate to an integer number of chunks
        step = chunkdur
        while t + chunkdur <= e:
            t += step
            step = chunkdur - overlap
        segs[-1] = type(segs[-1])(lastseg[0], t)
        dataend = segs[-1][1]
        logger.info("This analysis will now run to %d" % dataend)

    # recalculate the processing segment
    dataspan = type(segs)([segments.Segment(datastart, dataend)])

    # -- find the frames

    # find frames under /dev/shm (which creates a cache of temporary files)
    if args.cache_file:
        cache = read_cache(str(args.cache_file))
    # only cache if we have state segments
    elif args.use_dev_shm and len(segs):
        cache = data.find_frames(ifo, frametype, datastart, dataend,
                                 on_gaps='warn', tmpdir=cachedir)
        # remove cached files at end of process
        tempfiles.extend(filter(lambda p: str(cachedir) in p, cache))
    # find frames using datafind
    else:
        cache = data.find_frames(ifo, frametype, datastart, dataend,
                                 on_gaps='warn')

    # if no frames for an offline run, panic
    if not online and len(cache) == 0:
        raise RuntimeError("No frames found for %s-%s" % (ifo[0], frametype))

    # work out the segments of data available
    try:
        cachesegs = (segments.cache_segments(cache) & dataspan).coalesce()
    except TypeError:  # empty cache
        cachesegs = type(dataspan)()
        alldata = False
    else:
        try:
            alldata = cachesegs[-1][1] >= dataspan[-1][1]
        except IndexError:  # no data overlapping span
            alldata = False

    # write cache of frames (only if creating a new DAG)
    cachefile = cachedir / "frames.lcf"
    keepfiles.append(cachefile)
    if newdag:
        data.write_cache(cache, cachefile)
    oconfig.set('DATA', 'FFL', str(cachefile))
    logger.info("Cache of %d frames written to\n%s" % (len(cache), cachefile))

    # restrict analysis to available data (and warn about missing data)
    if segs - cachesegs:
        logger.warning("Not all state times are available in frames")
    segs = (cachesegs & segs).coalesce()

    # apply minimum duration requirement
    segs = type(segs)(s for s in segs if abs(s) >= segdur)

    # if all of the data are available, but no analysable segments were found
    # (i.e. IFO not in right state for all times), record segments.txt
    if newdag and len(segs) == 0 and online and alldata:
        logger.info(
            "No analysable segments found, but up-to-date data are "
            "available. A segments.txt file will be written so we don't "
            "have to search these data again",
        )
        segments.write_segments(cachesegs, segfile)
        logger.info("Segments written to\n%s" % segfile)
        clean_exit(0, tempfiles)
    # otherwise not all data are available, so
    elif len(segs) == 0 and online:
        logger.info("No analysable segments found, please try again later")
        clean_exit(0, tempfiles)
    elif len(segs) == 0:
        raise RuntimeError("No analysable segments found")

    # and calculate trigger output segments
    trigsegs = type(segs)(type(s)(*s) for s in segs).contract(padding)

    # display segments
    logger.info("Final data segments selected as")
    for seg in segs:
        logger.info(" %d %d " % seg + "[%d]" % abs(seg))
    logger.info("Duration = %d seconds" % abs(segs))

    span = type(trigsegs)([trigsegs.extent()])

    logger.info("This will output triggers for")
    for seg in trigsegs:
        logger.info(" %d %d " % seg + "[%d]" % abs(seg))
    logger.info("Duration = %d seconds" % abs(trigsegs))

    # -- config omicron config directory --------------------------------------

    tempfiles.append(utils.astropy_config_path(rundir))

    # -- make parameters files then generate the DAG ---------------------------

    fileformats = oconfig.output_formats()

    # generate a 'master' parameters.txt file for archival purposes
    if not newdag:  # if not writing new dag, dump parameters.txt files to /tmp
        pardir = gettempdir()
    parfile, jobfiles = oconfig.write_distributed(
        pardir, nchannels=args.max_channels_per_job)
    logger.debug("Created master parameters file\n%s" % parfile)
    if newdag:
        keepfiles.append(parfile)

    # create dag
    dag = pipeline.CondorDAG(str(logdir / "{}.log".format(DAG_TAG)))
    dag.set_dag_file(str(dagpath.with_suffix("")))

    # set up condor commands for all jobs
    condorcmds = {
        'accounting_group': args.condor_accounting_group,
        'accounting_group_user': args.condor_accounting_group_user,
    }
    for cmd_ in args.condor_command:
        key, value = cmd_.split('=', 1)
        condorcmds[key.rstrip().lower()] = value.strip()

    # create omicron job
    reqmem = condorcmds.pop('request_memory', 1000)
    ojob = condor.OmicronProcessJob(args.universe, args.executable,
                                    subdir=condir, logdir=logdir,
                                    **condorcmds)
    ojob.add_condor_cmd('request_memory', reqmem)
    ojob.add_condor_cmd('+OmicronProcess', '"%s"' % group)

    # create post-processing job
    ppjob = condor.OmicronProcessJob(args.universe, find_executable('bash'),
                                     subdir=condir, logdir=logdir,
                                     tag='post-processing', **condorcmds)
    ppjob.add_condor_cmd('+OmicronPostProcess', '"%s"' % group)
    ppjob.add_short_opt('e', '')
    ppnodes = []
    rootmerge = find_executable('omicron-root-merge')
    hdf5merge = find_executable('omicron-hdf5-merge')
    ligolw_add = find_executable('ligolw_add')
    gzip = find_executable('gzip')

    # create node to remove files
    rmjob = condor.OmicronProcessJob(
        args.universe, str(condir / "post-process-rm.sh"),
        subdir=condir, logdir=logdir, tag='post-processing-rm', **condorcmds)
    rm = find_executable('rm')
    rmfiles = []
    rmjob.add_condor_cmd('+OmicronPostProcess', '"%s"' % group)

    if args.archive:
        archivejob = condor.OmicronProcessJob(
            args.universe, str(condir / "archive.sh"),
            subdir=condir, logdir=logdir, tag='archive', **condorcmds)
        archivejob.add_condor_cmd('+OmicronPostProcess', '"%s"' % group)
        archivefiles = {}

    # loop over data segments
    for s, e in segs:

        # build trigger segment
        ts = s + padding
        te = e - padding
        td = te - ts

        # distribute segment across multiple nodes
        nodesegs = oconfig.distribute_segment(
            s, e, nperjob=args.max_chunks_per_job)
        omicronfiles = {}

        # build node for each parameter file
        for i, pf in enumerate(jobfiles):
            chanlist = jobfiles[pf]
            nodes = []
            # loop over distributed segments
            for subseg in nodesegs:
                if not args.skip_omicron:
                    # work out files for this job
                    nodefiles = oconfig.output_files(*subseg)
                    # build node
                    node = pipeline.CondorDAGNode(ojob)
                    node.set_category('omicron')
                    node.set_retry(args.condor_retry)
                    node.add_var_arg(str(subseg[0]))
                    node.add_var_arg(str(subseg[1]))
                    node.add_file_arg(pf)
                    for chan in chanlist:
                        for form, flist in nodefiles[chan].items():
                            # record file as output from this node
                            for f in flist:
                                node._CondorDAGNode__output_files.append(f)
                            # record file as output for this channel
                            try:
                                omicronfiles[chan][form].extend(flist)
                            except KeyError:
                                try:
                                    omicronfiles[chan][form] = flist
                                except KeyError:
                                    omicronfiles[chan] = {form: flist}
                    dag.add_node(node)
                    nodes.append(node)

            # post-process (one post-processing job per channel
            # per data segment)
            if not args.skip_postprocessing:
                script = condir / "post-process-{}-{}-{}.sh".format(i, s, e)
                ppnode = pipeline.CondorDAGNode(ppjob)
                ppnode.add_var_arg(str(script))
                operations = []

                # build post-processing nodes for each channel
                for c in chanlist:
                    operations.append('\n# %s' % c)
                    chandir = trigdir / c
                    # work out filenames for coalesced files
                    archpath = Path(io.get_archive_filename(
                        c,
                        ts,
                        td,
                        filetag=afiletag,
                        ext='root',
                    ))
                    mergepath = chandir / archpath.name
                    target = str(archpath.parent)

                    # add ROOT operations
                    if 'root' in fileformats:
                        rootfiles = ' '.join(omicronfiles[c]['root'])
                        for f in omicronfiles[c]['root']:
                            ppnode._CondorDAGNode__input_files.append(f)
                        if args.skip_root_merge or (
                                len(omicronfiles[c]['root']) == 1):
                            root = rootfiles
                        else:
                            root = str(mergepath)
                            operations.append('%s %s %s --strict' % (
                                rootmerge, rootfiles, root))
                            rmfiles.append(rootfiles)
                        ppnode._CondorDAGNode__output_files.append(root)
                        if args.archive:
                            try:
                                archivefiles[target].append(root)
                            except KeyError:
                                archivefiles[target] = [root]
                            rmfiles.append(root)

                    # add HDF5 operations
                    if 'hdf5' in fileformats:
                        hdf5files = ' '.join(omicronfiles[c]['hdf5'])
                        for f in omicronfiles[c]['hdf5']:
                            ppnode._CondorDAGNode__input_files.append(f)
                        if args.skip_hdf5_merge or (
                                len(omicronfiles[c]['hdf5']) == 1):
                            hdf5 = hdf5files
                        else:
                            hdf5 = str(mergepath.with_suffix(".h5"))
                            operations.append(
                                '{cmd} {infiles} {outfile}'.format(
                                    cmd=hdf5merge,
                                    infiles=hdf5files,
                                    outfile=hdf5,
                                ),
                            )
                            rmfiles.append(hdf5files)
                        ppnode._CondorDAGNode__output_files.append(hdf5)
                        if args.archive:
                            try:
                                archivefiles[target].append(hdf5)
                            except KeyError:
                                archivefiles[target] = [hdf5]
                            rmfiles.append(hdf5)

                    # add LIGO_LW operations
                    if 'xml' in fileformats:
                        xmlfiles = ' '.join(omicronfiles[c]['xml'])
                        for f in omicronfiles[c]['xml']:
                            ppnode._CondorDAGNode__input_files.append(f)
                        if (args.skip_ligolw_add
                                or len(omicronfiles[c]['xml']) == 1):
                            xml = xmlfiles
                        else:
                            xml = str(mergepath.with_suffix(".xml"))
                            operations.append(
                                '%s %s --ilwdchar-compat --output %s' % (
                                    ligolw_add, xmlfiles, xml),
                            )
                            rmfiles.append(xmlfiles)
                        ppnode._CondorDAGNode__output_files.append(xml)
                        if not args.skip_gzip:
                            operations.append(
                                '%s --force --stdout %s > %s.gz' % (
                                    gzip, xml, xml))
                            rmfiles.append(xml)
                            xml = str(mergepath.with_suffix(".xml.gz"))
                            ppnode._CondorDAGNode__output_files.append(xml)
                        if args.archive:
                            try:
                                archivefiles[target].append(xml)
                            except KeyError:
                                archivefiles[target] = [xml]
                            rmfiles.append(xml)

                    # add ASCII operations
                    if 'txt' in fileformats:
                        txtfiles = ' '.join(omicronfiles[c]['txt'])
                        for f in omicronfiles[c]['txt']:
                            ppnode._CondorDAGNode__input_files.append(f)
                        if args.archive:
                            try:
                                archivefiles[target].append(txtfiles)
                            except KeyError:
                                archivefiles[target] = [txtfiles]
                            rmfiles.append(txtfiles)

                ppnode.set_category('postprocessing')
                ppnode.set_retry(str(args.condor_retry))
                if not args.skip_omicron:
                    for node in nodes:
                        ppnode.add_parent(node)
                dag.add_node(ppnode)
                ppnodes.append(ppnode)
                tempfiles.append(script)

                # write post-processing file
                if not args.rescue:
                    with script.open("w") as f:
                        # add header
                        print('#!/bin/bash -e\n#', file=f)
                        print("# omicron-process post-processing", file=f)
                        print(
                            '#\n# File created by\n# {}\n#'.format(
                                ' '.join(sys.argv),
                            ),
                            file=f,
                        )
                        print("# Group: %s" % group, file=f)
                        print("# Segment: [%d, %d)" % (s, e), file=f)
                        print("# Channels:\n#", file=f)
                        for c in chanlist:
                            print('# %s' % c, file=f)
                        # add post-processing operations
                        print('\n'.join(operations), file=f)
                    if newdag:
                        script.chmod(0o755)

    # set 'strict' option for Omicron
    # this is done after the nodes are written so that 'strict' is last in
    # the call
    ojob.add_arg('strict')

    # do all archiving last, once all post-processing has completed
    if args.archive:
        archivenode = pipeline.CondorDAGNode(archivejob)
        acache = {fmt: list() for fmt in fileformats}
        if newdag:
            # write shell script to seed archive
            with open(archivejob.get_executable(), 'w') as f:
                print('#!/bin/bash -e\n', file=f)
                for gpsdir, filelist in archivefiles.items():
                    for fn in filelist:
                        archivenode._CondorDAGNode__input_files.append(fn)
                    # write 'cp' op to script
                    print("mkdir -p %s" % gpsdir, file=f)
                    print("cp %s %s" % (' '.join(filelist), gpsdir), file=f)
                    # record archived files in caches
                    filenames = [
                        str(Path(gpsdir) / x.name) for x in map(Path, filelist)
                    ]
                    for fn in filenames:
                        archivenode._CondorDAGNode__output_files.append(fn)
                    for fmt, extensions in {
                            'xml': ('.xml.gz', '.xml'),
                            'root': '.root',
                            'hdf5': '.h5',
                            'txt': '.txt',
                    }.items():
                        try:
                            acache[fmt].extend(
                                filter(lambda x: x.endswith(extensions),
                                       filenames))
                        except KeyError:  # file format not used
                            continue
            os.chmod(archivejob.get_executable(), 0o755)
            # write caches to disk
            for fmt, fcache in acache.items():
                cachefile = cachedir / "omicron-{0}.lcf".format(fmt)
                data.write_cache(fcache, cachefile)
                logger.debug("{0} cache written to {1}".format(fmt, cachefile))
        # add node to DAG
        for node in ppnodes:
            archivenode.add_parent(node)
        archivenode.set_retry(args.condor_retry)
        archivenode.set_category('archive')
        dag.add_node(archivenode)
        tempfiles.append(archivejob.get_executable())

    # add rm job right at the end
    rmnode = pipeline.CondorDAGNode(rmjob)
    rmscript = rmjob.get_executable()
    with open(rmscript, 'w') as f:
        print('#!/bin/bash -e\n#', file=f)
        print("# omicron-process post-processing-rm", file=f)
        print('#\n# File created by\n# %s\n#' % ' '.join(sys.argv), file=f)
        print("# Group: %s" % group, file=f)
        print("# Segment: [%d, %d)" % (s, e), file=f)
        print("# Channels:\n#", file=f)
        for c in channels:
            print('# %s' % c, file=f)
        print('', file=f)
        for rmset in rmfiles:
            print('%s -f %s' % (rm, rmset), file=f)
    if newdag:
        os.chmod(rmscript, 0o755)
    tempfiles.append(rmscript)
    rmnode.set_category('postprocessing')
    if args.archive:
        # run this after archiving
        rmnode.add_parent(archivenode)
    else:
        # or just after post-processing if not archiving
        for node in ppnodes:
            rmnode.add_parent(node)
    dag.add_node(rmnode)

    # print DAG to file
    dagfile = Path(dag.get_dag_file()).resolve(strict=False)
    if args.rescue:
        logger.info(
            "In --rescue mode, this DAG has been reproduced in memory "
            "for safety, but will not be written to disk, the file is:",
        )
    elif newdag:
        dag.write_sub_files()
        dag.write_dag()
        dag.write_script()
        with open(dagfile, 'a') as f:
            print("DOT", dagfile.with_suffix(".dot"), file=f)
    logger.info("Dag with %d nodes written to" % len(dag.get_nodes()))
    print(dagfile)

    if args.no_submit:
        if newdag:
            segments.write_segments(span, segfile)
            logger.info("Segments written to\n%s" % segfile)
        sys.exit(0)

    # -- submit the DAG and babysit --------------------------------------------

    # submit DAG
    if args.rescue:
        logger.info("--- Submitting rescue DAG to condor ----")
    elif args.reattach:
        logger.info("--- Reattaching to existing DAG --------")
    else:
        logger.info("--- Submitting DAG to condor -----------")

    for i in range(args.submit_rescue_dag + 1):
        if args.reattach:  # find ID of existing DAG
            dagid = int(condor.find_job(
                Owner=getuser(), OmicronDAGMan=group)['ClusterId'])
            logger.info("Found existing condor ID = %d" % dagid)
        else:  # or submit DAG
            dagmanargs = set()
            if online:
                dagmanopts = {'-append': '+OmicronDAGMan=\"%s\"' % group}
            else:
                dagmanopts = {}
            for x in args.dagman_option:
                x = '-%s' % x
                try:
                    key, val = x.split('=', 1)
                except ValueError:
                    dagmanargs.add(x)
                else:
                    dagmanopts[key] = val
            dagid = condor.submit_dag(
                str(dagfile),
                *list(dagmanargs),
                **dagmanopts,
            )
            logger.info("Condor ID = %d" % dagid)
            # write segments now -- this means that online processing will
            # _always_ move on even if the workflow fails
            if i == 0:
                segments.write_segments(span, segfile)
                logger.info("Segments written to\n%s" % segfile)
            if 'force' in args.dagman_option:
                args.dagman_option.pop(args.dagman_option.index('force'))

        # monitor the dag
        logger.debug("----------------------------------------")
        logger.info("Monitoring DAG:")
        check_call([
            "pycondor",
            "monitor",
            "--time", "5",
            "--length", "36",
            str(dagfile),
        ])
        print()
        logger.debug("----------------------------------------")
        sleep(5)
        try:
            stat = condor.get_dag_status(dagid)
        except OSError as exc:  # query failed
            logger.warning(str(exc))
            stat = {}
        # log exitcode
        if "exitcode" not in stat:
            logger.warning("DAG has exited, status unknown")
            break
        if not stat["exitcode"]:
            logger.info("DAG has exited with status {}".format(
                stat.get("exitcode", "unknown"),
            ))
            break
        logger.critical(
            "DAG has exited with status {}".format(stat['exitcode']),
        )
        # handle failure
        if i == args.submit_rescue_dag:
            raise RuntimeError("DAG has failed to complete %d times"
                               % (args.submit_rescue_dag + 1))
        else:
            rescue = condor.find_rescue_dag(str(dagfile))
            logger.warning("Rescue DAG %s was generated" % rescue)

    # mark output and error files of condor nodes that passed to be deleted
    try:
        for node, files in condor.get_out_err_files(dagid, exitcode=0).items():
            tempfiles.extend(files)
    except RuntimeError:
        pass

    # archive files
    stub = '%d-%d' % (start, end)
    for f in map(Path, ["{}.dagman.out".format(dagfile)] + keepfiles):
        archive = logdir / "{0[0]}.{1}.{0[1]}".format(
            f.name.split(".", 1),
            stub,
        )
        if str(f) == str(segfile):
            shutil.copyfile(f, archive)
        else:
            f.rename(archive)
        logger.debug("Archived path\n{} --> {}".format(f, archive))

    # clean up temporary files
    tempfiles.extend(trigdir.glob("ffconvert.*.ffl"))
    clean_tempfiles(tempfiles)

    # and exit
    logger.info("--- Processing complete ----------------")
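
# Example invocation (illustrative; assumes this main() is exposed as the
# `omicron-process` console script, with the channel group as the positional
# argument -- the options shown correspond to arguments referenced above, and
# the group name and GPS times are made up for the example):
#
#   omicron-process GW --ifo L1 --config-file omicron.ini \
#       --gps 1187000000 1187008192 --no-submit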