def make_meas_likelihood_fragment(dag, parents, tag, files_per_meas_likelihood = None):
    if files_per_meas_likelihood is None:
        files_per_meas_likelihood = meas_likelihoodjob.files_per_meas_likelihood
    nodes = set()
    input_cache = power.collect_output_caches(parents)
    while input_cache:
        node = MeasLikelihoodNode(meas_likelihoodjob)
        node.add_input_cache([cache_entry for cache_entry, parent in input_cache[:files_per_meas_likelihood]])
        for parent in set(parent for cache_entry, parent in input_cache[:files_per_meas_likelihood]):
            node.add_parent(parent)
        del input_cache[:files_per_meas_likelihood]
        seg = power.cache_span(node.get_input_cache())
        node.set_name("lalapps_string_meas_likelihood_%s_%d_%d" % (tag, int(seg[0]), int(abs(seg))))
        node.set_output(tag)
        dag.add_node(node)
        nodes.add(node)
    return nodes

def make_run_sqlite_fragment(dag, parents, tag, sql_file, files_per_run_sqlite = None):
    if files_per_run_sqlite is None:
        files_per_run_sqlite = runsqlitejob.files_per_run_sqlite
    nodes = set()
    input_cache = power.collect_output_caches(parents)
    while input_cache:
        node = RunSqliteNode(runsqlitejob)
        node.set_sql_file(sql_file)
        node.add_input_cache([cache_entry for cache_entry, parent in input_cache[:files_per_run_sqlite]])
        for parent in set(parent for cache_entry, parent in input_cache[:files_per_run_sqlite]):
            node.add_parent(parent)
        del input_cache[:files_per_run_sqlite]
        seg = power.cache_span(node.get_output_cache())
        node.set_name("lalapps_run_sqlite_%s_%d_%d" % (tag, int(seg[0]), int(abs(seg))))
        dag.add_node(node)
        nodes.add(node)
    return nodes

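# Note on the node-name suffixes built by these fragment builders:
# power.cache_span() returns a single segment spanning a list of cache
# entries, and in the segments API used elsewhere in this code abs(segment)
# is its duration, so node names end in "<GPS start>_<duration>".  The
# helper below is a hypothetical, standalone illustration with made-up
# numbers; it is not used by the pipeline.
def _node_name_suffix_demo():
    span = segments.segment(1000000000, 1000004096)
    # -> "lalapps_run_sqlite_TAG_1000000000_4096"
    return "lalapps_run_sqlite_%s_%d_%d" % ("TAG", int(span[0]), int(abs(span)))
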
def make_calc_likelihood_fragment(dag, parents, likelihood_parents, tag, files_per_calc_likelihood = None, verbose = False):
    if files_per_calc_likelihood is None:
        files_per_calc_likelihood = calc_likelihoodjob.files_per_calc_likelihood
    input_cache = power.collect_output_caches(parents)
    likelihood_cache = power.collect_output_caches(likelihood_parents)
    nodes = set()
    while input_cache:
        node = CalcLikelihoodNode(calc_likelihoodjob)
        node.add_input_cache([cache_entry for cache_entry, parent in input_cache[:files_per_calc_likelihood]])
        for parent in set(parent for cache_entry, parent in input_cache[:files_per_calc_likelihood]):
            node.add_parent(parent)
        del input_cache[:files_per_calc_likelihood]
        seg = power.cache_span(node.get_input_cache())
        node.set_name("lalapps_string_calc_likelihood_%s_%d_%d" % (tag, int(seg[0]), int(abs(seg))))
        for cache_entry, parent in likelihood_cache:
            node.add_parent(parent)
            node.add_likelihood_cache([cache_entry])
        dag.add_node(node)
        nodes.add(node)
    return nodes

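# Illustrative sketch (not part of the pipeline):  the three fragment
# builders above share one idiom.  power.collect_output_caches() returns a
# list of (cache_entry, parent) pairs, and each builder consumes that list
# in slices of at most files_per_* entries per node, deduplicating the
# parent jobs with set() before declaring DAG dependencies.  The helper and
# names below are hypothetical and exist only to demonstrate that batching
# pattern.
def _batch_cache_demo(pairs, files_per_node):
    """
    Yield (cache_entries, parents) batches of at most files_per_node
    entries, consuming the input list the same way the builders above
    consume input_cache.
    """
    pairs = list(pairs)
    while pairs:
        batch = pairs[:files_per_node]
        del pairs[:files_per_node]
        # one node's input files, plus the unique parent jobs it must wait for
        yield [cache_entry for cache_entry, parent in batch], set(parent for cache_entry, parent in batch)

# e.g. five files produced by two parent jobs, two files per node, gives
# batches of sizes 2, 2 and 1:
#     list(_batch_cache_demo([("f%d" % i, "job%d" % (i % 2)) for i in range(5)], 2))
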
def make_string_segment_fragment(dag, datafindnodes, instrument, seg, tag, min_segment_length, pad, overlap, short_segment_duration, max_job_length, binjnodes = set(), verbose = False):
    """
    Construct a DAG fragment for an entire segment, splitting the
    segment into multiple trigger generator jobs.
    """
    # figure out which binj nodes, if any, produce output for this job
    binjnodes = set(node for node in binjnodes if power.cache_span(node.get_output_cache()).intersects(seg))

    # only one frame cache file can be provided as input, and only one
    # injection description file can be provided as input.
    # the unpacking indirectly tests that the file count is correct
    [framecache] = [node.get_output() for node in datafindnodes]
    if binjnodes:
        [simfile] = [cache_entry.path for node in binjnodes for cache_entry in node.get_output_cache()]
        injargs = {"injection-file": simfile}
    else:
        injargs = {}

    seglist = split_segment(seg, min_segment_length, pad, overlap, short_segment_duration, max_job_length)
    if verbose:
        print("Segment split: " + str(seglist), file=sys.stderr)

    nodes = set()
    for seg in seglist:
        nodes |= make_string_fragment(dag, datafindnodes | binjnodes, instrument, seg, tag, framecache, injargs = injargs)
    return nodes

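# Note on the unpacking idiom above:  "[framecache] = [...]" assigns the
# single element of the list and doubles as the sanity check described in
# the comment, because Python raises ValueError unless the list holds
# exactly one item.  The helper below is a hypothetical, standalone
# illustration of that behaviour; it is not used by the pipeline.
def _require_exactly_one(values):
    """Return the sole element of values, raising ValueError otherwise."""
    [value] = values
    return value

# _require_exactly_one(["H1.cache"])              returns "H1.cache"
# _require_exactly_one([])                        raises ValueError
# _require_exactly_one(["H1.cache", "L1.cache"])  raises ValueError
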
def make_coinc_branch(dag, datafinds, seglists, time_slides, min_segment_length, pad, overlap, short_segment_duration, tag, vetoes_cache=set(), do_injections=False, injections_offset=0.0, verbose=False):
    #
    # injection job
    #

    binjnodes = set()
    if do_injections:
        # don't know what to do with more than one list of offset
        # vectors
        assert len(time_slides) == 1

        # get the largest injection offset's magnitude
        maxoffset = max(abs(offset) for offsetvectorlist in time_slides.values() for offsetvector in offsetvectorlist for offset in offsetvector.values())

        # to save disk space and speed the dag along we don't
        # generate a single injection list for the entire analysis
        # run, instead a separate list is constructed for each
        # block of data to be analyzed.  we need to be careful that
        # two nearby injection lists don't contain injections for
        # the same time, so we protract the segments by the time
        # step and coalesce so that only gaps between segments
        # larger than twice the time step result in separate files
        # being generated.  we could allow smaller gaps to survive,
        # but this way we don't have to worry about it.

        # injections_offset is a number between 0 and 1 in units of
        # the period between injections
        for seg in seglists.union(seglists).protract(power.binjjob.time_step + maxoffset).coalesce().contract(power.binjjob.time_step + maxoffset):
            binjnodes |= power.make_binj_fragment(dag, seg.protract(maxoffset), list(time_slides.keys())[0], tag, offset=injections_offset)

        # artificial parent-child relationship to induce dagman to
        # submit binj jobs as the corresponding datafinds complete
        # instead of submitting all of one kind before any of the next.
        # makes dag run faster because it allows string search jobs to
        # start moving onto the cluster without waiting for all the
        # datafinds and/or all the binjs to complete
        for datafindnode in datafinds:
            seg = segments.segment(datafindnode.get_start(), datafindnode.get_end())
            for binjnode in binjnodes:
                if seg.intersects(power.cache_span(binjnode.get_output_cache())):
                    binjnode.add_parent(datafindnode)

    #
    # trigger generator jobs
    #

    # set max job length to ~3600 s (will be clipped to an allowed
    # size)
    trigger_nodes = cosmicstring.make_single_instrument_stage(dag, datafinds, seglists, tag, min_segment_length, pad, overlap, short_segment_duration, max_job_length=3600, binjnodes=binjnodes, verbose=verbose)

    #
    # coincidence analysis
    #

    coinc_nodes = []
    for n, (time_slides_cache_entry, these_time_slides) in enumerate(time_slides.items()):
        if verbose:
            print("%s %d/%d (%s):" % (tag, n + 1, len(time_slides), time_slides_cache_entry.path), file=sys.stderr)
        coinc_nodes.append(set())

        #
        # lalapps_cafe & ligolw_add
        #

        tisi_cache = set([time_slides_cache_entry])
        lladd_nodes = set()
        for segnum, (seg, parents, cache, clipseg) in enumerate(power.group_coinc_parents(trigger_nodes, these_time_slides, extentlimit=150000000.0 / (len(these_time_slides) or 1), verbose=verbose)):
            binj_cache = set(cache_entry for node in binjnodes for cache_entry in node.get_output_cache() if cache_entry.segment.intersects(seg))
            # otherwise too many copies of the offset vector
            # will be fed into burca
            assert len(binj_cache) < 2
            if do_injections:
                # lalapps_binj has already copied the time
                # slide document into its own output
                extra_input_cache = vetoes_cache
            else:
                # ligolw_add needs to copy the time slide
                # document into its output
                extra_input_cache = tisi_cache | vetoes_cache
            these_lladd_nodes = power.make_lladd_fragment(dag, parents | binjnodes, "%s_%d_%x" % (tag, n, segnum), segment=seg, input_cache=cache | binj_cache | segments_cache, extra_input_cache=extra_input_cache, remove_input=do_injections and clipseg is not None, preserve_cache=binj_cache | segments_cache | tisi_cache | vetoes_cache)
            if clipseg is not None:
                #
                # this is a fragment of a too-large burca
                # job, construct it specially and add the
                # command-line option needed to clip the
                # output
                #

                assert len(these_lladd_nodes) == 1
                coinc_nodes[-1] |= power.make_burca_fragment(dag, these_lladd_nodes, "%s_%d" % (tag, n), coincidence_segments=segments.segmentlist([clipseg]), verbose=verbose)
            else:
                #
                # this is not a fragment of a too-large
                # burca job, add it to the pool of files to
                # be processed by the burcas that don't
                # require special clipping command line
                # options
                #

                lladd_nodes |= these_lladd_nodes

        #
        # lalapps_burca pool.  these are the burca jobs that don't
        # require special clipping command line options, and so can
        # bulk-process many files with each job
        #

        if verbose:
            print("building burca jobs ...", file=sys.stderr)
        coinc_nodes[-1] |= power.make_burca_fragment(dag, lladd_nodes, "%s_%d" % (tag, n), verbose=verbose)
        if verbose:
            print("done %s %d/%d" % (tag, n + 1, len(time_slides)), file=sys.stderr)

    #
    # lalapps_binjfind
    #

    if do_injections:
        if verbose:
            print("building binjfind jobs ...", file=sys.stderr)
        coinc_nodes = [power.make_binjfind_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n), verbose=verbose) for n, these_coinc_nodes in enumerate(coinc_nodes)]

    #
    # ligolw_sqlite and lalapps_run_sqlite
    #

    if verbose:
        print("building sqlite jobs ...", file=sys.stderr)
    coinc_nodes = [power.make_sqlite_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n), verbose=verbose) for n, these_coinc_nodes in enumerate(coinc_nodes)]
    coinc_nodes = [cosmicstring.make_run_sqlite_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n), clipsegments_sql_filename) for n, these_coinc_nodes in enumerate(coinc_nodes)]

    #
    # lalapps_string_meas_likelihood
    #

    if verbose:
        print("building lalapps_string_meas_likelihood jobs ...", file=sys.stderr)
    likelihood_nodes = [cosmicstring.make_meas_likelihood_fragment(dag, these_coinc_nodes, "%s_%d" % (tag, n)) for n, these_coinc_nodes in enumerate(coinc_nodes)]

    #
    # write output cache
    #

    if verbose:
        print("writing output cache ...", file=sys.stderr)
    for n, (these_coinc_nodes, these_likelihood_nodes) in enumerate(zip(coinc_nodes, likelihood_nodes)):
        power.write_output_cache(these_coinc_nodes | these_likelihood_nodes, "%s_%s_output.cache" % (os.path.splitext(dag.get_dag_file())[0], "%s_%d" % (tag, n)))

    #
    # done
    #

    return coinc_nodes, likelihood_nodes

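# Illustrative sketch of the gap-merging trick described in the comment
# inside make_coinc_branch():  protracting each segment by the injection
# time step, coalescing, then contracting by the same amount merges any
# pair of segments separated by less than twice the time step, so nearby
# data blocks share a single injection list.  This sketch assumes the same
# segments module (ligo.segments API) already used above; the numbers are
# made up and the helper is not part of the pipeline.
def _merge_small_gaps_demo():
    time_step = 20.0  # hypothetical injection period
    seglist = segments.segmentlist([
        segments.segment(0, 100),
        segments.segment(110, 200),   # 10 s gap < 2 * time_step: merged
        segments.segment(300, 400),   # 100 s gap > 2 * time_step: kept separate
    ])
    merged = seglist.protract(time_step).coalesce().contract(time_step)
    # merged == [segment(0, 200), segment(300, 400)]
    return merged
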
def thinca_coinc(ifo_list, single_data_analyzed, cafe_caches, cafe_base, lladd_job, tisi_file_name, lladd_veto_file, coinc_job, dag, do_coinc, do_insp, usertag=None, inspinjNode=None):
    """
    Run thinca on the coincident times from each of the sets of IFOs.
    Since the way we treat all this data is the same, this function is
    the same for all.

    ifo_list = a list of the ifos we are to analyze
    single_data_analyzed = dictionary of single ifo data analyzed
    cafe_caches = the caches from ligolw_cafe.ligolw_cafe()
    cafe_base = the base name for the cafe caches
    lladd_job = the condor job to do ligolw_add
    tisi_file_name = the name of the tisi file to add
    lladd_veto_file = the name of the veto file to add or None
    coinc_job = the condor job to do thinca
    dag = the DAG to attach the nodes to
    do_coinc = whether we should add the thinca jobs to the dag
    do_insp = whether previous inspiral jobs are in the dag
    usertag = the usertag to add to the output file name
    inspinjNode = the inspinj node to be added as a parent to ligolw_add jobs
    """
    # create caches using ligolw_cafe
    cache_names = ligolw_cafe.write_caches(cafe_base, cafe_caches, set(ifo_list))

    coinc_analyzed = []

    # loop over caches
    for idx in range(len(cafe_caches)):
        if len(cafe_caches[idx].objects):
            cache = cafe_caches[idx]
            cachename = cache_names[idx]
            thincabase = cafe_base.split('.')[0].replace('CAFE_', '')
            ifos = set(cache_entry.observatory for cache_entry in cache.objects)

            # extract segment information
            seg = power.cache_span(cache.objects)
            seg = pipeline.AnalysisChunk(seg[0], seg[1])

            # create node for ligolw_add to create xml file
            lladd = pipeline.LigolwAddNode(lladd_job)

            # add the tisi and veto files
            lladd.add_file_arg(tisi_file_name)
            if lladd_veto_file:
                lladd.add_file_arg(lladd_veto_file)

            # add the input xml files from the cafe cache
            cachefp = open(cachename, 'r')
            cacheobject = lal.Cache().fromfile(cachefp)
            cachefp.close()
            cachepfns = cacheobject.pfnlist()
            for pfn in cachepfns:
                lladd.add_file_arg(pfn)

            # create node for ligolw_thinca to analyze xml file
            thinca = inspiral.ThincaNode(coinc_job)
            thinca.set_start(seg.start(), pass_to_command_line=False)
            thinca.set_end(seg.end(), pass_to_command_line=False)
            thinca.set_zip_output(True)
            if usertag:
                thinca.set_user_tag(thincabase, pass_to_command_line=False)

            # check if caches are adjacent
            coinc_end_time_segment = ''
            if idx and (cache.extent[0] == cafe_caches[idx - 1].extent[1]):
                coinc_end_time_segment += str(cache.extent[0])
            coinc_end_time_segment += ':'
            if idx + 1 - len(cafe_caches) and (cache.extent[1] == cafe_caches[idx + 1].extent[0]):
                coinc_end_time_segment += str(cache.extent[1])
            thinca.add_var_opt('coinc-end-time-segment', coinc_end_time_segment)

            # scroll through ifos, adding the appropriate ones
            for ifo in ifo_list:
                if ifo in ifos:
                    thinca.set_ifo(ifo, pass_to_command_line=False)

            # add all inspiral jobs in this cache to input
            if do_insp:
                for cache_entry in cache.objects:
                    lladd.add_parent(single_data_analyzed[cache_entry])

            # add inspinj job as parent of each ligolw_add job
            if inspinjNode and opts.inspinj:
                lladd.add_parent(inspinjNode)

            # set output of ligolw_add jobs to follow thinca's convention
            lladd_outfile = re.sub('THINCA', 'LLWADD', thinca.get_output())
            lladd.set_output(lladd_outfile)
            thinca.set_input(lladd.get_output(), pass_to_command_line=False)
            thinca.add_file_arg(lladd.get_output())

            # check for condor settings
            if not opts.disable_dag_categories:
                lladd.set_category('ligolw_add')
                thinca.set_category('thinca')
            if not opts.disable_dag_priorities:
                lladd.set_priority(3)
                thinca.set_priority(3)

            # add ligolw_add and ligolw_thinca nodes to dag
            if do_coinc:
                dag.add_node(lladd)
                thinca.add_parent(lladd)
                dag.add_node(thinca)

            # add ligolw_thinca coincident segment
            coinc_analyzed.append(AnalyzedIFOData(seg, thinca))

    return coinc_analyzed

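# Standalone sketch of the "check if caches are adjacent" logic in
# thinca_coinc() above:  the coinc-end-time-segment value is a "start:end"
# range that is only closed on a side where this cafe cache abuts its
# neighbour, so coincs at shared boundaries are not double-counted.  The
# helper is hypothetical and not part of the pipeline; extents are
# (start, end) pairs standing in for the cache.extent segments.
def _coinc_end_time_segment_demo(extents, idx):
    """Build the coinc-end-time-segment string for the cache at extents[idx]."""
    boundary = ''
    # closed lower bound only if the previous cache ends where this one starts
    if idx and extents[idx][0] == extents[idx - 1][1]:
        boundary += str(extents[idx][0])
    boundary += ':'
    # closed upper bound only if the next cache starts where this one ends
    if idx + 1 < len(extents) and extents[idx][1] == extents[idx + 1][0]:
        boundary += str(extents[idx][1])
    return boundary

# e.g. extents = [(0, 100), (100, 200), (250, 300)]
#   _coinc_end_time_segment_demo(extents, 1) -> "100:"   (adjacent below only)
#   _coinc_end_time_segment_demo(extents, 0) -> ":100"   (adjacent above only)
#   _coinc_end_time_segment_demo(extents, 2) -> ":"      (isolated)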