def setup_background_bins_inj(workflow, coinc_files, background_file,
                              bank_file, out_dir, tags=None):
    tags = [] if tags is None else tags

    bins_exe = PyCBCDistributeBackgroundBins(workflow.cp,
                                             'distribute_background_bins',
                                             ifos=workflow.ifos,
                                             tags=tags, out_dir=out_dir)
    statmap_exe = PyCBCStatMapInjExecutable(workflow.cp, 'statmap_inj',
                                            ifos=workflow.ifos,
                                            tags=tags, out_dir=out_dir)
    cstat_exe = PyCBCCombineStatmap(workflow.cp, 'combine_statmap',
                                    ifos=workflow.ifos,
                                    tags=tags, out_dir=out_dir)

    background_bins = workflow.cp.get_opt_tags('workflow-coincidence',
                                               'background-bins',
                                               tags).split(' ')
    background_bins = [x for x in background_bins if x != '']

    for inj_type in ['injinj', 'injfull', 'fullinj']:
        bins_node = bins_exe.create_node(FileList(coinc_files[inj_type]),
                                         bank_file, background_bins,
                                         tags=tags + [inj_type])
        workflow += bins_node
        coinc_files[inj_type] = bins_node.output_files

    statmap_files = FileList([])
    for i in range(len(background_bins)):
        statnode = statmap_exe.create_node(
            FileList([coinc_files['injinj'][i]]),
            FileList([background_file[i]]),
            FileList([coinc_files['injfull'][i]]),
            FileList([coinc_files['fullinj'][i]]),
            tags=tags + ['BIN_%s' % i])
        workflow += statnode
        statmap_files.append(statnode.output_files[0])

    cstat_node = cstat_exe.create_node(statmap_files, tags=tags)
    workflow += cstat_node

    return cstat_node.output_files[0]
def create_node(self, trig_files=None, segment_dir=None, out_tags=[],
                tags=[]):
    node = Node(self)

    if not trig_files:
        raise ValueError("%s must be supplied with trigger files"
                         % self.name)

    # Data options
    pad_data = self.cp.get('inspiral', 'pad-data')
    if pad_data is None:
        raise ValueError("The option pad-data is a required option of "
                         "%s. Please check the ini file." % self.name)

    num_trials = int(self.cp.get("trig_combiner", "num-trials"))
    trig_name = self.cp.get('workflow', 'trigger-name')
    node.add_opt('--grb-name', trig_name)

    node.add_opt('--pad-data', pad_data)
    node.add_opt('--segment-length', self.cp.get('inspiral',
                                                 'segment-duration'))
    node.add_opt('--ifo-tag', self.ifos)
    node.add_opt('--user-tag', 'INSPIRAL')

    # Set input / output options
    node.add_input_list_opt('--input-files', trig_files)
    node.add_opt('--segment-dir', segment_dir)
    node.add_opt('--output-dir', self.out_dir)

    out_files = FileList([])
    for out_tag in out_tags:
        out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment,
                        directory=self.out_dir, extension='xml.gz',
                        tags=["GRB%s" % trig_name, out_tag],
                        store_file=self.retain_files)
        #out_file.PFN(out_file.cache_entry.path, site="local")
        out_files.append(out_file)

    for trial in range(1, num_trials + 1):
        out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment,
                        directory=self.out_dir, extension='xml.gz',
                        tags=["GRB%s" % trig_name, "OFFTRIAL_%d" % trial],
                        store_file=self.retain_files)
        #out_file.PFN(out_file.cache_entry.path, site="local")
        out_files.append(out_file)

    node.add_profile('condor', 'request_cpus', self.num_threads)

    return node, out_files
def create_node(self, parent, inj_trigs, inj_string, max_inc, segment):
    node = Node(self)
    trig_name = self.cp.get("workflow", "trigger-name")
    node.add_opt("--inj-string", inj_string)
    node.add_opt("--max-inclination", max_inc)
    node.add_opt("--inj-cache", "%s" % parent.storage_path)

    out_files = FileList([])
    for inj_trig in inj_trigs:
        out_string = inj_string.split(max_inc)[0]
        out_file_tag = [out_string, "FILTERED", max_inc,
                        inj_trig.tag_str.rsplit("_", 1)[-1]]
        out_file = File(self.ifos, inj_trig.description,
                        inj_trig.segment, extension="xml",
                        directory=self.out_dir, tags=out_file_tag)
        out_file.PFN(out_file.cache_entry.path, site="local")
        out_files.append(out_file)

    node.add_opt("--output-dir", self.out_dir)

    return node, out_files
def convert_cachelist_to_filelist(datafindcache_list):
    """
    Take as input a list of glue.lal.Cache objects and return a pycbc FileList
    containing all frames within those caches.

    Parameters
    ----------
    datafindcache_list : list of glue.lal.Cache objects
        The list of cache files to convert.

    Returns
    -------
    datafind_filelist : FileList of frame File objects
        The list of frame files.
    """
    datafind_filelist = FileList([])
    prev_file = None
    for cache in datafindcache_list:
        curr_ifo = cache.ifo
        for frame in cache:
            # Don't add a new workflow file entry for this frame if it is a
            # duplicate. These are assumed to be returned in time order.
            if prev_file and prev_file.cache_entry.url == frame.url:
                continue

            currFile = File(curr_ifo, frame.description, frame.segment,
                            file_url=frame.url, use_tmp_subdirs=True)
            currFile.PFN(frame.path, site='local')
            datafind_filelist.append(currFile)
            prev_file = currFile
    return datafind_filelist
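A minimal usage sketch for the converter above, assuming a LAL-format cache file already exists on disk and that, as the datafind queries in this module do, an `ifo` attribute has been attached to each cache before conversion. The file path is hypothetical.

from glue.lal import Cache

# Read a LAL cache file (hypothetical path) and label it with its ifo,
# since the converter reads cache.ifo for each cache it is given.
with open('H1-DATAFIND.lcf') as cache_fp:
    h1_cache = Cache.fromfile(cache_fp)
h1_cache.ifo = 'H1'

frame_files = convert_cachelist_to_filelist([h1_cache])
print("%d frame files converted" % len(frame_files))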
def create_node(self, parent, inj_trigs, inj_string, max_inc, segment):
    node = Node(self)
    trig_name = self.cp.get('workflow', 'trigger-name')
    node.add_opt('--inj-string', inj_string)
    node.add_opt('--max-inclination', max_inc)
    node.add_opt('--inj-cache', '%s' % parent.storage_path)

    out_files = FileList([])
    for inj_trig in inj_trigs:
        out_file_tag = [inj_string, "FILTERED", max_inc,
                        inj_trig.tag_str.rsplit('_', 1)[-1]]
        out_file = File(self.ifos, inj_trig.description, inj_trig.segment,
                        extension="xml", directory=self.out_dir,
                        tags=out_file_tag)
        out_file.PFN(out_file.cache_entry.path, site="local")
        out_files.append(out_file)

    node.add_opt('--output-dir', self.out_dir)

    return node, out_files
def setup_background_bins(workflow, coinc_files, bank_file, out_dir,
                          tags=None):
    tags = [] if tags is None else tags

    bins_exe = PyCBCDistributeBackgroundBins(workflow.cp,
                                             'distribute_background_bins',
                                             ifos=workflow.ifos,
                                             tags=tags, out_dir=out_dir)
    statmap_exe = PyCBCStatMapExecutable(workflow.cp, 'statmap',
                                         ifos=workflow.ifos,
                                         tags=tags, out_dir=out_dir)
    cstat_exe = PyCBCCombineStatmap(workflow.cp, 'combine_statmap',
                                    ifos=workflow.ifos,
                                    tags=tags, out_dir=out_dir)

    background_bins = workflow.cp.get_opt_tags('workflow-coincidence',
                                               'background-bins',
                                               tags).split(' ')
    background_bins = [x for x in background_bins if x != '']

    bins_node = bins_exe.create_node(coinc_files, bank_file, background_bins)
    workflow += bins_node

    stat_files = FileList([])
    for i, coinc_file in enumerate(bins_node.output_files):
        statnode = statmap_exe.create_node(FileList([coinc_file]),
                                           tags=tags + ['BIN_%s' % i])
        workflow += statnode
        stat_files.append(statnode.output_files[0])
        stat_files[i].bin_name = bins_node.names[i]

    cstat_node = cstat_exe.create_node(stat_files, tags=tags)
    workflow += cstat_node

    return cstat_node.output_files[0], stat_files
def setup_background_bins(workflow, coinc_files, bank_file, out_dir,
                          tags=None):
    tags = [] if tags is None else tags

    bins_exe = PyCBCDistributeBackgroundBins(workflow.cp,
                                             'distribute_background_bins',
                                             ifos=workflow.ifos,
                                             tags=tags, out_dir=out_dir)
    statmap_exe = PyCBCStatMapExecutable(workflow.cp, 'statmap',
                                         ifos=workflow.ifos,
                                         tags=tags, out_dir=out_dir)
    cstat_exe = PyCBCCombineStatmap(workflow.cp, 'combine_statmap',
                                    ifos=workflow.ifos,
                                    tags=tags, out_dir=out_dir)

    background_bins = workflow.cp.get_opt_tags('workflow-coincidence',
                                               'background-bins',
                                               tags).split(' ')
    background_bins = [x for x in background_bins if x != '']

    bins_node = bins_exe.create_node(coinc_files, bank_file, background_bins)
    workflow += bins_node

    statmap_files = FileList([])
    for i, coinc_file in enumerate(bins_node.output_files):
        statnode = statmap_exe.create_node(FileList([coinc_file]),
                                           tags=tags + ['BIN_%s' % i])
        workflow += statnode
        statmap_files.append(statnode.output_files[0])
        statmap_files[i].bin_name = bins_node.names[i]

    cstat_node = cstat_exe.create_node(statmap_files, tags=tags)
    workflow += cstat_node

    return cstat_node.output_files[0], statmap_files
def setup_background_bins_inj(workflow, coinc_files, background_file,
                              bank_file, out_dir, tags=None):
    tags = [] if tags is None else tags

    bins_exe = PyCBCDistributeBackgroundBins(workflow.cp,
                                             'distribute_background_bins',
                                             ifos=workflow.ifos,
                                             tags=tags, out_dir=out_dir)
    statmap_exe = PyCBCStatMapInjExecutable(workflow.cp, 'statmap_inj',
                                            ifos=workflow.ifos,
                                            tags=tags, out_dir=out_dir)
    cstat_exe = PyCBCCombineStatmap(workflow.cp, 'combine_statmap',
                                    ifos=workflow.ifos,
                                    tags=tags, out_dir=out_dir)

    background_bins = workflow.cp.get_opt_tags('workflow-coincidence',
                                               'background-bins',
                                               tags).split(' ')
    background_bins = [x for x in background_bins if x != '']

    for inj_type in ['injinj', 'injfull', 'fullinj']:
        bins_node = bins_exe.create_node(FileList(coinc_files[inj_type]),
                                         bank_file, background_bins,
                                         tags=tags + [inj_type])
        workflow += bins_node
        coinc_files[inj_type] = bins_node.output_files

    stat_files = FileList([])
    for i in range(len(background_bins)):
        statnode = statmap_exe.create_node(
            FileList([coinc_files['injinj'][i]]),
            FileList([background_file[i]]),
            FileList([coinc_files['injfull'][i]]),
            FileList([coinc_files['fullinj'][i]]),
            tags=tags + ['BIN_%s' % i])
        workflow += statnode
        stat_files.append(statnode.output_files[0])

    cstat_node = cstat_exe.create_node(stat_files, tags=tags)
    workflow += cstat_node

    return cstat_node.output_files[0]
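The bin layout consumed by the setup_background_bins and setup_background_bins_inj variants above comes from the [workflow-coincidence] background-bins option, which is simply split on spaces. A hedged sketch of what that configuration might look like; the bin names and the name:parameter:boundary syntax shown here are illustrative assumptions, not a definitive reference for the binning executable.

# Hypothetical [workflow-coincidence] entry; the bin definitions below are
# illustrative assumptions about the expected name:parameter:boundary syntax.
example_background_bins = "bns:chirp:1.74 edge:SEOBNRPeak:220 bulk:total:150"

# The functions above recover the individual bin strings with:
bins = example_background_bins.split(' ')
bins = [x for x in bins if x != '']
# -> ['bns:chirp:1.74', 'edge:SEOBNRPeak:220', 'bulk:total:150']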
def setup_psd_pregenerated(workflow, tags=[]):
    '''
    Setup CBC workflow to use pregenerated psd files.
    The file given in cp.get('workflow-psd','psd-pregenerated-file-(ifo)')
    will be used as the --psd-file argument to geom_nonspinbank,
    geom_aligned_bank and pycbc_plot_psd_file.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    psd_files : pycbc.workflow.core.FileList
        The FileList holding the PSD files.
    '''
    psd_files = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_PSD"

    # Check for one psd for all ifos
    try:
        pre_gen_file = cp.get_opt_tags('workflow-psd',
                                       'psd-pregenerated-file', tags)
        pre_gen_file = resolve_url(pre_gen_file)
        file_url = urlparse.urljoin('file:',
                                    urllib.pathname2url(pre_gen_file))
        curr_file = File(workflow.ifos, user_tag, global_seg, file_url,
                         tags=tags)
        curr_file.PFN(file_url, site='local')
        psd_files.append(curr_file)
    except ConfigParser.Error:
        # Check for one psd per ifo
        for ifo in workflow.ifos:
            try:
                pre_gen_file = cp.get_opt_tags('workflow-psd',
                                'psd-pregenerated-file-%s' % ifo.lower(),
                                tags)
                pre_gen_file = resolve_url(pre_gen_file)
                file_url = urlparse.urljoin('file:',
                                            urllib.pathname2url(pre_gen_file))
                curr_file = File(ifo, user_tag, global_seg, file_url,
                                 tags=tags)
                curr_file.PFN(file_url, site='local')
                psd_files.append(curr_file)
            except ConfigParser.Error:
                # It's unlikely, but not impossible, that only some ifos
                # will have pregenerated PSDs
                logging.warn("No psd file specified for IFO %s." % (ifo,))
                pass

    return psd_files
def setup_psd_pregenerated(workflow, tags=None):
    '''
    Setup CBC workflow to use pregenerated psd files.
    The file given in cp.get('workflow-psd','psd-pregenerated-file-(ifo)')
    will be used as the --psd-file argument to geom_nonspinbank,
    geom_aligned_bank and pycbc_plot_psd_file.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    psd_files : pycbc.workflow.core.FileList
        The FileList holding the PSD files.
    '''
    if tags is None:
        tags = []
    psd_files = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_PSD"

    # Check for one psd for all ifos
    try:
        pre_gen_file = cp.get_opt_tags('workflow-psd',
                                       'psd-pregenerated-file', tags)
        pre_gen_file = resolve_url(pre_gen_file)
        file_url = urljoin('file:', pathname2url(pre_gen_file))
        curr_file = File(workflow.ifos, user_tag, global_seg, file_url,
                         tags=tags)
        curr_file.PFN(file_url, site='local')
        psd_files.append(curr_file)
    except ConfigParser.Error:
        # Check for one psd per ifo
        for ifo in workflow.ifos:
            try:
                pre_gen_file = cp.get_opt_tags('workflow-psd',
                                'psd-pregenerated-file-%s' % ifo.lower(),
                                tags)
                pre_gen_file = resolve_url(pre_gen_file)
                file_url = urljoin('file:', pathname2url(pre_gen_file))
                curr_file = File(ifo, user_tag, global_seg, file_url,
                                 tags=tags)
                curr_file.PFN(file_url, site='local')
                psd_files.append(curr_file)
            except ConfigParser.Error:
                # It's unlikely, but not impossible, that only some ifos
                # will have pregenerated PSDs
                logging.warn("No psd file specified for IFO %s." % (ifo,))
                pass

    return psd_files
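Both setup_psd_pregenerated variants read their input from the [workflow-psd] section, first looking for a single shared file and then falling back to one option per ifo. A minimal sketch of the configuration they expect; the paths are hypothetical.

# Hypothetical ini content consumed by setup_psd_pregenerated; either the
# shared psd-pregenerated-file or the per-ifo options may be given.
example_workflow_psd_section = """
[workflow-psd]
psd-pregenerated-file-h1 = /path/to/H1-REFERENCE_PSD.txt
psd-pregenerated-file-l1 = /path/to/L1-REFERENCE_PSD.txt
"""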
def create_node(self, trig_files=None, segment_dir=None, analysis_seg=None,
                out_tags=[], tags=[]):
    node = Node(self)

    if not trig_files:
        raise ValueError("%s must be supplied with trigger files"
                         % self.name)

    # Data options
    num_trials = int(self.cp.get("trig_combiner", "num-trials"))
    trig_name = self.cp.get('workflow', 'trigger-name')

    if all("COHERENT_NO_INJECTIONS" in t.name for t in trig_files) and \
            self.cp.has_option_tag('inspiral', 'do-short-slides',
                                   'coherent_no_injections'):
        node.add_opt('--short-slides')

    node.add_opt('--grb-name', trig_name)
    node.add_opt('--trig-start-time', analysis_seg[0])
    node.add_opt('--ifo-tag', self.ifos)
    node.add_opt('--user-tag', 'INSPIRAL')

    # Set input / output options
    node.add_input_list_opt('--input-files', trig_files)
    node.add_opt('--segment-dir', segment_dir)
    node.add_opt('--output-dir', self.out_dir)

    out_files = FileList([])
    for out_tag in out_tags:
        out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment,
                        directory=self.out_dir, extension='xml.gz',
                        tags=["GRB%s" % trig_name, out_tag],
                        store_file=self.retain_files)
        out_files.append(out_file)

    for trial in range(1, num_trials + 1):
        out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment,
                        directory=self.out_dir, extension='xml.gz',
                        tags=["GRB%s" % trig_name, "OFFTRIAL_%d" % trial],
                        store_file=self.retain_files)
        out_files.append(out_file)

    node.add_profile('condor', 'request_cpus', self.num_threads)

    return node, out_files
def convert_cachelist_to_filelist(datafindcache_list):
    """
    Take as input a list of glue.lal.Cache objects and return a pycbc FileList
    containing all frames within those caches.

    Parameters
    ----------
    datafindcache_list : list of glue.lal.Cache objects
        The list of cache files to convert.

    Returns
    -------
    datafind_filelist : FileList of frame File objects
        The list of frame files.
    """
    datafind_filelist = FileList([])
    prev_file = None
    for cache in datafindcache_list:
        curr_ifo = cache.ifo
        for frame in cache:
            # Don't add a new workflow file entry for this frame if it is a
            # duplicate. These are assumed to be returned in time order.
            if prev_file:
                prev_name = prev_file.cache_entry.url.split('/')[-1]
                this_name = frame.url.split('/')[-1]
                if prev_name == this_name:
                    continue

            # Pegasus doesn't like "localhost" in URLs.
            frame.url = frame.url.replace('file://localhost', 'file://')

            currFile = File(curr_ifo, frame.description, frame.segment,
                            file_url=frame.url, use_tmp_subdirs=True)
            if frame.url.startswith('file://'):
                currFile.PFN(frame.url, site='local')
                if frame.url.startswith(
                        'file:///cvmfs/oasis.opensciencegrid.org/'):
                    # Datafind returned a URL valid on the osg as well
                    # so add the additional PFNs to allow OSG access.
                    currFile.PFN(frame.url, site='osg')
                    currFile.PFN(frame.url.replace(
                        'file:///cvmfs/oasis.opensciencegrid.org/',
                        'root://xrootd-local.unl.edu/user/'), site='osg')
                    currFile.PFN(frame.url.replace(
                        'file:///cvmfs/oasis.opensciencegrid.org/',
                        'gsiftp://red-gridftp.unl.edu/user/'), site='osg')
            else:
                currFile.PFN(frame.url, site='notlocal')
            datafind_filelist.append(currFile)
            prev_file = currFile
    return datafind_filelist
def convert_cachelist_to_filelist(datafindcache_list):
    """
    Take as input a list of glue.lal.Cache objects and return a pycbc FileList
    containing all frames within those caches.

    Parameters
    ----------
    datafindcache_list : list of glue.lal.Cache objects
        The list of cache files to convert.

    Returns
    -------
    datafind_filelist : FileList of frame File objects
        The list of frame files.
    """
    datafind_filelist = FileList([])
    prev_file = None
    for cache in datafindcache_list:
        curr_ifo = cache.ifo
        for frame in cache:
            # Don't add a new workflow file entry for this frame if it is a
            # duplicate. These are assumed to be returned in time order.
            if prev_file:
                prev_name = prev_file.cache_entry.url.split('/')[-1]
                this_name = frame.url.split('/')[-1]
                if prev_name == this_name:
                    continue

            # Pegasus doesn't like "localhost" in URLs.
            frame.url = frame.url.replace('file://localhost', 'file://')

            currFile = File(curr_ifo, frame.description, frame.segment,
                            file_url=frame.url, use_tmp_subdirs=True)
            if frame.url.startswith('file://'):
                currFile.PFN(frame.url, site='local')
                if frame.url.startswith(
                        'file:///cvmfs/oasis.opensciencegrid.org/'):
                    # Datafind returned a URL valid on the osg as well
                    # so add the additional PFNs to allow OSG access.
                    currFile.PFN(frame.url, site='osg')
                    currFile.PFN(frame.url.replace(
                        'file:///cvmfs/oasis.opensciencegrid.org/',
                        'root://xrootd-local.unl.edu/user/'), site='osg')
                    currFile.PFN(frame.url.replace(
                        'file:///cvmfs/oasis.opensciencegrid.org/',
                        'gsiftp://red-gridftp.unl.edu/user/'), site='osg')
                    currFile.PFN(frame.url.replace(
                        'file:///cvmfs/oasis.opensciencegrid.org/',
                        'gsiftp://ldas-grid.ligo.caltech.edu/hdfs/'),
                        site='osg')
            else:
                currFile.PFN(frame.url, site='notlocal')
            datafind_filelist.append(currFile)
            prev_file = currFile
    return datafind_filelist
def create_node(self, trig_files=None, segment_dir=None, out_tags=[],
                tags=[]):
    node = Node(self)

    if not trig_files:
        raise ValueError("%s must be supplied with trigger files"
                         % self.name)

    # Data options
    pad_data = self.cp.get('inspiral', 'pad-data')
    if pad_data is None:
        raise ValueError("The option pad-data is a required option of "
                         "%s. Please check the ini file." % self.name)

    num_trials = int(self.cp.get("trig_combiner", "num-trials"))
    trig_name = self.cp.get('workflow', 'trigger-name')

    if all("COHERENT_NO_INJECTIONS" in t.name for t in trig_files) and \
            self.cp.has_option_tag('inspiral', 'do-short-slides',
                                   'coherent_no_injections'):
        node.add_opt('--short-slides')

    node.add_opt('--grb-name', trig_name)
    node.add_opt('--pad-data', pad_data)
    node.add_opt('--segment-length', self.cp.get('inspiral',
                                                 'segment-duration'))
    node.add_opt('--ifo-tag', self.ifos)
    node.add_opt('--user-tag', 'INSPIRAL')

    # Set input / output options
    node.add_input_list_opt('--input-files', trig_files)
    node.add_opt('--segment-dir', segment_dir)
    node.add_opt('--output-dir', self.out_dir)

    out_files = FileList([])
    for out_tag in out_tags:
        out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment,
                        directory=self.out_dir, extension='xml.gz',
                        tags=["GRB%s" % trig_name, out_tag],
                        store_file=self.retain_files)
        out_files.append(out_file)

    for trial in range(1, num_trials + 1):
        out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment,
                        directory=self.out_dir, extension='xml.gz',
                        tags=["GRB%s" % trig_name, "OFFTRIAL_%d" % trial],
                        store_file=self.retain_files)
        out_files.append(out_file)

    node.add_profile('condor', 'request_cpus', self.num_threads)

    return node, out_files
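A hedged sketch of how these trig_combiner create_node variants are typically driven. The executable class name, the pre-existing workflow objects and the out_tags values below are assumptions for illustration only; they are not taken from this file.

# Hypothetical driver code: build a trig_combiner job and attach its node.
# TrigCombinerExecutable, workflow, and inspiral_trigger_files are assumed
# to exist; the tag names are illustrative.
job = TrigCombinerExecutable(workflow.cp, 'trig_combiner',
                             ifos=workflow.ifo_string, out_dir='triggers')
node, combined_files = job.create_node(trig_files=inspiral_trigger_files,
                                       segment_dir='segments',
                                       out_tags=['ONSOURCE', 'OFFSOURCE',
                                                 'ALL_TIMES'])
workflow.add_node(node)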
def setup_gate_pregenerated(workflow, output_dir=None, tags=None):
    '''
    Setup CBC workflow to use pregenerated gating files.
    The file given in cp.get('workflow-gating','gating-file-(ifo)')
    will be used as the --gating-file for all jobs for that ifo.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    output_dir : path string
        The directory where data products will be placed.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    gate_files : pycbc.workflow.core.FileList
        The FileList holding the gating files
    '''
    if tags is None:
        tags = []
    gate_files = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_GATE"

    for ifo in workflow.ifos:
        try:
            pre_gen_file = cp.get_opt_tags('workflow-gating',
                                           'gating-file-%s' % ifo.lower(),
                                           tags)
            pre_gen_file = resolve_url(pre_gen_file,
                                       os.path.join(os.getcwd(), output_dir))
            file_url = urlparse.urljoin('file:',
                                        urllib.pathname2url(pre_gen_file))
            curr_file = File(ifo, user_tag, global_seg, file_url, tags=tags)
            curr_file.PFN(file_url, site='local')
            gate_files.append(curr_file)

            logging.info("Using gating file %s for %s", file_url, ifo)
        except ConfigParser.Error:
            logging.info("No gating file specified for %s", ifo)

    return gate_files
def setup_gate_pregenerated(workflow, tags=None):
    '''
    Setup CBC workflow to use pregenerated gating files.
    The file given in cp.get('workflow-gating','gating-pregenerated-file-(ifo)')
    will be used as the --gating-file for all matched-filtering jobs for that
    ifo.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    gate_files : pycbc.workflow.core.FileList
        The FileList holding the gating files
    '''
    if tags is None:
        tags = []
    gate_files = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_GATE"

    for ifo in workflow.ifos:
        try:
            pre_gen_file = cp.get_opt_tags(
                'workflow-gating',
                'gating-pregenerated-file-%s' % ifo.lower(), tags)
            pre_gen_file = resolve_url(pre_gen_file)
            file_url = urlparse.urljoin('file:',
                                        urllib.pathname2url(pre_gen_file))
            curr_file = File(ifo, user_tag, global_seg, file_url, tags=tags)
            curr_file.PFN(file_url, site='local')
            gate_files.append(curr_file)
        except ConfigParser.Error:
            # It's unlikely, but not impossible, that only some ifos
            # will be gated
            logging.warn("No gating file specified for IFO %s." % (ifo,))
            pass

    return gate_files
def setup_gate_pregenerated(workflow, tags=[]):
    '''
    Setup CBC workflow to use pregenerated gating files.
    The file given in cp.get('workflow-gating','gating-pregenerated-file-(ifo)')
    will be used as the --gating-file for all matched-filtering jobs for that
    ifo.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    gate_files : pycbc.workflow.core.FileList
        The FileList holding the gating files
    '''
    gate_files = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_GATE"

    for ifo in workflow.ifos:
        try:
            pre_gen_file = cp.get_opt_tags('workflow-gating',
                            'gating-pregenerated-file-%s' % ifo.lower(),
                            tags)
            pre_gen_file = resolve_url(pre_gen_file)
            file_url = urlparse.urljoin('file:',
                                        urllib.pathname2url(pre_gen_file))
            curr_file = File(ifo, user_tag, global_seg, file_url, tags=tags)
            curr_file.PFN(file_url, site='local')
            gate_files.append(curr_file)
        except ConfigParser.Error:
            # It's unlikely, but not impossible, that only some ifos
            # will be gated
            logging.warn("No gating file specified for IFO %s." % (ifo,))
            pass

    return gate_files
def convert_cachelist_to_filelist(datafindcache_list):
    """
    Take as input a list of glue.lal.Cache objects and return a pycbc FileList
    containing all frames within those caches.

    Parameters
    ----------
    datafindcache_list : list of glue.lal.Cache objects
        The list of cache files to convert.

    Returns
    -------
    datafind_filelist : FileList of frame File objects
        The list of frame files.
    """
    datafind_filelist = FileList([])
    prev_file = None
    for cache in datafindcache_list:
        curr_ifo = cache.ifo
        for frame in cache:
            # Don't add a new workflow file entry for this frame if it is a
            # duplicate. These are assumed to be returned in time order.
            if prev_file:
                prev_name = prev_file.cache_entry.url.split('/')[-1]
                this_name = frame.url.split('/')[-1]
                if prev_name == this_name:
                    continue

            # Pegasus doesn't like "localhost" in URLs.
            frame.url = frame.url.replace('file://localhost', 'file://')

            currFile = File(curr_ifo, frame.description, frame.segment,
                            file_url=frame.url, use_tmp_subdirs=True)
            if frame.url.startswith('file://'):
                currFile.PFN(frame.url, site='local')
            else:
                currFile.PFN(frame.url, site='notlocal')
            datafind_filelist.append(currFile)
            prev_file = currFile
    return datafind_filelist
def datafind_keep_unique_backups(backup_outs, orig_outs):
    """This function will take a list of backup datafind files, presumably
    obtained by querying a remote datafind server, e.g. CIT, and compares
    these against a list of original datafind files, presumably obtained by
    querying the local datafind server. Only the datafind files in the backup
    list that do not appear in the original list are returned. This allows us
    to use only files that are missing from the local cluster.

    Parameters
    ----------
    backup_outs : FileList
        List of datafind files from the remote datafind server.
    orig_outs : FileList
        List of datafind files from the local datafind server.

    Returns
    -------
    FileList
        List of datafind files in backup_outs and not in orig_outs.
    """
    # NOTE: This function is not optimized and could be made considerably
    #       quicker if speed becomes an issue. With 4s frame files this might
    #       be slow, but for >1000s files I don't foresee any issue, so I keep
    #       this simple.
    return_list = FileList([])
    # We compare the LFNs to determine uniqueness
    # Is there a way to associate two paths with one LFN??
    orig_names = [f.name for f in orig_outs]
    for file in backup_outs:
        if file.name not in orig_names:
            return_list.append(file)
        else:
            index_num = orig_names.index(file.name)
            orig_out = orig_outs[index_num]
            pfns = list(file.pfns)
            # This shouldn't happen, but catch if it does
            assert(len(pfns) == 1)
            orig_out.PFN(pfns[0].url, site='notlocal')

    return return_list
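A minimal usage sketch, assuming two FileLists of frame File objects already returned by local and remote (e.g. CIT) datafind queries; the variable names are hypothetical.

# local_frames and cit_frames are FileLists of frame File objects from the
# local and remote datafind servers respectively (hypothetical names).
missing_locally = datafind_keep_unique_backups(cit_frames, local_frames)
# Frames already known locally have gained an extra 'notlocal' PFN inside
# the call; only the genuinely new files are returned for use downstream.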
def setup_segment_gen_mixed(workflow, veto_categories, out_dir,
                            maxVetoAtRunTime, tag=None,
                            generate_coincident_segs=True):
    """
    This function will generate veto files for each ifo and for each veto
    category. It can generate these vetoes at run-time or in the workflow
    (or do some at run-time and some in the workflow). However, the CAT_1
    vetoes and science time must be generated at run time as they are needed
    to plan the workflow. CATs 2 and higher *may* be needed for other workflow
    construction.
    It can also combine these files to create a set of cumulative,
    multi-detector veto files, which can be used in ligolw_thinca and in
    pipedown. Again these can be created at run time or within the workflow.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
        This instance also contains the ifos for which to attempt to obtain
        segments for this analysis and the start and end times to search for
        segments over.
    veto_categories : list of ints
        List of veto categories to generate segments for. If this stops being
        integers, this can be changed here.
    out_dir : path
        The directory in which output will be stored.
    maxVetoAtRunTime : int
        Generate veto files at run time up to this category. Veto categories
        beyond this in veto_categories will be generated in the workflow.
        If we move to a model where veto categories are not explicitly
        cumulative, this will be rethought.
    tag : string, optional (default=None)
        Use this to specify a tag. This can be used if this module is being
        called more than once to give call specific configuration (by setting
        options in [workflow-datafind-${TAG}] rather than [workflow-datafind]).
        This is also used to tag the Files returned by the class to uniqueify
        the Files and uniqueify the actual filename.
        FIXME: Filenames may not be unique with current codes!
    generate_coincident_segs : boolean, optional (default = True)
        If given this module will generate a set of coincident, cumulative
        veto files that can be used with ligolw_thinca and pipedown.

    Returns
    -------
    segFilesList : dictionary of pycbc.workflow.core.SegFile instances
        These are representations of the various segment files that were
        constructed at this stage of the workflow and may be needed at later
        stages of the analysis (e.g. for performing DQ vetoes). If the file
        was generated at run-time the segment lists contained within these
        files will be an attribute of the instance. (If it will be generated
        in the workflow it will not be because I am not psychic).
    """
    cp = workflow.cp
    segFilesList = FileList([])
    start_time = workflow.analysis_time[0]
    end_time = workflow.analysis_time[1]
    segValidSeg = workflow.analysis_time
    # Will I need to add some jobs to the workflow?
    vetoGenJob = create_segs_from_cats_job(cp, out_dir, workflow.ifo_string)

    for ifo in workflow.ifos:
        logging.info("Generating science segments for ifo %s" % (ifo))
        currSciSegs, currSciXmlFile = get_science_segments(ifo, cp,
                                                 start_time, end_time,
                                                 out_dir, tag=tag)
        segFilesList.append(currSciXmlFile)

        for category in veto_categories:
            if category > maxVetoAtRunTime:
                msg = "Adding creation of CAT_%d segments " % (category)
                msg += "for ifo %s to workflow." % (ifo)
                logging.info(msg)
                execute_status = False
            if category <= maxVetoAtRunTime:
                logging.info("Generating CAT_%d segments for ifo %s."
                             % (category, ifo))
                execute_status = True

            currVetoXmlFile = get_veto_segs(workflow, ifo, category,
                                            start_time, end_time, out_dir,
                                            vetoGenJob,
                                            execute_now=execute_status)
            segFilesList.append(currVetoXmlFile)
            # Store the CAT_1 veto segs for use below
            if category == 1:
                # Yes, it's yucky to generate a file and then read it back
                # in. This will be fixed when the new API for segment
                # generation is ready.
                vetoXmlFP = open(currVetoXmlFile.storage_path, 'r')
                cat1Segs = fromsegmentxml(vetoXmlFP)
                vetoXmlFP.close()

        analysedSegs = currSciSegs - cat1Segs
        analysedSegs.coalesce()
        analysedXmlFile = os.path.join(out_dir,
                             "%s-SCIENCE_OK_SEGMENTS.xml" % (ifo.upper()))
        currUrl = urlparse.urlunparse(['file', 'localhost', analysedXmlFile,
                                       None, None, None])
        if tag:
            currTags = [tag, 'SCIENCE_OK']
        else:
            currTags = ['SCIENCE_OK']
        currFile = OutSegFile(ifo, 'SEGMENTS', segValidSeg, currUrl,
                              segment_list=analysedSegs, tags=currTags)
        segFilesList.append(currFile)
        currFile.toSegmentXml()

    if generate_coincident_segs:
        # Need to make some combined category veto files to use when
        # vetoing segments and triggers.
        ifo_string = workflow.ifo_string
        categories = []
        cum_cat_files = []
        for category in veto_categories:
            categories.append(category)
            # Set file name in workflow standard
            if tag:
                currTags = [tag, 'CUMULATIVE_CAT_%d' % (category)]
            else:
                currTags = ['CUMULATIVE_CAT_%d' % (category)]
            cumulativeVetoFile = os.path.join(out_dir,
                                   '%s-CUMULATIVE_CAT_%d_VETO_SEGMENTS.xml'
                                   % (ifo_string, category))
            currUrl = urlparse.urlunparse(['file', 'localhost',
                                           cumulativeVetoFile, None, None,
                                           None])
            currSegFile = OutSegFile(ifo_string, 'SEGMENTS', segValidSeg,
                                     currUrl, tags=currTags)
            # And actually make the file (or queue it in the workflow)
            logging.info("Generating combined, cumulative CAT_%d segments."
                         % (category))
            if category <= maxVetoAtRunTime:
                execute_status = True
            else:
                execute_status = False
            get_cumulative_segs(workflow, currSegFile, categories,
                                segFilesList, out_dir,
                                execute_now=execute_status)

            segFilesList.append(currSegFile)
            cum_cat_files.append(currSegFile)

        # Create a combined file
        # Set file tag in workflow standard
        if tag:
            currTags = [tag, 'COMBINED_CUMULATIVE_SEGMENTS']
        else:
            currTags = ['COMBINED_CUMULATIVE_SEGMENTS']
        combined_veto_file = os.path.join(out_dir,
                                '%s-CUMULATIVE_ALL_CATS_SEGMENTS.xml'
                                % (ifo_string))
        curr_url = urlparse.urlunparse(['file', 'localhost',
                                        combined_veto_file, None, None,
                                        None])
        curr_file = OutSegFile(ifo_string, 'SEGMENTS', segValidSeg,
                               curr_url, tags=currTags)

        for category in veto_categories:
            if category <= maxVetoAtRunTime:
                execute_status = True
                break
            else:
                execute_status = False
        add_cumulative_files(workflow, curr_file, cum_cat_files, out_dir,
                             execute_now=execute_status)
        segFilesList.append(curr_file)

    return segFilesList
def get_cumulative_veto_group_files(workflow, option, out_dir, tags=[]):
    """
    Get the cumulative veto files that define the different backgrounds
    we want to analyze, defined by groups of vetos.

    Parameters
    ----------
    workflow : Workflow object
        Instance of the workflow object
    option : str
        ini file option to use to get the veto groups
    out_dir : path
        Location to store output files
    tags : list of strings
        Used to retrieve subsections of the ini file for
        configuration options.

    Returns
    -------
    seg_files : workflow.core.FileList instance
        The cumulative segment files for each veto group.
    cat_files : workflow.core.FileList instance
        The list of individual category veto files
    """
    make_analysis_dir(out_dir)
    start_time = workflow.analysis_time[0]
    end_time = workflow.analysis_time[1]

    cat_sets = parse_cat_ini_opt(workflow.cp.get_opt_tags('workflow-segments',
                                                          option, tags))
    veto_gen_job = create_segs_from_cats_job(workflow.cp, out_dir,
                                             workflow.ifo_string)
    cats = set()
    for cset in cat_sets:
        cats = cats.union(cset)

    cat_files = FileList()
    for ifo in workflow.ifos:
        for category in cats:
            cat_files.append(get_veto_segs(workflow, ifo,
                                           cat_to_pipedown_cat(category),
                                           start_time, end_time, out_dir,
                                           veto_gen_job, execute_now=True))

    cum_seg_files = FileList()
    names = []
    for cat_set in cat_sets:
        segment_name = "CUMULATIVE_CAT_%s" % (''.join(sorted(cat_set)))
        logging.info('getting information for %s' % segment_name)
        categories = [cat_to_pipedown_cat(c) for c in cat_set]
        path = os.path.join(out_dir, '%s-%s_VETO_SEGMENTS.xml'
                            % (workflow.ifo_string, segment_name))
        path = os.path.abspath(path)
        url = urlparse.urlunparse(['file', 'localhost', path, None, None,
                                   None])
        seg_file = File(workflow.ifos, 'CUM_VETOSEGS', workflow.analysis_time,
                        file_url=url, tags=[segment_name])

        cum_seg_files += [get_cumulative_segs(workflow, seg_file, categories,
                                              cat_files, out_dir,
                                              execute_now=True,
                                              segment_name=segment_name)]
        names.append(segment_name)

    return cum_seg_files, names, cat_files
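A hedged usage sketch for the function above; the option name passed in is an assumption about how the ini file names its veto-group option, and the workflow object is assumed to exist already.

# Hypothetical call: 'segments-veto-groups' is assumed to be the
# [workflow-segments] option listing the veto groups to analyse.
cum_files, group_names, cat_files = get_cumulative_veto_group_files(
    workflow, 'segments-veto-groups', 'segments')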
def setup_injection_workflow(workflow, output_dir=None,
                             inj_section_name='injections',
                             exttrig_file=None, tags=None):
    """
    This function is the gateway for setting up injection-generation jobs in
    a workflow. It should be possible for this function to support a number
    of different ways/codes that could be used for doing this, however as
    this will presumably stay as a single call to a single code (which need
    not be inspinj) there are currently no subfunctions in this module.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    output_dir : path
        The directory in which injection files will be stored.
    inj_section_name : string (optional, default='injections')
        The string that corresponds to the option describing the exe location
        in the [executables] section of the .ini file and that corresponds to
        the section (and sub-sections) giving the options that will be given
        to the code at run time.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. This will be used in output names.

    Returns
    -------
    inj_files : pycbc.workflow.core.FileList
        The list of injection files created by this call.
    inj_tags : list of strings
        The tag corresponding to each injection file and used to uniquely
        identify them. The FileList class contains functions to search
        based on tags.
    """
    if tags is None:
        tags = []
    logging.info("Entering injection module.")
    make_analysis_dir(output_dir)

    # Get full analysis segment for output file naming
    full_segment = workflow.analysis_time
    ifos = workflow.ifos

    # Identify which injections to do by presence of sub-sections in
    # the configuration file
    inj_tags = []
    inj_files = FileList([])

    for section in workflow.cp.get_subsections(inj_section_name):
        inj_tag = section.upper()
        curr_tags = tags + [inj_tag]

        # Parse for options in ini file
        injection_method = workflow.cp.get_opt_tags("workflow-injections",
                                                    "injections-method",
                                                    curr_tags)

        if injection_method in ["IN_WORKFLOW", "AT_RUNTIME"]:
            # FIXME: Add ability to specify different exes
            inj_job = LalappsInspinjExecutable(workflow.cp, inj_section_name,
                                               out_dir=output_dir, ifos='HL',
                                               tags=curr_tags)
            node = inj_job.create_node(full_segment)
            if injection_method == "AT_RUNTIME":
                workflow.execute_node(node)
            else:
                workflow.add_node(node)
            inj_file = node.output_files[0]
            inj_files.append(inj_file)
        elif injection_method == "PREGENERATED":
            file_attrs = {
                'ifos': ['HL'],
                'segs': full_segment,
                'tags': curr_tags
            }
            injection_path = workflow.cp.get_opt_tags(
                "workflow-injections", "injections-pregenerated-file",
                curr_tags)
            curr_file = resolve_url_to_file(injection_path, attrs=file_attrs)
            inj_files.append(curr_file)
        elif injection_method in ["IN_COH_PTF_WORKFLOW", "AT_COH_PTF_RUNTIME"]:
            inj_job = LalappsInspinjExecutable(workflow.cp, inj_section_name,
                                               out_dir=output_dir, ifos=ifos,
                                               tags=curr_tags)
            node = inj_job.create_node(full_segment, exttrig_file)
            if injection_method == "AT_COH_PTF_RUNTIME":
                workflow.execute_node(node)
            else:
                workflow.add_node(node)
            inj_file = node.output_files[0]

            if workflow.cp.has_option("workflow-injections",
                                      "em-bright-only"):
                em_filter_job = PycbcDarkVsBrightInjectionsExecutable(
                    workflow.cp, 'em_bright_filter', tags=curr_tags,
                    out_dir=output_dir, ifos=ifos)
                node = em_filter_job.create_node(inj_file, full_segment,
                                                 curr_tags)
                if injection_method == "AT_COH_PTF_RUNTIME":
                    workflow.execute_node(node)
                else:
                    workflow.add_node(node)
                inj_file = node.output_files[0]

            if workflow.cp.has_option("workflow-injections",
                                      "do-jitter-skyloc"):
                jitter_job = LigolwCBCJitterSkylocExecutable(
                    workflow.cp, 'jitter_skyloc', tags=curr_tags,
                    out_dir=output_dir, ifos=ifos)
                node = jitter_job.create_node(inj_file, full_segment,
                                              curr_tags)
                if injection_method == "AT_COH_PTF_RUNTIME":
                    workflow.execute_node(node)
                else:
                    workflow.add_node(node)
                inj_file = node.output_files[0]

            if workflow.cp.has_option("workflow-injections",
                                      "do-align-total-spin"):
                align_job = LigolwCBCAlignTotalSpinExecutable(
                    workflow.cp, 'align_total_spin', tags=curr_tags,
                    out_dir=output_dir, ifos=ifos)
                node = align_job.create_node(inj_file, full_segment,
                                             curr_tags)
                if injection_method == "AT_COH_PTF_RUNTIME":
                    workflow.execute_node(node)
                else:
                    workflow.add_node(node)
                inj_file = node.output_files[0]

            inj_files.append(inj_file)
        else:
            err = "Injection method must be one of IN_WORKFLOW, "
            err += "AT_RUNTIME or PREGENERATED. Got %s." % (injection_method)
            raise ValueError(err)

        inj_tags.append(inj_tag)

    logging.info("Leaving injection module.")

    return inj_files, inj_tags
def setup_tmpltbank_pregenerated(workflow, tags=None):
    '''
    Setup CBC workflow to use a pregenerated template bank.
    The bank given in cp.get('workflow-tmpltbank','tmpltbank-pregenerated-bank')
    will be used as the input file for all matched-filtering jobs. If this
    option is present, workflow will assume that it should be used and not
    generate template banks within the workflow.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    tmplt_banks : pycbc.workflow.core.FileList
        The FileList holding the details of the template bank.
    '''
    if tags is None:
        tags = []
    # Currently this uses the *same* fixed bank for all ifos.
    # Maybe we want to add capability to analyse separate banks in all ifos?

    # Set up class for holding the banks
    tmplt_banks = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_TMPLTBANK"
    try:
        # First check if we have a bank for all ifos
        pre_gen_bank = cp.get_opt_tags('workflow-tmpltbank',
                                       'tmpltbank-pregenerated-bank', tags)
        pre_gen_bank = resolve_url(pre_gen_bank)
        file_url = urlparse.urljoin('file:',
                                    urllib.pathname2url(pre_gen_bank))
        curr_file = File(workflow.ifos, user_tag, global_seg, file_url,
                         tags=tags)
        curr_file.PFN(file_url, site='local')
        tmplt_banks.append(curr_file)
    except ConfigParser.Error:
        # Okay then I must have banks for each ifo
        for ifo in workflow.ifos:
            try:
                pre_gen_bank = cp.get_opt_tags('workflow-tmpltbank',
                                'tmpltbank-pregenerated-bank-%s' % ifo.lower(),
                                tags)
                pre_gen_bank = resolve_url(pre_gen_bank)
                file_url = urlparse.urljoin('file:',
                                            urllib.pathname2url(pre_gen_bank))
                curr_file = File(ifo, user_tag, global_seg, file_url,
                                 tags=tags)
                curr_file.PFN(file_url, site='local')
                tmplt_banks.append(curr_file)
            except ConfigParser.Error:
                err_msg = "Cannot find pregenerated template bank in section "
                err_msg += "[workflow-tmpltbank] or any tagged sections. "
                if tags:
                    tagged_secs = " ".join("[workflow-tmpltbank-%s]"
                                           % (ifo,) for ifo in workflow.ifos)
                    err_msg += "Tagged sections are %s. " % (tagged_secs,)
                err_msg += "I looked for 'tmpltbank-pregenerated-bank' option "
                err_msg += "and 'tmpltbank-pregenerated-bank-%s'." % (ifo,)
                raise ConfigParser.Error(err_msg)

    return tmplt_banks
def setup_postprocprep_pipedown_workflow(workflow, coincFiles, output_dir,
                                         tags=[], do_repop=False,
                                         injectionFiles=None, vetoFiles=None,
                                         injLessTag=None, injectionTags=[],
                                         veto_cats=[]):
    """
    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    coincFiles : pycbc.workflow.core.FileList
        A FileList of the coincident trigger files that are used as input at
        this stage.
    output_dir : path
        The directory in which output files will be stored.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. An example might be ['POSTPROC1'] or
        ['DENTYSNEWPOSTPROC']. This will be used in output names.
    do_repop : Boolean
        If False, use the 'coinc_inspiral.snr' column from the coincident
        trigger files as clustering and ranking statistic; if True, use a
        repop_coinc job before clustering to calculate a different ranking
        statistic and store in the coinc_inspiral table for later use.
    injectionFiles : pycbc.workflow.core.FileList (optional, default=None)
        The injection files to be used in this stage. An empty list (or any
        other input that evaluates as false) is valid and will imply that no
        injections are being done.
    vetoFiles : pycbc.workflow.core.FileList (required)
        The data quality files to be used in this stage. This is required and
        will be used to determine the analysed times when doing
        post-processing.
    injLessTag : string (required)
        The tag that identifies files that do not have simulations in them.
        Ie. the primary search results.
    injectionTags : list of strings (optional, default = [])
        Each injection file has a unique tag. If used in the method, this
        tells the post-processing preparation code which injection tags it
        should include when creating the combined output.
    veto_cats : list of integers (optional, default = [])
        Decide which set of veto files should be used in the post-processing
        preparation. For example tell the workflow to only generate results
        at cumulative categories 2, 3 and 4 by supplying [2,3,4] here.

    Returns
    -------
    finalFiles : pycbc.workflow.core.FileList
        A list of the single SQL database storing the clustered, injection
        found, triggers for all injections, time slid and zero lag analyses.
    initialSqlFiles : pycbc.workflow.core.FileList
        The SQL files before clustering is applied and injection finding
        performed.
    clusteredSqlFiles : pycbc.workflow.core.FileList
        The clustered SQL files before injection finding performed.
    combinedSqlFiles : pycbc.workflow.core.FileList
        A combined file containing all triggers after clustering, including
        the injection and veto tables, but before injection finding
        performed. Probably there is no need to ever keep this file and it
        will be a temporary file in most cases.
    """
    if not veto_cats:
        raise ValueError("A non-empty list of veto categories is required.")

    # Setup needed exe classes
    sqliteCombine1ExeTag = workflow.cp.get_opt_tags("workflow-postprocprep",
                                   "postprocprep-combiner1-exe", tags)
    sqliteCombine1Exe = select_generic_executable(workflow,
                                                  sqliteCombine1ExeTag)
    sqliteCombine2ExeTag = workflow.cp.get_opt_tags("workflow-postprocprep",
                                   "postprocprep-combiner2-exe", tags)
    sqliteCombine2Exe = select_generic_executable(workflow,
                                                  sqliteCombine2ExeTag)
    clusterCoincsExeTag = workflow.cp.get_opt_tags("workflow-postprocprep",
                                   "postprocprep-cluster-exe", tags)
    clusterCoincsExe = select_generic_executable(workflow,
                                                 clusterCoincsExeTag)
    injFindExeTag = workflow.cp.get_opt_tags("workflow-postprocprep",
                                   "postprocprep-injfind-exe", tags)
    injFindExe = select_generic_executable(workflow, injFindExeTag)

    sqliteCombine1Outs = FileList([])
    clusterCoincsOuts = FileList([])
    injFindOuts = FileList([])
    sqliteCombine2Outs = FileList([])

    if do_repop:
        repopCoincExeTag = workflow.cp.get_opt_tags("workflow-postprocprep",
                                   "postprocprep-repop-exe", tags)
        repopCoincExe = select_generic_executable(workflow, repopCoincExeTag)
        repopCoincOuts = FileList([])

    for cat in veto_cats:
        # FIXME: Some hacking is still needed while we support pipedown
        # FIXME: There are currently 3 names to say cumulative cat_3
        vetoTag = 'CUMULATIVE_CAT_%d' % (cat)
        dqSegFile = vetoFiles.find_output_with_tag(vetoTag)
        if not len(dqSegFile) == 1:
            errMsg = "Did not find exactly 1 data quality file."
            raise ValueError(errMsg)
        # Don't think this is used here, this is the tag *in* the file
        dqVetoName = 'VETO_CAT%d_CUMULATIVE' % (cat)
        # FIXME: Here we set the dqVetoName to be compatible with pipedown
        pipedownDQVetoName = 'CAT_%d_VETO' % (cat)

        sqliteCombine2Inputs = FileList([])
        # Do injection-less jobs first.

        # Choose a label for clustering the jobs
        job_label = get_random_label()

        # Combine trig files first
        currTags = tags + [injLessTag, vetoTag]
        trigVetoInpFiles = coincFiles.find_output_with_tag(pipedownDQVetoName)
        trigInpFiles = trigVetoInpFiles.find_output_with_tag(injLessTag)
        if len(trigInpFiles) == 0:
            err_msg = "No input files found. Workflow would fail."
            raise ValueError(err_msg)
        trigInpFiles.append(dqSegFile[0])
        sqliteCombine1Job = sqliteCombine1Exe(workflow.cp,
                                              sqliteCombine1ExeTag,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=currTags)
        sqliteCombine1Node = sqliteCombine1Job.create_node(
            workflow.analysis_time, trigInpFiles, workflow=workflow)
        sqliteCombine1Node.add_profile('pegasus', 'label', job_label)
        workflow.add_node(sqliteCombine1Node)
        # Node has only one output file
        sqliteCombine1Out = sqliteCombine1Node.output_files[0]
        sqliteCombine1Outs.append(sqliteCombine1Out)

        if do_repop:
            repopCoincJob = repopCoincExe(workflow.cp, repopCoincExeTag,
                                          ifo=workflow.ifo_string,
                                          out_dir=output_dir, tags=currTags)
            repopCoincNode = repopCoincJob.create_node(workflow.analysis_time,
                                                       sqliteCombine1Out)
            repopCoincNode.add_profile('pegasus', 'label', job_label)
            workflow.add_node(repopCoincNode)
            # Node has only one output file
            repopCoincOut = repopCoincNode.output_files[0]
            repopCoincOuts.append(repopCoincOut)

        # Input file plumbing allowing for possible repop_coinc job
        clusterCoincsIn = repopCoincOut if do_repop else sqliteCombine1Out
        # Cluster coincidences
        clusterCoincsJob = clusterCoincsExe(workflow.cp, clusterCoincsExeTag,
                                            ifo=workflow.ifo_string,
                                            out_dir=output_dir,
                                            tags=currTags)
        clusterCoincsNode = clusterCoincsJob.create_node(
            workflow.analysis_time, clusterCoincsIn)
        clusterCoincsNode.add_profile('pegasus', 'label', job_label)
        workflow.add_node(clusterCoincsNode)
        # Node has only one output file
        clusterCoincsOut = clusterCoincsNode.output_files[0]
        clusterCoincsOuts.append(clusterCoincsOut)
        sqliteCombine2Inputs.append(clusterCoincsOut)

        # Do injection jobs
        for injTag in injectionTags:
            # Choose a label for clustering the jobs
            job_label = get_random_label()
            # Combine trig files first
            currTags = tags + [injTag, vetoTag]
            trigInpFiles = trigVetoInpFiles.find_output_with_tag(injTag)
            trigInpFiles.append(dqSegFile[0])
            injFile = injectionFiles.find_output_with_tag(injTag)
            assert (len(injFile) == 1)
            sqliteCombine1Job = sqliteCombine1Exe(workflow.cp,
                                                  sqliteCombine1ExeTag,
                                                  ifo=workflow.ifo_string,
                                                  out_dir=output_dir,
                                                  tags=currTags)
            sqliteCombine1Node = sqliteCombine1Job.create_node(
                workflow.analysis_time, trigInpFiles, injFile=injFile[0],
                injString=injTag, workflow=workflow)
            sqliteCombine1Node.add_profile('pegasus', 'label', job_label)
            workflow.add_node(sqliteCombine1Node)
            # Node has only one output file
            sqliteCombine1Out = sqliteCombine1Node.output_files[0]
            sqliteCombine1Outs.append(sqliteCombine1Out)

            if do_repop:
                repopCoincJob = repopCoincExe(workflow.cp, repopCoincExeTag,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=currTags)
                repopCoincNode = repopCoincJob.create_node(
                    workflow.analysis_time, sqliteCombine1Out)
                repopCoincNode.add_profile('pegasus', 'label', job_label)
                workflow.add_node(repopCoincNode)
                # Node has only one output file
                repopCoincOut = repopCoincNode.output_files[0]
                repopCoincOuts.append(repopCoincOut)

            # Input file plumbing allowing for possible repop_coinc job
            clusterCoincsIn = repopCoincOut if do_repop else sqliteCombine1Out
            # Cluster coincidences
            clusterCoincsJob = clusterCoincsExe(workflow.cp,
                                                clusterCoincsExeTag,
                                                ifo=workflow.ifo_string,
                                                out_dir=output_dir,
                                                tags=currTags)
            clusterCoincsNode = clusterCoincsJob.create_node(
                workflow.analysis_time, clusterCoincsIn)
            clusterCoincsNode.add_profile('pegasus', 'label', job_label)
            workflow.add_node(clusterCoincsNode)
            # Node has only one output file
            clusterCoincsOut = clusterCoincsNode.output_files[0]
            clusterCoincsOuts.append(clusterCoincsOut)
            sqliteCombine2Inputs.append(clusterCoincsOut)

        # Choose a new label for pegasus-clustering the jobs
        job_label = get_random_label()

        # Combine everything together and add veto file
        currTags = tags + [vetoTag]
        sqliteCombine2Job = sqliteCombine2Exe(workflow.cp,
                                              sqliteCombine2ExeTag,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=currTags)
        sqliteCombine2Node = sqliteCombine2Job.create_node(
            workflow.analysis_time, sqliteCombine2Inputs)
        sqliteCombine2Node.add_profile('pegasus', 'label', job_label)
        workflow.add_node(sqliteCombine2Node)
        sqliteCombine2Out = sqliteCombine2Node.output_files[0]
        sqliteCombine2Outs.append(sqliteCombine2Out)

        # Inj finding
        injFindJob = injFindExe(workflow.cp, injFindExeTag,
                                ifo=workflow.ifo_string,
                                out_dir=output_dir, tags=currTags)
        injFindNode = injFindJob.create_node(workflow.analysis_time,
                                             sqliteCombine2Out)
        injFindNode.add_profile('pegasus', 'label', job_label)
        workflow.add_node(injFindNode)
        injFindOut = injFindNode.output_files[0]
        injFindOuts.append(injFindOut)

    return injFindOuts, sqliteCombine1Outs, clusterCoincsOuts, \
           sqliteCombine2Outs
def setup_postproc_pipedown_workflow(workflow, trigger_files, summary_xml_files, output_dir, tags=[], veto_cats=[]): """ This module sets up the post-processing stage in the workflow, using a pipedown style set up. This consists of running compute_durations to determine and store the analaysis time (foreground and background). It then runs cfar jobs to determine the false alarm rate for all triggers (simulations or otherwise) in the input database. Pipedown expects to take as input (at this stage) a single database containing all triggers. This sub-module follows that same idea, so len(triggerFiles) must equal 1 (for every DQ category that we will run). Parameters ---------- workflow : pycbc.workflow.core.Workflow The Workflow instance that the coincidence jobs will be added to. trigger_files : pycbc.workflow.core.FileList An FileList containing the combined databases at CAT_1,2,3... that will be used to calculate FARs summary_xml_files : pycbc.workflow.core.FileList (required) A FileList of the output of the analysislogging_utils module. For pipedown-style post-processing this should be one file containing a segment table holding the single detector analysed times. output_dir : path The directory in which output files will be stored. tags : list of strings (optional, default = []) A list of the tagging strings that will be used for all jobs created by this call to the workflow. An example might be ['POSTPROC1'] or ['DENTYSNEWPOSTPROC']. This will be used in output names. veto_cats : list of integers (default = [], non-empty list required) Decide which veto category levels should be used in post-processing. For example tell the workflow to only generate results at cumulative categories 2, 3 and 4 by supplying [2,3,4] here. Returns -------- final_files : pycbc.workflow.core.FileList A list of the final SQL databases containing computed FARs. """ if not veto_cats: raise ValueError("A non-empty list of veto categories is required.") if not len(summary_xml_files) == 1: errMsg = "I need exactly one summaryXML file, got %d." \ %(len(summary_xml_files),) raise ValueError(errMsg) # Setup needed exe classes compute_durations_exe_tag = workflow.cp.get_opt_tags( "workflow-postproc", "postproc-computedurations-exe", tags) compute_durations_exe = select_generic_executable( workflow, compute_durations_exe_tag) cfar_exe_tag = workflow.cp.get_opt_tags("workflow-postproc", "postproc-cfar-exe", tags) cfar_exe = select_generic_executable(workflow, cfar_exe_tag) comp_durations_outs = FileList([]) cfar_outs = FileList([]) for cat in veto_cats: veto_tag = 'CUMULATIVE_CAT_%d' % (cat) trig_input_files = trigger_files.find_output_with_tag(veto_tag) if not len(trig_input_files) == 1: err_msg = "Did not find exactly 1 database input file." 
raise ValueError(err_msg) curr_tags = tags + [veto_tag] # Choose a label for clustering the jobs job_label = get_random_label() # Start with compute durations computeDurationsJob = compute_durations_exe(workflow.cp, compute_durations_exe_tag, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) compute_durations_node = computeDurationsJob.create_node( workflow.analysis_time, trig_input_files[0], summary_xml_files[0]) compute_durations_node.add_profile('pegasus', 'label', job_label) workflow.add_node(compute_durations_node) # Node has only one output file compute_durations_out = compute_durations_node.output_files[0] comp_durations_outs.append(compute_durations_out) # Add the calculate FAR (cfar) job cfar_job = cfar_exe(workflow.cp, cfar_exe_tag, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) cfar_node = cfar_job.create_node(workflow.analysis_time, compute_durations_out) cfar_node.add_profile('pegasus', 'label', job_label) workflow.add_node(cfar_node) # Node has only one output file cfar_out = cfar_node.output_files[0] cfar_outs.append(cfar_out) return cfar_outs
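# Illustrative sketch (hypothetical helper, not part of the module): the loop
# above tags each compute_durations/cfar job pair with
# tags + ['CUMULATIVE_CAT_<N>'] so that outputs for different veto categories
# can be told apart. The pure-python helper below only reproduces that tag
# construction.
def _example_postproc_tags(base_tags, veto_cats):
    tags_by_cat = {}
    for cat in veto_cats:
        tags_by_cat[cat] = list(base_tags) + ['CUMULATIVE_CAT_%d' % cat]
    return tags_by_cat

# e.g. _example_postproc_tags(['FULL_DATA'], [2, 3])
#      -> {2: ['FULL_DATA', 'CUMULATIVE_CAT_2'],
#          3: ['FULL_DATA', 'CUMULATIVE_CAT_3']}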
def rerank_coinc_followup(workflow, statmap_file, bank_file, out_dir, tags=None, injection_file=None, ranking_file=None): if tags is None: tags = [] make_analysis_dir(out_dir) if not workflow.cp.has_section("workflow-rerank"): logging.info("No reranking done in this workflow") return statmap_file else: logging.info("Setting up reranking of candidates") # Generate reduced data files (maybe this could also be used elsewhere?) stores = FileList([]) for ifo in workflow.ifos: make_analysis_dir('strain_files') node = Executable(workflow.cp, 'strain_data_reduce', ifos=[ifo], out_dir='strain_files', tags=tags).create_node() node.add_opt('--gps-start-time', workflow.analysis_time[0]) node.add_opt('--gps-end-time', workflow.analysis_time[1]) if injection_file: node.add_input_opt('--injection-file', injection_file) fil = node.new_output_file_opt(workflow.analysis_time, '.hdf', '--output-file') stores.append(fil) workflow += node # Generate trigger input file node = Executable(workflow.cp, 'rerank_trigger_input', ifos=workflow.ifos, out_dir=out_dir, tags=tags).create_node() node.add_input_opt('--statmap-file', statmap_file) node.add_input_opt('--bank-file', bank_file) trigfil = node.new_output_file_opt(workflow.analysis_time, '.hdf', '--output-file') workflow += node # Parallelize coinc trigger followup factor = int( workflow.cp.get_opt_tags("workflow-rerank", "parallelization-factor", tags)) exe = Executable(workflow.cp, 'coinc_followup', ifos=workflow.ifos, out_dir=out_dir, tags=tags) stat_files = FileList([]) for i in range(factor): node = exe.create_node() node.new_output_file_opt(workflow.analysis_time, '.hdf', '--output-file', tags=[str(i)]) node.add_multiifo_input_list_opt('--hdf-store', stores) node.add_input_opt('--input-file', trigfil) node.add_opt('--start-index', str(i)) node.add_opt('--stride', factor) workflow += node stat_files += node.output_files exe = Executable(workflow.cp, 'rerank_coincs', ifos=workflow.ifos, out_dir=out_dir, tags=tags) node = exe.create_node() node.add_input_list_opt('--stat-files', stat_files) node.add_input_opt('--statmap-file', statmap_file) node.add_input_opt('--followup-file', trigfil) if ranking_file: node.add_input_opt('--ranking-file', ranking_file) node.new_output_file_opt(workflow.analysis_time, '.hdf', '--output-file') workflow += node return node.output_file
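# Illustrative sketch (hypothetical helper): each coinc_followup job above is
# handed --start-index=i and --stride=factor, so job i processes candidates
# i, i + factor, i + 2*factor, ... The helper below shows that partitioning in
# isolation; it is not the executable's actual implementation.
def _example_stride_partition(n_candidates, factor):
    # One list of candidate indices per parallel job.
    return [list(range(i, n_candidates, factor)) for i in range(factor)]

# e.g. _example_stride_partition(10, 4)
#      -> [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]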
def convert_cachelist_to_filelist(datafindcache_list): """ Take as input a list of glue.lal.Cache objects and return a pycbc FileList containing all frames within those caches. Parameters ----------- datafindcache_list : list of glue.lal.Cache objects The list of cache files to convert. Returns -------- datafind_filelist : FileList of frame File objects The list of frame files. """ prev_file = None prev_name = None this_name = None datafind_filelist = FileList([]) for cache in datafindcache_list: # sort the cache into time sequential order cache.sort() curr_ifo = cache.ifo for frame in cache: # Pegasus doesn't like "localhost" in URLs. frame.url = frame.url.replace('file://localhost','file://') # Create one File() object for each unique frame file that we # get back in the cache. if prev_file: prev_name = os.path.basename(prev_file.cache_entry.url) this_name = os.path.basename(frame.url) if (prev_file is None) or (prev_name != this_name): currFile = File(curr_ifo, frame.description, frame.segment, file_url=frame.url, use_tmp_subdirs=True) datafind_filelist.append(currFile) prev_file = currFile # Populate the PFNs for the File() we just created if frame.url.startswith('file://'): currFile.PFN(frame.url, site='local') if frame.url.startswith( 'file:///cvmfs/oasis.opensciencegrid.org/ligo/frames'): # Datafind returned a URL valid on the osg as well # so add the additional PFNs to allow OSG access. currFile.PFN(frame.url, site='osg') currFile.PFN(frame.url.replace( 'file:///cvmfs/oasis.opensciencegrid.org/', 'root://xrootd-local.unl.edu/user/'), site='osg') currFile.PFN(frame.url.replace( 'file:///cvmfs/oasis.opensciencegrid.org/', 'gsiftp://red-gridftp.unl.edu/user/'), site='osg') currFile.PFN(frame.url.replace( 'file:///cvmfs/oasis.opensciencegrid.org/', 'gsiftp://ldas-grid.ligo.caltech.edu/hdfs/'), site='osg') elif frame.url.startswith( 'file:///cvmfs/gwosc.osgstorage.org/'): # Datafind returned a URL valid on the osg as well # so add the additional PFNs to allow OSG access. for s in ['osg', 'orangegrid', 'osgconnect']: currFile.PFN(frame.url, site=s) currFile.PFN(frame.url, site="{}-scratch".format(s)) else: currFile.PFN(frame.url, site='notlocal') return datafind_filelist
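# Illustrative sketch (hypothetical helper): for frames served from
# /cvmfs/oasis.opensciencegrid.org the function above registers the original
# URL plus three rewritten URLs as 'osg' site PFNs. The helper below isolates
# that rewriting; the prefix list is copied from the replacements used above.
_EXAMPLE_OSG_PREFIXES = (
    'root://xrootd-local.unl.edu/user/',
    'gsiftp://red-gridftp.unl.edu/user/',
    'gsiftp://ldas-grid.ligo.caltech.edu/hdfs/',
)

def _example_osg_urls(frame_url):
    base = 'file:///cvmfs/oasis.opensciencegrid.org/'
    if not frame_url.startswith(base):
        return []
    # The original file:// URL is kept alongside the rewritten ones.
    return [frame_url] + [frame_url.replace(base, prefix)
                          for prefix in _EXAMPLE_OSG_PREFIXES]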
def setup_coincidence_workflow_ligolw_thinca(workflow, segsList, timeSlideFiles, inspiral_outs, output_dir, veto_cats=[2, 3, 4], tags=[], timeSlideTags=None, parallelize_split_input=False): """ This function is used to setup a single-stage ihope style coincidence stage of the workflow using ligolw_sstinca (or compatible code!). Parameters ----------- workflow : pycbc.workflow.core.Workflow The workflow instance that the coincidence jobs will be added to. segsList : pycbc.workflow.core.FileList The list of files returned by workflow's segment module that contains pointers to all the segment files generated in the workflow. If the coincidence code will be applying the data quality vetoes, then this will be used to ensure that the codes get the necessary input to do this. timeSlideFiles : pycbc.workflow.core.FileList An FileList of the timeSlide input files that are needed to determine what time sliding needs to be done. One of the timeSlideFiles will normally be "zero-lag only", the others containing time slides used to facilitate background computations later in the workflow. inspiral_outs : pycbc.workflow.core.FileList An FileList of the matched-filter module output that is used as input to the coincidence codes running at this stage. output_dir : path The directory in which coincidence output will be stored. veto_cats : list of ints (optional, default = [2,3,4]) Veto categories that will be applied in the coincidence jobs. If this takes the default value the code will run data quality at cumulative categories 2, 3 and 4. Note that if we change the flag definitions to be non-cumulative then this option will need to be revisited. tags : list of strings (optional, default = []) A list of the tagging strings that will be used for all jobs created by this call to the workflow. An example might be ['BNSINJECTIONS'] or ['NOINJECTIONANALYSIS']. This will be used in output names. timeSlideTags : list of strings (optional, default = []) A list of the tags corresponding to the timeSlideFiles that are to be used in this call to the module. This can be used to ensure that the injection runs do no time sliding, but the no-injection runs do perform time slides (or vice-versa if you prefer!) Returns -------- ligolwThincaOuts : pycbc.workflow.core.FileList A list of the output files generated from ligolw_sstinca. ligolwAddOuts : pycbc.workflow.core.FileList A list of the output files generated from ligolw_add. """ from pylal import ligolw_cafe logging.debug("Entering coincidence module.") cp = workflow.cp ifoString = workflow.ifo_string # setup code for each veto_category coinc_outs = FileList([]) other_outs = {} if not timeSlideTags: # Get all sections by looking in ini file, use all time slide files. timeSlideTags = [(sec.split('-')[-1]).upper() for sec in workflow.cp.sections() if sec.startswith('tisi-')] if parallelize_split_input: # Want to split all input jobs according to their JOB%d tag. # This matches any string that is the letters JOB followed by some # numbers and nothing else. inspiral_outs_dict = {} regex_match = re.compile('JOB([0-9]+)\Z') for file in inspiral_outs: matches = [regex_match.match(tag) for tag in file.tags] # Remove non matching entries matches = [i for i in matches if i is not None] # Must have one entry if len(matches) == 0: warn_msg = "I was asked to parallelize over split inspiral " warn_msg += "files at the coincidence stage, but at least one " warn_msg += "input file does not have a JOB\%d tag indicating " warn_msg += "that it was split. 
Assuming that I do not have " warn_msg += "split input files and turning " warn_msg += "parallelize_split_input off." logging.warn(warn_msg) parallelize_split_input = False break if len(matches) > 1: err_msg = "One of my input files has two tags fitting JOB\%d " err_msg += "this means I cannot tell which split job this " err_msg += "file is from." raise ValueError(err_msg) # Extract the job ID id = int(matches[0].string[3:]) if not inspiral_outs_dict.has_key(id): inspiral_outs_dict[id] = FileList([]) inspiral_outs_dict[id].append(file) else: # If I got through all the files I want to sort the dictionaries so # that file with key a and index 3 is the same file as key b and # index 3 other than the tag is JOBA -> JOBB ... ie. it has used # a different part of the template bank. sort_lambda = lambda x: (x.ifo_string, x.segment, x. tagged_description) for key in inspiral_outs_dict.keys(): inspiral_outs_dict[id].sort(key=sort_lambda) # These should be in ascending order, so I can assume the existence # of a JOB0 tag inspiral_outs = inspiral_outs_dict[0] for index, file in enumerate(inspiral_outs): # Store the index in the file for quicker mapping later file.thinca_index = index else: inspiral_outs_dict = None for timeSlideTag in timeSlideTags: # Get the time slide file from the inputs tisiOutFile = timeSlideFiles.find_output_with_tag(timeSlideTag) if not len(tisiOutFile) == 1: errMsg = "If you are seeing this, something batshit is going on!" if len(tisiOutFile) == 0: errMsg = "No time slide files found matching %s." \ %(timeSlideTag) if len(tisiOutFile) > 1: errMsg = "More than one time slide files match %s." \ %(timeSlideTag) raise ValueError(errMsg) tisiOutFile = tisiOutFile[0] # Next we run ligolw_cafe. This is responsible for # identifying what times will be used for the ligolw_thinca jobs and # what files are needed for each. If doing time sliding there # will be some triggers read into multiple jobs cacheInspOuts = inspiral_outs.convert_to_lal_cache() if workflow.cp.has_option_tags("workflow-coincidence", "maximum-extent", tags): max_extent = float( workflow.cp.get_opt_tags("workflow-coincidence", "maximum-extent", tags)) else: # hard-coded default value for extent of time in a single job max_extent = 3600 logging.debug("Calling into cafe.") time_slide_table = lsctables.TimeSlideTable.get_table(\ ligolw_utils.load_filename(tisiOutFile.storage_path, gz=tisiOutFile.storage_path.endswith(".gz"), contenthandler=ContentHandler, verbose=False)) time_slide_table.sync_next_id() time_slide_dict = time_slide_table.as_dict() cafe_seglists, cafe_caches = ligolw_cafe.ligolw_cafe( cacheInspOuts, time_slide_dict.values(), extentlimit=max_extent, verbose=False) logging.debug("Done with cafe.") # Take the combined seglist file dqSegFile = segsList.find_output_with_tag( 'COMBINED_CUMULATIVE_SEGMENTS') if not len(dqSegFile) == 1: errMsg = "Did not find exactly 1 data quality file." 
print len(dqSegFile), dqSegFile raise ValueError(errMsg) dqSegFile = dqSegFile[0] # Set up llwadd job llwadd_tags = [timeSlideTag] + tags ligolwadd_job = LigolwAddExecutable(cp, 'llwadd', ifo=ifoString, out_dir=output_dir, tags=llwadd_tags) ligolwAddOuts = FileList([]) # Go global setup at each category # This flag will add a clustering job after ligolw_thinca if workflow.cp.has_option_tags("workflow-coincidence", "coincidence-post-cluster", llwadd_tags): coinc_post_cluster = True else: coinc_post_cluster = False # Go global setup at each category ligolwthinca_job = {} cluster_job = {} thinca_tags = {} for category in veto_cats: logging.debug("Preparing %s %s" % (timeSlideTag, category)) dqVetoName = 'VETO_CAT%d_CUMULATIVE' % (category) # FIXME: Should we resolve this now? # FIXME: Here we set the dqVetoName to be compatible with pipedown # For pipedown must put the slide identifier first and # dqVetoName last. pipedownDQVetoName = 'CAT_%d_VETO' % (category) curr_thinca_job_tags = [timeSlideTag] + tags + [pipedownDQVetoName] thinca_tags[category] = curr_thinca_job_tags # Set up jobs for ligolw_thinca ligolwthinca_job[category] = LigolwSSthincaExecutable( cp, 'thinca', ifo=ifoString, out_dir=output_dir, dqVetoName=dqVetoName, tags=curr_thinca_job_tags) if coinc_post_cluster: cluster_job[category] = SQLInOutExecutable( cp, 'pycbccluster', ifo=ifoString, out_dir=output_dir, tags=curr_thinca_job_tags) for idx, cafe_cache in enumerate(cafe_caches): ligolwAddOuts = FileList([]) ligolwThincaOuts = FileList([]) ligolwThincaLikelihoodOuts = FileList([]) ligolwClusterOuts = FileList([]) if not len(cafe_cache.objects): raise ValueError("One of the cache objects contains no files!") # Determine segments to accept coincidences. # If cache is not the first or last in the timeseries, check if the # two closes caches in the timeseries and see if their extent # match. If they match, they're adjacent and use the time where # they meet as a bound for accepting coincidences. If they're not # adjacent, then there is no bound for accepting coincidences. coincStart, coincEnd = None, None if idx and (cafe_cache.extent[0] == cafe_caches[idx - 1].extent[1]): coincStart = cafe_cache.extent[0] if idx + 1 - len(cafe_caches) and \ (cafe_cache.extent[1] == cafe_caches[idx+1].extent[0]): coincEnd = cafe_cache.extent[1] coincSegment = (coincStart, coincEnd) # Need to create a list of the File(s) contained in the cache. # Assume that if we have partitioned input then if *one* job in the # partitioned input is an input then *all* jobs will be. 
if not parallelize_split_input: inputTrigFiles = FileList([]) for object in cafe_cache.objects: inputTrigFiles.append(object.workflow_file) llw_files = inputTrigFiles + [dqSegFile] + [tisiOutFile] # Now we can create the nodes node = ligolwadd_job.create_node(cafe_cache.extent, llw_files) ligolwAddFile = node.output_files[0] ligolwAddOuts.append(ligolwAddFile) workflow.add_node(node) for category in veto_cats: node = ligolwthinca_job[category].create_node(\ cafe_cache.extent, coincSegment, ligolwAddFile) ligolwThincaOuts += \ node.output_files.find_output_without_tag('DIST_STATS') ligolwThincaLikelihoodOuts += \ node.output_files.find_output_with_tag('DIST_STATS') workflow.add_node(node) if coinc_post_cluster: node = cluster_job[category].create_node(\ cafe_cache.extent, ligolwThincaOuts[-1]) ligolwClusterOuts += node.output_files workflow.add_node(node) else: for key in inspiral_outs_dict.keys(): curr_tags = ["JOB%d" % (key)] curr_list = inspiral_outs_dict[key] inputTrigFiles = FileList([]) for object in cafe_cache.objects: inputTrigFiles.append( curr_list[object.workflow_file.thinca_index]) llw_files = inputTrigFiles + [dqSegFile] + [tisiOutFile] # Now we can create the nodes node = ligolwadd_job.create_node(cafe_cache.extent, llw_files, tags=curr_tags) ligolwAddFile = node.output_files[0] ligolwAddOuts.append(ligolwAddFile) workflow.add_node(node) if workflow.cp.has_option_tags( "workflow-coincidence", "coincidence-write-likelihood", curr_thinca_job_tags): write_likelihood = True else: write_likelihood = False for category in veto_cats: node = ligolwthinca_job[category].create_node(\ cafe_cache.extent, coincSegment, ligolwAddFile, tags=curr_tags, write_likelihood=write_likelihood) ligolwThincaOuts += \ node.output_files.find_output_without_tag(\ 'DIST_STATS') ligolwThincaLikelihoodOuts += \ node.output_files.find_output_with_tag(\ 'DIST_STATS') workflow.add_node(node) if coinc_post_cluster: node = cluster_job[category].create_node(\ cafe_cache.extent, ligolwThincaOuts[-1]) ligolwClusterOuts += node.output_files workflow.add_node(node) other_returns = {} other_returns['LIGOLW_ADD'] = ligolwAddOuts other_returns['DIST_STATS'] = ligolwThincaLikelihoodOuts if coinc_post_cluster: main_return = ligolwClusterOuts other_returns['THINCA'] = ligolwThincaOuts else: main_return = ligolwThincaOuts logging.debug("Done") coinc_outs.extend(main_return) for key, file_list in other_returns.items(): if other_outs.has_key(key): other_outs[key].extend(other_returns[key]) else: other_outs[key] = other_returns[key] return coinc_outs, other_outs
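# Illustrative sketch (hypothetical helper): the coincStart/coincEnd logic in
# the cafe-cache loop above only bounds coincidences where neighbouring caches
# touch exactly. The helper below reproduces that decision for plain
# (start, end) extents, outside of any workflow objects.
def _example_coinc_bounds(extents):
    bounds = []
    for idx, (start, end) in enumerate(extents):
        # A bound is only set where the neighbouring cache meets this one
        # exactly; otherwise coincidences are unbounded on that side.
        coinc_start = None
        coinc_end = None
        if idx > 0 and extents[idx - 1][1] == start:
            coinc_start = start
        if idx + 1 < len(extents) and extents[idx + 1][0] == end:
            coinc_end = end
        bounds.append((coinc_start, coinc_end))
    return bounds

# e.g. _example_coinc_bounds([(0, 100), (100, 200), (250, 300)])
#      -> [(None, 100), (100, None), (None, None)]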
def setup_multiifo_interval_coinc_inj(workflow, hdfbank, full_data_trig_files, inj_trig_files, stat_files, veto_file, veto_name, out_dir, pivot_ifo, fixed_ifo, tags=None): """ This function sets up exact match multiifo coincidence for injections """ if tags is None: tags = [] make_analysis_dir(out_dir) logging.info('Setting up coincidence for injections') if len(hdfbank) != 1: raise ValueError('Must use exactly 1 bank file for this coincidence ' 'method, I got %i !' % len(hdfbank)) hdfbank = hdfbank[0] # Wall time knob and memory knob factor = int( workflow.cp.get_opt_tags('workflow-coincidence', 'parallelization-factor', tags)) ffiles = {} ifiles = {} for ifo, ffi in zip(*full_data_trig_files.categorize_by_attr('ifo')): ffiles[ifo] = ffi[0] for ifo, ifi in zip(*inj_trig_files.categorize_by_attr('ifo')): ifiles[ifo] = ifi[0] injinj_files = FileList() injfull_files = FileList() fullinj_files = FileList() # For the injfull and fullinj separation we take the pivot_ifo on one side, # and the rest that are attached to the fixed_ifo on the other side for ifo in ifiles: # ifiles is keyed on ifo if ifo == pivot_ifo: injinj_files.append(ifiles[ifo]) injfull_files.append(ifiles[ifo]) fullinj_files.append(ffiles[ifo]) else: injinj_files.append(ifiles[ifo]) injfull_files.append(ffiles[ifo]) fullinj_files.append(ifiles[ifo]) combo = [ (injinj_files, "injinj"), (injfull_files, "injfull"), (fullinj_files, "fullinj"), ] bg_files = {'injinj': [], 'injfull': [], 'fullinj': []} for trig_files, ctag in combo: findcoinc_exe = PyCBCFindMultiifoCoincExecutable(workflow.cp, 'multiifo_coinc', ifos=ifiles.keys(), tags=tags + [ctag], out_dir=out_dir) for i in range(factor): group_str = '%s/%s' % (i, factor) coinc_node = findcoinc_exe.create_node(trig_files, hdfbank, stat_files, veto_file, veto_name, group_str, pivot_ifo, fixed_ifo, tags=[veto_name, str(i)]) bg_files[ctag] += coinc_node.output_files workflow.add_node(coinc_node) logging.info('...leaving coincidence for injections') return bg_files
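# Illustrative sketch (hypothetical helper): the loop above routes per-ifo
# trigger files into the three coincidence types. Plain dicts stand in for
# the FileList objects; the selection logic mirrors the code above.
def _example_route_inj_files(inj_by_ifo, full_by_ifo, pivot_ifo):
    # injinj  : injection triggers from every ifo
    # injfull : injection triggers from the pivot ifo, full data from the rest
    # fullinj : full-data triggers from the pivot ifo, injections from the rest
    injinj, injfull, fullinj = [], [], []
    for ifo in inj_by_ifo:
        injinj.append(inj_by_ifo[ifo])
        if ifo == pivot_ifo:
            injfull.append(inj_by_ifo[ifo])
            fullinj.append(full_by_ifo[ifo])
        else:
            injfull.append(full_by_ifo[ifo])
            fullinj.append(inj_by_ifo[ifo])
    return injinj, injfull, fullinj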
def convert_cachelist_to_filelist(datafindcache_list): """ Take as input a list of glue.lal.Cache objects and return a pycbc FileList containing all frames within those caches. Parameters ----------- datafindcache_list : list of glue.lal.Cache objects The list of cache files to convert. Returns -------- datafind_filelist : FileList of frame File objects The list of frame files. """ prev_file = None prev_name = None this_name = None datafind_filelist = FileList([]) for cache in datafindcache_list: # sort the cache into time sequential order cache.sort() curr_ifo = cache.ifo for frame in cache: # Pegasus doesn't like "localhost" in URLs. frame.url = frame.url.replace('file://localhost','file://') # Create one File() object for each unique frame file that we # get back in the cache. if prev_file: prev_name = os.path.basename(prev_file.cache_entry.url) this_name = os.path.basename(frame.url) if (prev_file is None) or (prev_name != this_name): currFile = File(curr_ifo, frame.description, frame.segment, file_url=frame.url, use_tmp_subdirs=True) datafind_filelist.append(currFile) prev_file = currFile # Populate the PFNs for the File() we just created if frame.url.startswith('file://'): currFile.add_pfn(frame.url, site='local') if frame.url.startswith( 'file:///cvmfs/oasis.opensciencegrid.org/ligo/frames'): # Datafind returned a URL valid on the osg as well # so add the additional PFNs to allow OSG access. currFile.add_pfn(frame.url, site='osg') currFile.add_pfn(frame.url.replace( 'file:///cvmfs/oasis.opensciencegrid.org/', 'root://xrootd-local.unl.edu/user/'), site='osg') currFile.add_pfn(frame.url.replace( 'file:///cvmfs/oasis.opensciencegrid.org/', 'gsiftp://red-gridftp.unl.edu/user/'), site='osg') currFile.add_pfn(frame.url.replace( 'file:///cvmfs/oasis.opensciencegrid.org/', 'gsiftp://ldas-grid.ligo.caltech.edu/hdfs/'), site='osg') elif frame.url.startswith( 'file:///cvmfs/gwosc.osgstorage.org/'): # Datafind returned a URL valid on the osg as well # so add the additional PFNs to allow OSG access. for s in ['osg', 'orangegrid', 'osgconnect']: currFile.add_pfn(frame.url, site=s) currFile.add_pfn(frame.url, site="{}-scratch".format(s)) else: currFile.add_pfn(frame.url, site='notlocal') return datafind_filelist
def setup_injection_workflow(workflow, output_dir=None, inj_section_name='injections', tags=[]): """ This function is the gateway for setting up injection-generation jobs in a workflow. It should be possible for this function to support a number of different ways/codes that could be used for doing this, however as this will presumably stay as a single call to a single code (which need not be inspinj) there are currently no subfunctions in this module. Parameters ----------- workflow : pycbc.workflow.core.Workflow The Workflow instance that the coincidence jobs will be added to. output_dir : path The directory in which injection files will be stored. inj_section_name : string (optional, default='injections') The string that corresponds to the option describing the exe location in the [executables] section of the .ini file and that corresponds to the section (and sub-sections) giving the options that will be given to the code at run time. tags : list of strings (optional, default = []) A list of the tagging strings that will be used for all jobs created by this call to the workflow. This will be used in output names. Returns -------- inj_files : pycbc.workflow.core.FileList The list of injection files created by this call. inj_tags : list of strings The tag corresponding to each injection file and used to uniquely identify them. The FileList class contains functions to search based on tags. """ logging.info("Entering injection module.") make_analysis_dir(output_dir) # Get full analysis segment for output file naming full_segment = workflow.analysis_time inj_tags = [] inj_files = FileList([]) for section in workflow.cp.get_subsections(inj_section_name): inj_tag = section.upper() curr_tags = tags + [inj_tag] # FIXME: Remove once fixed in pipedown # TEMPORARILY we require inj tags to end in "INJ" if not inj_tag.endswith("INJ"): err_msg = "Currently workflow requires injection names to end with " err_msg += "an inj suffix. Ie. bnslininj or bbhinj. " err_msg += "%s is not good." %(inj_tag.lower()) raise ValueError(err_msg) # Parse for options in ini file injection_method = workflow.cp.get_opt_tags("workflow-injections", "injections-method", curr_tags) if injection_method in ["IN_WORKFLOW", "AT_RUNTIME"]: # FIXME: Add ability to specify different exes inj_job = LalappsInspinjExecutable(workflow.cp, inj_section_name, tags=curr_tags, out_dir=output_dir, ifos='HL') node = inj_job.create_node(full_segment) if injection_method == "AT_RUNTIME": workflow.execute_node(node) else: workflow.add_node(node) inj_file = node.output_files[0] elif injection_method == "PREGENERATED": injectionFilePath = workflow.cp.get_opt_tags("workflow-injections", "injections-pregenerated-file", curr_tags) file_url = urlparse.urljoin('file:', urllib.pathname2url(injectionFilePath)) inj_file = File('HL', 'PREGEN_inj_file', full_segment, file_url, tags=curr_tags) inj_file.PFN(injectionFilePath, site='local') else: err = "Injection method must be one of IN_WORKFLOW, " err += "AT_RUNTIME or PREGENERATED. Got %s." % (injection_method) raise ValueError(err) inj_files.append(inj_file) inj_tags.append(inj_tag) logging.info("Leaving injection module.") return inj_files, inj_tags
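# Illustrative sketch (hypothetical helper): the two validations performed in
# the loop above, reduced to a standalone function. The accepted method names
# and the 'INJ' suffix requirement are taken directly from the code above.
def _example_check_injection_config(inj_tag, injection_method):
    if not inj_tag.upper().endswith('INJ'):
        raise ValueError("Injection tag %s must end in 'INJ'." % inj_tag)
    if injection_method not in ('IN_WORKFLOW', 'AT_RUNTIME', 'PREGENERATED'):
        raise ValueError("Unknown injections-method %s." % injection_method)

# e.g. _example_check_injection_config('BNSLININJ', 'PREGENERATED') passes
#      silently, while an inj_tag of 'BNSLIN' raises a ValueError.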
def create_node(self, trig_files=None, segment_dir=None, analysis_seg=None, slide_tag=None, out_tags=None, tags=None): import Pegasus.DAX3 as dax if out_tags is None: out_tags = [] if tags is None: tags = [] node = Node(self) if not trig_files: raise ValueError("%s must be supplied with trigger files" % self.name) # Data options num_trials = int(self.cp.get("trig_combiner", "num-trials")) trig_name = self.cp.get('workflow', 'trigger-name') if all("COHERENT_NO_INJECTIONS" in t.name for t in trig_files) and \ self.cp.has_option_tag('inspiral', 'do-short-slides', 'coherent_no_injections'): node.add_opt('--short-slides') node.add_opt('--grb-name', trig_name) node.add_opt('--trig-start-time', analysis_seg[0]) node.add_opt('--ifo-tag', self.ifos) node.add_opt('--user-tag', 'INSPIRAL') if tags: node.add_opt('--job-tag', '_'.join(tags)) if slide_tag is not None: node.add_opt('--slide-tag', slide_tag) node.add_opt('--long-slides') tag_start=["TIMESLIDES_GRB%s_%s" % (trig_name, slide_tag)]+tags else: tag_start=["GRB%s" % trig_name]+tags # Set input / output options if all(hasattr(t.node, "executable") for t in trig_files): if all(t.node.executable.name == "trig_cluster" for t in trig_files): node.add_opt('--input-files', " ".join([t.storage_path for t in trig_files])) if self.cp.has_option_tag('inspiral', 'do-short-slides', 'coherent_no_injections'): node.add_opt('--short-slides') else: node.add_input_list_opt('--input-files', trig_files) else: node.add_opt('--input-files', " ".join([t.storage_path for t in trig_files])) node.add_opt('--segment-dir', segment_dir) node.add_opt('--output-dir', self.out_dir) out_files = FileList([]) for out_tag in out_tags: out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment, directory=self.out_dir, extension='xml.gz', tags=tag_start+[out_tag], store_file=self.retain_files) out_files.append(out_file) #node._dax_node.uses(out_file, link=dax.Link.OUTPUT, register=False, # transfer=False) #node._outputs += [out_file] #out_file.node = node #node._add_output(out_file) for trial in range(1, num_trials + 1): out_file = File(self.ifos, 'INSPIRAL', trig_files[0].segment, directory=self.out_dir, extension='xml.gz', tags=tag_start+["OFFTRIAL_%d" % trial], store_file=self.retain_files) out_files.append(out_file) #node._dax_node.uses(out_file, link=dax.Link.OUTPUT, register=False, # transfer=False) #node._outputs += [out_file] #out_file.node = node #node._add_output(out_file) node.add_profile('condor', 'request_cpus', self.num_threads) return node, out_files
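# Illustrative sketch (hypothetical helper): the node above declares one
# output File per requested out_tag plus one per off-source trial, all sharing
# a common tag prefix (long-slide naming included). The helper below
# reproduces just those tag lists, without constructing any workflow objects.
def _example_trig_combiner_tags(trig_name, out_tags, num_trials,
                                slide_tag=None, extra_tags=()):
    if slide_tag is not None:
        prefix = ['TIMESLIDES_GRB%s_%s' % (trig_name, slide_tag)] + list(extra_tags)
    else:
        prefix = ['GRB%s' % trig_name] + list(extra_tags)
    tag_sets = [prefix + [out_tag] for out_tag in out_tags]
    tag_sets += [prefix + ['OFFTRIAL_%d' % trial]
                 for trial in range(1, num_trials + 1)]
    return tag_sets

# e.g. _example_trig_combiner_tags('170817A', ['ONSOURCE'], 2)
#      -> [['GRB170817A', 'ONSOURCE'],
#          ['GRB170817A', 'OFFTRIAL_1'], ['GRB170817A', 'OFFTRIAL_2']]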
def setup_postprocprep_gstlal_workflow(workflow, coinc_files, output_dir, tags=[], injection_files=None, veto_files=None, inj_less_tag=None, injection_tags=[], veto_cat=None, summary_xml_files=None, likelihood_files=[]): """ Parameters ----------- workflow : workflow.Workflow The workflow instance that the coincidence jobs will be added to. coinc_files : workflow.FileList An FileList of the coincident trigger files that are used as input at this stage. output_dir : path The directory in which output files will be stored. tags : list of strings (optional, default = []) A list of the tagging strings that will be used for all jobs created by this call to the workflow. An example might be ['POSTPROC1'] or ['DENTYSNEWPOSTPROC']. This will be used in output names. injection_files : workflow.FileList (optional, default=None) The injection files to be used in this stage. An empty list (or any other input that evaluates as false) is valid and will imply that no injections are being done. veto_files : workflow.FileList (required) The data quality files to be used in this stage. This is required and will be used to determine the analysed times when doing post-processing. inj_less_tag : string (required) The tag that identifies files that do not have simulations in them. Ie. the primary search results. injection_tags : list of strings (optional, default = []) Each injection file has a unique tag. If used in the method, this tells the post-processing preparation code which injection tags it should include when creating the combined output. veto_cat : int (optional, default = None) FIXME: How does gstlal deal with veto categories? Hardcode to CAT1 for now. summary_xml_files : workflow.FileList An FileList of the output of the analysislogging_utils module. Here, this will be one file that includes the segments analysed by the workflow. Returns -------- finalFiles : workflow.FileList A list of the single SQL database storing the clustered, injection found, triggers for all injections, time slid and zero lag analyses. initialSqlFiles : workflow.FileList The SQL files before clustering is applied and injection finding performed. clusteredSqlFiles : workflow.FileList The clustered SQL files before injection finding performed. combinedSqlFiles : workflow.FileList A combined file containing all triggers after clustering, including the injection and veto tables, but before injection finding performed. Probably there is no need to ever keep this file and it will be a temporary file in most cases. """ # Sanity checks if not len(summary_xml_files) == 1: errMsg = "I need exactly one summaryXML file, got %d." 
\ %(len(summary_xml_files),) raise ValueError(errMsg) # Setup needed exe classes run_sqlite_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-runsqlite-exe", tags) ligolw_sqlite_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-ligolwsqlite-exe", tags) inspinjfind_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-inspinjfind-exe", tags) sql_to_xml_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-sqltoxml-exe", tags) pycbc_picklehor_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-picklehor-exe", tags) pycbc_combllhood_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-combllhood-exe", tags) pycbc_genranking_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-genranking-exe", tags) pycbc_compllhood_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-compllhood-exe", tags) marg_likelihood_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-marglikelihood-exe", tags) far_gstlal_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-fargstlal-exe", tags) plot_summary_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-plotsummary-exe", tags) plot_sensitivity_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-plotsensitivity-exe", tags) plot_background_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-plotbackground-exe", tags) summary_page_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-summarypage-exe", tags) run_sqlite_exe = select_generic_executable(workflow, run_sqlite_exe_name) ligolw_sqlite_exe = select_generic_executable(workflow, ligolw_sqlite_exe_name) inspinjfind_exe = select_generic_executable(workflow, inspinjfind_exe_name) sql_to_xml_exe = select_generic_executable(workflow, sql_to_xml_exe_name) pycbc_picklehor_exe = select_generic_executable(workflow, pycbc_picklehor_exe_name) pycbc_combllhood_exe = select_generic_executable(workflow, pycbc_combllhood_exe_name) pycbc_genranking_exe = select_generic_executable(workflow, pycbc_genranking_exe_name) pycbc_compllhood_exe = select_generic_executable(workflow, pycbc_compllhood_exe_name) marg_likelihood_exe = select_generic_executable(workflow, marg_likelihood_exe_name) far_gstlal_exe = select_generic_executable(workflow, far_gstlal_exe_name) plot_summary_exe = select_generic_executable(workflow, plot_summary_exe_name) plot_sensitivity_exe = select_generic_executable(workflow, plot_sensitivity_exe_name) plot_background_exe = select_generic_executable(workflow, plot_background_exe_name) summary_page_exe = select_generic_executable(workflow, summary_page_exe_name) # SETUP # FIXME: Some hacking is still needed while we support pipedown # FIXME: How does gstlal deal with veto categories? # Hardcode to CAT1 for now. veto_tag = 'CUMULATIVE_CAT_%d' %(veto_cat,) dq_seg_file = veto_files.find_output_with_tag(veto_tag) assert len(dq_seg_file) == 1 dq_seg_file = dq_seg_file[0] #if not len(dqSegFile) == 1: # errMsg = "Did not find exactly 1 data quality file." 
# raise ValueError(errMsg) # FIXME: Here we set the dqVetoName to be compatible with pipedown pipedown_dq_veto_name = 'CAT_%d_VETO' %(veto_cat,) # First we need to covert to SQL, this is STAGE0 # Do for all injection runs and zero lag stage0_outputs = {} for inj_tag in [inj_less_tag] + injection_tags: curr_tags = tags + [inj_tag, veto_tag] trig_veto_inp_files = \ coinc_files.find_output_with_tag(pipedown_dq_veto_name) trig_inp_files = trig_veto_inp_files.find_output_with_tag(inj_tag) stage0_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE0'] + curr_tags) stage0_outputs[inj_tag] = FileList([]) assert len(trig_inp_files) > 0 for file in trig_inp_files: stage0_node = stage0_job.create_node(file.segment, [file]) workflow.add_node(stage0_node) # Node has only one output file stage0_out = stage0_node.output_files[0] stage0_outputs[inj_tag].append(stage0_out) curr_tags = tags + [veto_tag] # NOW WE DO LIKELIHOOD SETUP pycbc_picklehor_job = pycbc_picklehor_exe(workflow.cp, pycbc_picklehor_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) pycbc_combllhood_job = pycbc_combllhood_exe(workflow.cp, pycbc_combllhood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) pycbc_genranking_job = pycbc_genranking_exe(workflow.cp, pycbc_genranking_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) marg_likelihood_job_1 = marg_likelihood_exe(workflow.cp, marg_likelihood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['MARG1']+curr_tags) marg_likelihood_job_2 = marg_likelihood_exe(workflow.cp, marg_likelihood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['MARG2']+curr_tags) # Begin with finding the horizon distances picklehor_inputs = stage0_outputs[inj_less_tag] node = pycbc_picklehor_job.create_node(workflow.analysis_time, picklehor_inputs) workflow.add_node(node) horizon_dist_file = node.output_files[0] # Then combine all likelihood files combllhood_inputs = likelihood_files.find_output_with_tag(\ pipedown_dq_veto_name) combllhood_inputs = combllhood_inputs.find_output_with_tag(inj_less_tag) assert len(combllhood_inputs) > 0 node = pycbc_combllhood_job.create_node(workflow.analysis_time, combllhood_inputs, horizon_dist_file) workflow.add_node(node) likelihood_file = node.output_files[0] # Also compute the ranking file node = pycbc_genranking_job.create_node(workflow.analysis_time, likelihood_file, horizon_dist_file) workflow.add_node(node) ranking_likelihood_file = node.output_files[0] # And marginalize (twice for some reason!) node = marg_likelihood_job_1.create_node(workflow.analysis_time, ranking_likelihood_file) workflow.add_node(node) marg_likelihood_file_1 = node.output_files[0] node = marg_likelihood_job_2.create_node(workflow.analysis_time, marg_likelihood_file_1) workflow.add_node(node) marg_likelihood_file_2 = node.output_files[0] # Now do the sqlite conditioning. This has a few stages. 
# STAGE 1: Populate likelihood in all input files # STAGE 2: Run run_sqlite on all outputs of stage 1 # STAGE 3: Combine all files into one sqlite file # STAGE 4: Run run_sqlite on outputs of stage 3 # STAGE 5: Add segments.xml and inj.xml # STAGE 6: Run run_sqlite (cluster an simplify) on outputs of stage 5 # STAGE 7: Dump SQL database to xml # STAGE 8: Run injfind on the xml document # STAGE 9: Convert back to SQL stage1_outputs = {} stage2_outputs = {} stage3_outputs = {} stage4_outputs = {} stage5_outputs = {} stage6_outputs = {} stage7_outputs = {} stage8_outputs = {} stage9_outputs = {} final_outputs = FileList([]) # Do for all injection runs and zero lag for inj_tag in [inj_less_tag] + injection_tags: curr_tags = tags + [inj_tag, veto_tag] trig_inp_files = stage0_outputs[inj_tag] stage1_job = pycbc_compllhood_exe(workflow.cp, pycbc_compllhood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE1']+curr_tags) stage2_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE2'] + curr_tags) stage3_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE3'] + curr_tags) stage4_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE4'] + curr_tags) stage5_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE5'] + curr_tags) if inj_tag == inj_less_tag: # For zero-lag we stop here, so use the FINAL tag to indicate this stage6_zl_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['FINAL'] + curr_tags) else: stage6_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE6'] + curr_tags) stage7_job = sql_to_xml_exe(workflow.cp, sql_to_xml_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE7'] + curr_tags) stage8_job = inspinjfind_exe(workflow.cp, inspinjfind_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE8'] + curr_tags) stage9_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['FINAL'] + curr_tags) stage1_outputs[inj_tag] = FileList([]) stage2_outputs[inj_tag] = FileList([]) assert len(trig_inp_files) > 0 for file in trig_inp_files: stage1_node = stage1_job.create_node(file.segment, file, likelihood_file, horizon_dist_file) workflow.add_node(stage1_node) # Node has only one output file stage1_out = stage1_node.output_files[0] stage1_outputs[inj_tag].append(stage1_out) stage2_node = stage2_job.create_node(stage1_out.segment, stage1_out) workflow.add_node(stage2_node) # Node has only one output file stage2_out = stage2_node.output_files[0] stage2_outputs[inj_tag].append(stage2_out) stage3_node = stage3_job.create_node(workflow.analysis_time, stage2_outputs[inj_tag], workflow=workflow) workflow.add_node(stage3_node) # Node has only one output file stage3_out = stage3_node.output_files[0] stage3_outputs[inj_tag] = stage3_out stage4_node = stage4_job.create_node(workflow.analysis_time, stage3_out) workflow.add_node(stage4_node) # Node has only one output file stage4_out = stage4_node.output_files[0] stage4_outputs[inj_tag] = stage4_out stage5_inputs = [stage4_out] stage5_inputs.append(summary_xml_files[0]) stage5_inputs.append(dq_seg_file) if inj_tag != inj_less_tag: inj_file = injection_files.find_output_with_tag(inj_tag) assert 
            stage5_inputs.append(inj_file[0])
        stage5_node = stage5_job.create_node(workflow.analysis_time,
                                             stage5_inputs)
        workflow.add_node(stage5_node)
        # Node has only one output file
        stage5_out = stage5_node.output_files[0]
        stage5_outputs[inj_tag] = stage5_out

        if inj_tag == inj_less_tag:
            stage6_node = stage6_zl_job.create_node(workflow.analysis_time,
                                                    stage5_out)
            workflow.add_node(stage6_node)
            stage6_out = stage6_node.output_files[0]
            stage6_outputs[inj_tag] = stage6_out
            final_outputs.append(stage6_out)
        else:
            stage6_node = stage6_job.create_node(workflow.analysis_time,
                                                 stage5_out)
            workflow.add_node(stage6_node)
            stage6_out = stage6_node.output_files[0]
            stage6_outputs[inj_tag] = stage6_out
            stage7_node = stage7_job.create_node(workflow.analysis_time,
                                                 stage6_out)
            workflow.add_node(stage7_node)
            stage7_out = stage7_node.output_files[0]
            stage7_outputs[inj_tag] = stage7_out
            stage8_node = stage8_job.create_node(workflow.analysis_time,
                                                 stage7_out)
            workflow.add_node(stage8_node)
            stage8_out = stage8_node.output_files[0]
            stage8_outputs[inj_tag] = stage8_out
            stage9_node = stage9_job.create_node(workflow.analysis_time,
                                                 [stage8_out])
            workflow.add_node(stage9_node)
            stage9_out = stage9_node.output_files[0]
            stage9_outputs[inj_tag] = stage9_out
            final_outputs.append(stage9_out)

    # Next we run the job that computes the FAR from the snr_chisq histograms
    far_gstlal_outputs = {}
    for inj_tag in [inj_less_tag] + injection_tags:
        curr_tags = tags + [inj_tag, veto_tag]
        far_gstlal_job = far_gstlal_exe(workflow.cp, far_gstlal_exe_name,
                                        ifo=workflow.ifo_string,
                                        out_dir=output_dir, tags=curr_tags)
        trig_veto_inp_files = final_outputs.find_output_with_tag(veto_tag)
        trig_inp_files = trig_veto_inp_files.find_output_with_tag(inj_tag)
        assert len(trig_inp_files) == 1
        input_database = trig_inp_files[0]
        if inj_tag != inj_less_tag:
            no_inj_db = trig_veto_inp_files.find_output_with_tag(inj_less_tag)
            assert len(no_inj_db) == 1
            no_inj_db = no_inj_db[0]
            write_background = False
        else:
            # Here I don't want to provide the same file as a dependency
            # twice. Therefore I just give the non-injection DB and the code
            # assumes this is also the input-database if it is not given.
            # Also, I only want the background file once.
            no_inj_db = input_database
            input_database = None
            write_background = True

        far_gstlal_node = far_gstlal_job.create_node(workflow.analysis_time,
                              no_inj_db, marg_likelihood_file_2,
                              inj_database=input_database,
                              write_background_bins=write_background)
        workflow.add_node(far_gstlal_node)
        outputs = far_gstlal_node.output_files
        if inj_tag != inj_less_tag:
            assert len(outputs) == 1
            far_gstlal_outputs[inj_tag] = outputs[0]
        else:
            assert len(outputs) == 2
            sql_out = outputs.find_output_without_tag('POSTMARG')[0]
            xml_out = outputs.find_output_with_tag('POSTMARG')[0]
            far_gstlal_outputs[inj_tag] = sql_out
            post_marginalized_file = xml_out

    # Finally some plotting.
    # FIXME: These are given explicit output directories and pegasus does not
    #        know about output files. Would be nice if this was done "better"
    curr_tags = tags + [veto_tag]
    plot_summary_job = plot_summary_exe(workflow.cp, plot_summary_exe_name,
                                        ifo=workflow.ifo_string,
                                        out_dir=output_dir, tags=curr_tags)
    plot_sensitivity_job = plot_sensitivity_exe(workflow.cp,
                                                plot_sensitivity_exe_name,
                                                ifo=workflow.ifo_string,
                                                out_dir=output_dir,
                                                tags=curr_tags)
    plot_background_job = plot_background_exe(workflow.cp,
                                              plot_background_exe_name,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=curr_tags)
    inj_dbs = []
    for inj_tag in injection_tags:
        inj_dbs.append(far_gstlal_outputs[inj_tag])
    non_inj_db = far_gstlal_outputs[inj_less_tag]

    plot_summary_node = plot_summary_job.create_node(non_inj_db, inj_dbs)
    plot_background_node = plot_background_job.create_node(non_inj_db,
                                                      post_marginalized_file)
    plot_sensitivity_node = plot_sensitivity_job.create_node(non_inj_db,
                                                             inj_dbs)
    workflow.add_node(plot_summary_node)
    workflow.add_node(plot_background_node)
    workflow.add_node(plot_sensitivity_node)

    # And make the html pages
    parents = [plot_summary_node, plot_background_node,
               plot_sensitivity_node]
    closed_summarypage_job = summary_page_exe(workflow.cp,
                                              summary_page_exe_name,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=['CLOSEDBOX'] + curr_tags)
    open_summarypage_job = summary_page_exe(workflow.cp,
                                            summary_page_exe_name,
                                            ifo=workflow.ifo_string,
                                            out_dir=output_dir,
                                            tags=['OPENBOX'] + curr_tags)
    closed_summarypage_node = closed_summarypage_job.create_and_add_node(
                                                           workflow, parents)
    open_summarypage_node = open_summarypage_job.create_and_add_node(workflow,
                                                                     parents)

    # FIXME: Maybe concatenate and return all other outputs if needed
    #        elsewhere
    # FIXME: Move to pp utils and return the FAR files.
    return final_outputs
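
# Illustrative sketch only (nothing in this module executes it): a caller that
# receives the FileList returned above can split the zero-lag database from
# the injection ones with the same find_output_with_tag() idiom used in the
# FAR loop, e.g.
#
#   zero_lag_db = final_dbs.find_output_with_tag(inj_less_tag)[0]
#   injection_dbs = FileList([final_dbs.find_output_with_tag(tag)[0]
#                             for tag in injection_tags])
#
# where final_dbs is the returned FileList and inj_less_tag / injection_tags
# are the same tag values supplied to this function; whether a caller filters
# the outputs this way is an assumption, not a requirement of this module.
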
def setup_timeslides_workflow(workflow, output_dir=None, tags=None,
                              timeSlideSectionName='ligolw_tisi'):
    '''
    Setup generation of time_slide input files in the workflow.
    Currently used only with ligolw_tisi to generate files containing the
    list of slides to be performed in each time slide job.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    output_dir : path
        The directory in which output files will be stored.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. This will be used in output names.
    timeSlideSectionName : string (optional, default='ligolw_tisi')
        The string that corresponds to the option describing the exe location
        in the [executables] section of the .ini file and that corresponds to
        the section (and sub-sections) giving the options that will be given
        to the code at run time.

    Returns
    -------
    timeSlideOuts : pycbc.workflow.core.FileList
        The list of time slide files created by this call.
    '''
    if tags is None:
        tags = []
    logging.info("Entering time slides setup module.")
    make_analysis_dir(output_dir)
    # Get ifo list and full analysis segment for output file naming
    ifoList = workflow.ifos
    ifo_string = workflow.ifo_string
    fullSegment = workflow.analysis_time

    # Identify which time-slides to do by presence of sub-sections in the
    # configuration file
    all_sec = workflow.cp.sections()
    timeSlideSections = [sec for sec in all_sec if sec.startswith('tisi-')]
    timeSlideTags = [(sec.split('-')[-1]).upper()
                     for sec in timeSlideSections]

    timeSlideOuts = FileList([])

    # FIXME: Add ability to specify different exes

    # Make the timeSlideFiles
    for timeSlideTag in timeSlideTags:
        currTags = tags + [timeSlideTag]

        timeSlideMethod = workflow.cp.get_opt_tags("workflow-timeslides",
                                                   "timeslides-method",
                                                   currTags)

        if timeSlideMethod in ["IN_WORKFLOW", "AT_RUNTIME"]:
            timeSlideExeTag = workflow.cp.get_opt_tags("workflow-timeslides",
                                                       "timeslides-exe",
                                                       currTags)
            timeSlideExe = select_generic_executable(workflow,
                                                     timeSlideExeTag)
            timeSlideJob = timeSlideExe(workflow.cp, timeSlideExeTag,
                                        ifos=ifo_string, tags=currTags,
                                        out_dir=output_dir)
            timeSlideNode = timeSlideJob.create_node(fullSegment)
            if timeSlideMethod == "AT_RUNTIME":
                workflow.execute_node(timeSlideNode)
            else:
                workflow.add_node(timeSlideNode)
            tisiOutFile = timeSlideNode.output_files[0]
        elif timeSlideMethod == "PREGENERATED":
            timeSlideFilePath = workflow.cp.get_opt_tags(
                "workflow-timeslides", "timeslides-pregenerated-file",
                currTags)
            file_url = urlparse.urljoin('file:',
                                    urllib.pathname2url(timeSlideFilePath))
            tisiOutFile = File(ifo_string, 'PREGEN_TIMESLIDES', fullSegment,
                               file_url, tags=currTags)

        timeSlideOuts.append(tisiOutFile)

    return timeSlideOuts
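
# Illustrative configuration sketch for setup_timeslides_workflow. The section
# and option names below are the ones queried above via get_opt_tags; the
# "zerolag" tag, the executable path and the [executables] entry are made-up
# examples, not requirements of this module:
#
#   [executables]
#   tisi = /path/to/ligolw_tisi
#
#   [workflow-timeslides]
#   timeslides-method = IN_WORKFLOW
#   timeslides-exe = tisi
#
#   [tisi-zerolag]
#   ; the presence of a [tisi-*] sub-section selects a ZEROLAG-tagged job
#
# With timeslides-method = PREGENERATED, the code instead expects the
# timeslides-pregenerated-file option to point at an existing time-slide file.
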
def setup_tmpltbank_pregenerated(workflow, tags=None):
    '''
    Setup CBC workflow to use a pregenerated template bank.
    The bank given by the tmpltbank-pregenerated-bank option in the
    [workflow-tmpltbank] section will be used as the input file for all
    matched-filtering jobs. If this option is present, workflow will assume
    that it should be used and not generate template banks within the
    workflow.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    tags : list of strings
        If given these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    tmplt_banks : pycbc.workflow.core.FileList
        The FileList holding the details of the template bank.
    '''
    if tags is None:
        tags = []
    # Currently this uses the *same* fixed bank for all ifos.
    # Maybe we want to add capability to analyse separate banks in all ifos?

    # Set up class for holding the banks
    tmplt_banks = FileList([])

    cp = workflow.cp
    global_seg = workflow.analysis_time
    user_tag = "PREGEN_TMPLTBANK"
    try:
        # First check if we have a bank for all ifos
        pre_gen_bank = cp.get_opt_tags('workflow-tmpltbank',
                                       'tmpltbank-pregenerated-bank', tags)
        pre_gen_bank = resolve_url(pre_gen_bank)
        file_url = urlparse.urljoin('file:',
                                    urllib.pathname2url(pre_gen_bank))
        curr_file = File(workflow.ifos, user_tag, global_seg, file_url,
                         tags=tags)
        curr_file.PFN(file_url, site='local')
        tmplt_banks.append(curr_file)
    except ConfigParser.Error:
        # Okay then I must have banks for each ifo
        for ifo in workflow.ifos:
            try:
                pre_gen_bank = cp.get_opt_tags('workflow-tmpltbank',
                                'tmpltbank-pregenerated-bank-%s'
                                % ifo.lower(), tags)
                pre_gen_bank = resolve_url(pre_gen_bank)
                file_url = urlparse.urljoin('file:',
                                        urllib.pathname2url(pre_gen_bank))
                curr_file = File(ifo, user_tag, global_seg, file_url,
                                 tags=tags)
                curr_file.PFN(file_url, site='local')
                tmplt_banks.append(curr_file)
            except ConfigParser.Error:
                err_msg = "Cannot find pregenerated template bank in section "
                err_msg += "[workflow-tmpltbank] or any tagged sections. "
                if tags:
                    tagged_secs = " ".join("[workflow-tmpltbank-%s]"
                                           % (tag,) for tag in tags)
                    err_msg += "Tagged sections are %s. " % (tagged_secs,)
                err_msg += "I looked for 'tmpltbank-pregenerated-bank' option "
                err_msg += "and 'tmpltbank-pregenerated-bank-%s'." % (ifo,)
                raise ConfigParser.Error(err_msg)

    return tmplt_banks
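
# Illustrative configuration sketch for setup_tmpltbank_pregenerated. The
# option names are the ones read above with get_opt_tags; the file paths and
# ifo names are placeholders:
#
#   [workflow-tmpltbank]
#   ; a single bank shared by all ifos ...
#   tmpltbank-pregenerated-bank = /path/to/H1L1-TMPLTBANK.xml.gz
#
#   ; ... or, alternatively, one bank per ifo
#   ; tmpltbank-pregenerated-bank-h1 = /path/to/H1-TMPLTBANK.xml.gz
#   ; tmpltbank-pregenerated-bank-l1 = /path/to/L1-TMPLTBANK.xml.gz
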