def setup_postprocprep_pipedown_workflow(workflow, coincFiles, output_dir,
                                         tags=[], do_repop=False,
                                         injectionFiles=None, vetoFiles=None,
                                         injLessTag=None, injectionTags=[],
                                         veto_cats=[]):
    """
    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    coincFiles : pycbc.workflow.core.FileList
        A FileList of the coincident trigger files that are used as input at
        this stage.
    output_dir : path
        The directory in which output files will be stored.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. An example might be ['POSTPROC1'] or
        ['DENTYSNEWPOSTPROC']. This will be used in output names.
    do_repop : Boolean
        If False, use the 'coinc_inspiral.snr' column from the coincident
        trigger files as the clustering and ranking statistic; if True, run a
        repop_coinc job before clustering to calculate a different ranking
        statistic and store it in the coinc_inspiral table for later use.
    injectionFiles : pycbc.workflow.core.FileList (optional, default=None)
        The injection files to be used in this stage. An empty list (or any
        other input that evaluates as false) is valid and will imply that no
        injections are being done.
    vetoFiles : pycbc.workflow.core.FileList (required)
        The data quality files to be used in this stage. This is required and
        will be used to determine the analysed times when doing
        post-processing.
    injLessTag : string (required)
        The tag that identifies files that do not have simulations in them,
        i.e. the primary search results.
    injectionTags : list of strings (optional, default = [])
        Each injection file has a unique tag. If used in the method, this
        tells the post-processing preparation code which injection tags it
        should include when creating the combined output.
    veto_cats : list of integers (optional, default = [])
        Decides which set of veto files should be used in the post-processing
        preparation. For example, tell the workflow to only generate results
        at cumulative categories 2, 3 and 4 by supplying [2,3,4] here.

    Returns
    -------
    finalFiles : pycbc.workflow.core.FileList
        A list of the single SQL database storing the clustered,
        injection-found triggers for all injections, time-slid and zero-lag
        analyses.
    initialSqlFiles : pycbc.workflow.core.FileList
        The SQL files before clustering is applied and injection finding is
        performed.
    clusteredSqlFiles : pycbc.workflow.core.FileList
        The clustered SQL files before injection finding is performed.
    combinedSqlFiles : pycbc.workflow.core.FileList
        A combined file containing all triggers after clustering, including
        the injection and veto tables, but before injection finding is
        performed. There is probably no need to ever keep this file and it
        will be a temporary file in most cases.
""" if not veto_cats: raise ValueError("A non-empty list of veto categories is required.") # Setup needed exe classes sqliteCombine1ExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-combiner1-exe", tags) sqliteCombine1Exe = select_generic_executable(workflow, sqliteCombine1ExeTag) sqliteCombine2ExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-combiner2-exe", tags) sqliteCombine2Exe = select_generic_executable(workflow, sqliteCombine2ExeTag) clusterCoincsExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-cluster-exe", tags) clusterCoincsExe = select_generic_executable(workflow, clusterCoincsExeTag) injFindExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-injfind-exe", tags) injFindExe = select_generic_executable(workflow, injFindExeTag) sqliteCombine1Outs = FileList([]) clusterCoincsOuts = FileList([]) injFindOuts = FileList([]) sqliteCombine2Outs = FileList([]) if do_repop: repopCoincExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-repop-exe", tags) repopCoincExe = select_generic_executable(workflow, repopCoincExeTag) repopCoincOuts = FileList([]) for cat in veto_cats: # FIXME: Some hacking is still needed while we support pipedown # FIXME: There are currently 3 names to say cumulative cat_3 vetoTag = 'CUMULATIVE_CAT_%d' %(cat) dqSegFile = vetoFiles.find_output_with_tag(vetoTag) if not len(dqSegFile) == 1: errMsg = "Did not find exactly 1 data quality file." raise ValueError(errMsg) # Don't think this is used here, this is the tag *in* the file dqVetoName = 'VETO_CAT%d_CUMULATIVE' %(cat) # FIXME: Here we set the dqVetoName to be compatible with pipedown pipedownDQVetoName = 'CAT_%d_VETO' %(cat) sqliteCombine2Inputs = FileList([]) # Do injection-less jobs first. # Choose a label for clustering the jobs job_label = get_random_label() # Combine trig files first currTags = tags + [injLessTag, vetoTag] trigVetoInpFiles = coincFiles.find_output_with_tag(pipedownDQVetoName) trigInpFiles = trigVetoInpFiles.find_output_with_tag(injLessTag) if len(trigInpFiles) == 0: err_msg = "No input files found. Workflow would fail." 
            raise ValueError(err_msg)
        trigInpFiles.append(dqSegFile[0])

        sqliteCombine1Job = sqliteCombine1Exe(workflow.cp,
                                              sqliteCombine1ExeTag,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=currTags)
        sqliteCombine1Node = sqliteCombine1Job.create_node(
            workflow.analysis_time, trigInpFiles, workflow=workflow)
        sqliteCombine1Node.add_profile('pegasus', 'label', job_label)
        workflow.add_node(sqliteCombine1Node)
        # Node has only one output file
        sqliteCombine1Out = sqliteCombine1Node.output_files[0]
        sqliteCombine1Outs.append(sqliteCombine1Out)

        if do_repop:
            repopCoincJob = repopCoincExe(workflow.cp, repopCoincExeTag,
                                          ifo=workflow.ifo_string,
                                          out_dir=output_dir, tags=currTags)
            repopCoincNode = repopCoincJob.create_node(workflow.analysis_time,
                                                       sqliteCombine1Out)
            repopCoincNode.add_profile('pegasus', 'label', job_label)
            workflow.add_node(repopCoincNode)
            # Node has only one output file
            repopCoincOut = repopCoincNode.output_files[0]
            repopCoincOuts.append(repopCoincOut)

        # Input file plumbing allowing for a possible repop_coinc job
        clusterCoincsIn = repopCoincOut if do_repop else sqliteCombine1Out

        # Cluster coincidences
        clusterCoincsJob = clusterCoincsExe(workflow.cp, clusterCoincsExeTag,
                                            ifo=workflow.ifo_string,
                                            out_dir=output_dir, tags=currTags)
        clusterCoincsNode = clusterCoincsJob.create_node(
            workflow.analysis_time, clusterCoincsIn)
        clusterCoincsNode.add_profile('pegasus', 'label', job_label)
        workflow.add_node(clusterCoincsNode)
        # Node has only one output file
        clusterCoincsOut = clusterCoincsNode.output_files[0]
        clusterCoincsOuts.append(clusterCoincsOut)
        sqliteCombine2Inputs.append(clusterCoincsOut)

        # Do injection jobs
        for injTag in injectionTags:
            # Choose a label for clustering the jobs
            job_label = get_random_label()

            # Combine trig files first
            currTags = tags + [injTag, vetoTag]
            trigInpFiles = trigVetoInpFiles.find_output_with_tag(injTag)
            trigInpFiles.append(dqSegFile[0])
            injFile = injectionFiles.find_output_with_tag(injTag)
            assert len(injFile) == 1

            sqliteCombine1Job = sqliteCombine1Exe(workflow.cp,
                                                  sqliteCombine1ExeTag,
                                                  ifo=workflow.ifo_string,
                                                  out_dir=output_dir,
                                                  tags=currTags)
            sqliteCombine1Node = sqliteCombine1Job.create_node(
                workflow.analysis_time, trigInpFiles, injFile=injFile[0],
                injString=injTag, workflow=workflow)
            sqliteCombine1Node.add_profile('pegasus', 'label', job_label)
            workflow.add_node(sqliteCombine1Node)
            # Node has only one output file
            sqliteCombine1Out = sqliteCombine1Node.output_files[0]
            sqliteCombine1Outs.append(sqliteCombine1Out)

            if do_repop:
                repopCoincJob = repopCoincExe(workflow.cp, repopCoincExeTag,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=currTags)
                repopCoincNode = repopCoincJob.create_node(
                    workflow.analysis_time, sqliteCombine1Out)
                repopCoincNode.add_profile('pegasus', 'label', job_label)
                workflow.add_node(repopCoincNode)
                # Node has only one output file
                repopCoincOut = repopCoincNode.output_files[0]
                repopCoincOuts.append(repopCoincOut)

            # Input file plumbing allowing for a possible repop_coinc job
            clusterCoincsIn = repopCoincOut if do_repop else sqliteCombine1Out

            # Cluster coincidences
            clusterCoincsJob = clusterCoincsExe(workflow.cp,
                                                clusterCoincsExeTag,
                                                ifo=workflow.ifo_string,
                                                out_dir=output_dir,
                                                tags=currTags)
            clusterCoincsNode = clusterCoincsJob.create_node(
                workflow.analysis_time, clusterCoincsIn)
            clusterCoincsNode.add_profile('pegasus', 'label', job_label)
            workflow.add_node(clusterCoincsNode)
            # Node has only one output file
            clusterCoincsOut = clusterCoincsNode.output_files[0]
            clusterCoincsOuts.append(clusterCoincsOut)
            sqliteCombine2Inputs.append(clusterCoincsOut)
        # Choose a new label for pegasus-clustering the jobs
        job_label = get_random_label()

        # Combine everything together and add veto file
        currTags = tags + [vetoTag]
        sqliteCombine2Job = sqliteCombine2Exe(workflow.cp,
                                              sqliteCombine2ExeTag,
                                              ifo=workflow.ifo_string,
                                              out_dir=output_dir,
                                              tags=currTags)
        sqliteCombine2Node = sqliteCombine2Job.create_node(
            workflow.analysis_time, sqliteCombine2Inputs)
        sqliteCombine2Node.add_profile('pegasus', 'label', job_label)
        workflow.add_node(sqliteCombine2Node)
        sqliteCombine2Out = sqliteCombine2Node.output_files[0]
        sqliteCombine2Outs.append(sqliteCombine2Out)

        # Injection finding
        injFindJob = injFindExe(workflow.cp, injFindExeTag,
                                ifo=workflow.ifo_string,
                                out_dir=output_dir, tags=currTags)
        injFindNode = injFindJob.create_node(workflow.analysis_time,
                                             sqliteCombine2Out)
        injFindNode.add_profile('pegasus', 'label', job_label)
        workflow.add_node(injFindNode)
        injFindOut = injFindNode.output_files[0]
        injFindOuts.append(injFindOut)

    return injFindOuts, sqliteCombine1Outs, clusterCoincsOuts, \
           sqliteCombine2Outs
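
# Illustrative sketch only, not part of the original module: one way the
# preparation stage above might be driven from a workflow-generation script.
# The 'wflow', 'coinc_files', 'inj_files' and 'veto_files' objects, the
# injection tags and the output directory name are hypothetical placeholders
# for values a real script would build from its configuration files.
def _example_postprocprep_usage(wflow, coinc_files, inj_files, veto_files):
    # Prepare results at cumulative veto categories 2, 3 and 4, combining the
    # injection-less search ('FULL_DATA') with two tagged injection runs.
    final_files, initial_sql, clustered_sql, combined_sql = \
        setup_postprocprep_pipedown_workflow(wflow, coinc_files,
                                             'postprocprep',
                                             injectionFiles=inj_files,
                                             vetoFiles=veto_files,
                                             injLessTag='FULL_DATA',
                                             injectionTags=['BNSINJ',
                                                            'BBHINJ'],
                                             veto_cats=[2, 3, 4])
    return final_files
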
def setup_postproc_pipedown_workflow(workflow, trigger_files,
                                     summary_xml_files, output_dir,
                                     tags=[], veto_cats=[]):
    """
    This function sets up the post-processing stage in the workflow, using a
    pipedown-style setup. This consists of running compute_durations to
    determine and store the analysis time (foreground and background). It
    then runs cfar jobs to determine the false alarm rate for all triggers
    (simulations or otherwise) in the input database.

    Pipedown expects to take as input (at this stage) a single database
    containing all triggers. This sub-module follows that same idea, so there
    must be exactly one input database in trigger_files for every DQ category
    that we will run.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    trigger_files : pycbc.workflow.core.FileList
        A FileList containing the combined databases at CAT_1,2,3... that
        will be used to calculate FARs.
    summary_xml_files : pycbc.workflow.core.FileList (required)
        A FileList of the output of the analysislogging_utils module. For
        pipedown-style post-processing this should be one file containing a
        segment table holding the single detector analysed times.
    output_dir : path
        The directory in which output files will be stored.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. An example might be ['POSTPROC1'] or
        ['DENTYSNEWPOSTPROC']. This will be used in output names.
    veto_cats : list of integers (default = [], non-empty list required)
        Decides which veto category levels should be used in post-processing.
        For example, tell the workflow to only generate results at cumulative
        categories 2, 3 and 4 by supplying [2,3,4] here.

    Returns
    -------
    final_files : pycbc.workflow.core.FileList
        A list of the final SQL databases containing computed FARs.
    """
    if not veto_cats:
        raise ValueError("A non-empty list of veto categories is required.")
    if not len(summary_xml_files) == 1:
        errMsg = "I need exactly one summaryXML file, got %d." \
                 % (len(summary_xml_files),)
        raise ValueError(errMsg)

    # Setup needed exe classes
    compute_durations_exe_tag = workflow.cp.get_opt_tags(
        "workflow-postproc", "postproc-computedurations-exe", tags)
    compute_durations_exe = select_generic_executable(
        workflow, compute_durations_exe_tag)
    cfar_exe_tag = workflow.cp.get_opt_tags("workflow-postproc",
                                            "postproc-cfar-exe", tags)
    cfar_exe = select_generic_executable(workflow, cfar_exe_tag)

    comp_durations_outs = FileList([])
    cfar_outs = FileList([])

    for cat in veto_cats:
        veto_tag = 'CUMULATIVE_CAT_%d' % (cat)
        trig_input_files = trigger_files.find_output_with_tag(veto_tag)
        if not len(trig_input_files) == 1:
            err_msg = "Did not find exactly 1 database input file."
            raise ValueError(err_msg)

        curr_tags = tags + [veto_tag]

        # Choose a label for clustering the jobs
        job_label = get_random_label()

        # Start with compute durations
        computeDurationsJob = compute_durations_exe(
            workflow.cp, compute_durations_exe_tag, ifo=workflow.ifo_string,
            out_dir=output_dir, tags=curr_tags)
        compute_durations_node = computeDurationsJob.create_node(
            workflow.analysis_time, trig_input_files[0], summary_xml_files[0])
        compute_durations_node.add_profile('pegasus', 'label', job_label)
        workflow.add_node(compute_durations_node)

        # Node has only one output file
        compute_durations_out = compute_durations_node.output_files[0]
        comp_durations_outs.append(compute_durations_out)

        # Add the calculate FAR (cfar) job
        cfar_job = cfar_exe(workflow.cp, cfar_exe_tag,
                            ifo=workflow.ifo_string,
                            out_dir=output_dir, tags=curr_tags)
        cfar_node = cfar_job.create_node(workflow.analysis_time,
                                         compute_durations_out)
        cfar_node.add_profile('pegasus', 'label', job_label)
        workflow.add_node(cfar_node)

        # Node has only one output file
        cfar_out = cfar_node.output_files[0]
        cfar_outs.append(cfar_out)

    return cfar_outs
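
# Illustrative sketch only, not part of the original module: feeding the
# combined databases produced by the preparation stage above into this FAR
# stage. 'wflow', 'post_prep_files' and 'summ_xml_files' are hypothetical
# placeholders; 'post_prep_files' would be the first FileList returned by
# setup_postprocprep_pipedown_workflow and 'summ_xml_files' the single-entry
# FileList produced by the analysislogging_utils module.
def _example_postproc_usage(wflow, post_prep_files, summ_xml_files):
    # Compute analysis durations and false alarm rates at each cumulative
    # veto category prepared earlier; returns one FAR database per category.
    far_dbs = setup_postproc_pipedown_workflow(wflow, post_prep_files,
                                               summ_xml_files, 'postproc',
                                               veto_cats=[2, 3, 4])
    return far_dbs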