def make_gating_node(workflow, datafind_files, outdir=None, tags=None):
    '''
    Generate jobs for autogating the data for PyGRB runs.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        An instanced class that manages the constructed workflow.
    datafind_files : pycbc.workflow.core.FileList
        A FileList containing the frame files to be gated.
    outdir : string
        Path of the output directory.
    tags : list of strings
        If given, these tags are used to uniquely name and identify output
        files that would be produced in multiple calls to this function.

    Returns
    -------
    condition_strain_nodes : list
        List containing the pycbc.workflow.core.Node objects representing
        the autogating jobs.
    condition_strain_outs : pycbc.workflow.core.FileList
        FileList containing the pycbc.workflow.core.File objects representing
        the gated frame files.
    '''
    cp = workflow.cp
    if tags is None:
        tags = []
    condition_strain_class = select_generic_executable(workflow,
                                                       "condition_strain")
    condition_strain_nodes = []
    condition_strain_outs = FileList([])
    for ifo in workflow.ifos:
        input_files = FileList([datafind_file for datafind_file in
                                datafind_files if datafind_file.ifo == ifo])
        condition_strain_jobs = condition_strain_class(cp, "condition_strain",
                                                       ifo=ifo, out_dir=outdir,
                                                       tags=tags)
        condition_strain_node, condition_strain_out = \
            condition_strain_jobs.create_node(input_files, tags=tags)
        condition_strain_nodes.append(condition_strain_node)
        condition_strain_outs.extend(FileList([condition_strain_out]))

    return condition_strain_nodes, condition_strain_outs
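
# Illustrative sketch (not part of the original module; names such as wflow
# and the "gating" directory are assumed): how make_gating_node might be
# called from a PyGRB workflow generator once datafind has produced a
# FileList of frame files.
#
#     gate_nodes, gated_frames = make_gating_node(wflow, datafind_files,
#                                                 outdir="gating",
#                                                 tags=["PYGRB"])
#     for node in gate_nodes:
#         wflow.add_node(node)
#
# The nodes are not added to the workflow inside make_gating_node, so the
# caller is responsible for adding (or executing) them and for passing the
# gated frame FileList on to the matched-filter stage.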
def setup_postprocprep_gstlal_workflow(workflow, coinc_files, output_dir, tags=[], injection_files=None, veto_files=None, inj_less_tag=None, injection_tags=[], veto_cat=None, summary_xml_files=None, likelihood_files=[]): """ Parameters ----------- workflow : workflow.Workflow The workflow instance that the coincidence jobs will be added to. coinc_files : workflow.FileList An FileList of the coincident trigger files that are used as input at this stage. output_dir : path The directory in which output files will be stored. tags : list of strings (optional, default = []) A list of the tagging strings that will be used for all jobs created by this call to the workflow. An example might be ['POSTPROC1'] or ['DENTYSNEWPOSTPROC']. This will be used in output names. injection_files : workflow.FileList (optional, default=None) The injection files to be used in this stage. An empty list (or any other input that evaluates as false) is valid and will imply that no injections are being done. veto_files : workflow.FileList (required) The data quality files to be used in this stage. This is required and will be used to determine the analysed times when doing post-processing. inj_less_tag : string (required) The tag that identifies files that do not have simulations in them. Ie. the primary search results. injection_tags : list of strings (optional, default = []) Each injection file has a unique tag. If used in the method, this tells the post-processing preparation code which injection tags it should include when creating the combined output. veto_cat : int (optional, default = None) FIXME: How does gstlal deal with veto categories? Hardcode to CAT1 for now. summary_xml_files : workflow.FileList An FileList of the output of the analysislogging_utils module. Here, this will be one file that includes the segments analysed by the workflow. Returns -------- finalFiles : workflow.FileList A list of the single SQL database storing the clustered, injection found, triggers for all injections, time slid and zero lag analyses. initialSqlFiles : workflow.FileList The SQL files before clustering is applied and injection finding performed. clusteredSqlFiles : workflow.FileList The clustered SQL files before injection finding performed. combinedSqlFiles : workflow.FileList A combined file containing all triggers after clustering, including the injection and veto tables, but before injection finding performed. Probably there is no need to ever keep this file and it will be a temporary file in most cases. """ # Sanity checks if not len(summary_xml_files) == 1: errMsg = "I need exactly one summaryXML file, got %d." 
\ %(len(summary_xml_files),) raise ValueError(errMsg) # Setup needed exe classes run_sqlite_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-runsqlite-exe", tags) ligolw_sqlite_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-ligolwsqlite-exe", tags) inspinjfind_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-inspinjfind-exe", tags) sql_to_xml_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-sqltoxml-exe", tags) pycbc_picklehor_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-picklehor-exe", tags) pycbc_combllhood_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-combllhood-exe", tags) pycbc_genranking_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-genranking-exe", tags) pycbc_compllhood_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-compllhood-exe", tags) marg_likelihood_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-marglikelihood-exe", tags) far_gstlal_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-fargstlal-exe", tags) plot_summary_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-plotsummary-exe", tags) plot_sensitivity_exe_name=workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-plotsensitivity-exe", tags) plot_background_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-plotbackground-exe", tags) summary_page_exe_name = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-summarypage-exe", tags) run_sqlite_exe = select_generic_executable(workflow, run_sqlite_exe_name) ligolw_sqlite_exe = select_generic_executable(workflow, ligolw_sqlite_exe_name) inspinjfind_exe = select_generic_executable(workflow, inspinjfind_exe_name) sql_to_xml_exe = select_generic_executable(workflow, sql_to_xml_exe_name) pycbc_picklehor_exe = select_generic_executable(workflow, pycbc_picklehor_exe_name) pycbc_combllhood_exe = select_generic_executable(workflow, pycbc_combllhood_exe_name) pycbc_genranking_exe = select_generic_executable(workflow, pycbc_genranking_exe_name) pycbc_compllhood_exe = select_generic_executable(workflow, pycbc_compllhood_exe_name) marg_likelihood_exe = select_generic_executable(workflow, marg_likelihood_exe_name) far_gstlal_exe = select_generic_executable(workflow, far_gstlal_exe_name) plot_summary_exe = select_generic_executable(workflow, plot_summary_exe_name) plot_sensitivity_exe = select_generic_executable(workflow, plot_sensitivity_exe_name) plot_background_exe = select_generic_executable(workflow, plot_background_exe_name) summary_page_exe = select_generic_executable(workflow, summary_page_exe_name) # SETUP # FIXME: Some hacking is still needed while we support pipedown # FIXME: How does gstlal deal with veto categories? # Hardcode to CAT1 for now. veto_tag = 'CUMULATIVE_CAT_%d' %(veto_cat,) dq_seg_file = veto_files.find_output_with_tag(veto_tag) assert len(dq_seg_file) == 1 dq_seg_file = dq_seg_file[0] #if not len(dqSegFile) == 1: # errMsg = "Did not find exactly 1 data quality file." 
# raise ValueError(errMsg) # FIXME: Here we set the dqVetoName to be compatible with pipedown pipedown_dq_veto_name = 'CAT_%d_VETO' %(veto_cat,) # First we need to covert to SQL, this is STAGE0 # Do for all injection runs and zero lag stage0_outputs = {} for inj_tag in [inj_less_tag] + injection_tags: curr_tags = tags + [inj_tag, veto_tag] trig_veto_inp_files = \ coinc_files.find_output_with_tag(pipedown_dq_veto_name) trig_inp_files = trig_veto_inp_files.find_output_with_tag(inj_tag) stage0_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE0'] + curr_tags) stage0_outputs[inj_tag] = FileList([]) assert len(trig_inp_files) > 0 for file in trig_inp_files: stage0_node = stage0_job.create_node(file.segment, [file]) workflow.add_node(stage0_node) # Node has only one output file stage0_out = stage0_node.output_files[0] stage0_outputs[inj_tag].append(stage0_out) curr_tags = tags + [veto_tag] # NOW WE DO LIKELIHOOD SETUP pycbc_picklehor_job = pycbc_picklehor_exe(workflow.cp, pycbc_picklehor_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) pycbc_combllhood_job = pycbc_combllhood_exe(workflow.cp, pycbc_combllhood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) pycbc_genranking_job = pycbc_genranking_exe(workflow.cp, pycbc_genranking_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) marg_likelihood_job_1 = marg_likelihood_exe(workflow.cp, marg_likelihood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['MARG1']+curr_tags) marg_likelihood_job_2 = marg_likelihood_exe(workflow.cp, marg_likelihood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['MARG2']+curr_tags) # Begin with finding the horizon distances picklehor_inputs = stage0_outputs[inj_less_tag] node = pycbc_picklehor_job.create_node(workflow.analysis_time, picklehor_inputs) workflow.add_node(node) horizon_dist_file = node.output_files[0] # Then combine all likelihood files combllhood_inputs = likelihood_files.find_output_with_tag(\ pipedown_dq_veto_name) combllhood_inputs = combllhood_inputs.find_output_with_tag(inj_less_tag) assert len(combllhood_inputs) > 0 node = pycbc_combllhood_job.create_node(workflow.analysis_time, combllhood_inputs, horizon_dist_file) workflow.add_node(node) likelihood_file = node.output_files[0] # Also compute the ranking file node = pycbc_genranking_job.create_node(workflow.analysis_time, likelihood_file, horizon_dist_file) workflow.add_node(node) ranking_likelihood_file = node.output_files[0] # And marginalize (twice for some reason!) node = marg_likelihood_job_1.create_node(workflow.analysis_time, ranking_likelihood_file) workflow.add_node(node) marg_likelihood_file_1 = node.output_files[0] node = marg_likelihood_job_2.create_node(workflow.analysis_time, marg_likelihood_file_1) workflow.add_node(node) marg_likelihood_file_2 = node.output_files[0] # Now do the sqlite conditioning. This has a few stages. 
# STAGE 1: Populate likelihood in all input files # STAGE 2: Run run_sqlite on all outputs of stage 1 # STAGE 3: Combine all files into one sqlite file # STAGE 4: Run run_sqlite on outputs of stage 3 # STAGE 5: Add segments.xml and inj.xml # STAGE 6: Run run_sqlite (cluster an simplify) on outputs of stage 5 # STAGE 7: Dump SQL database to xml # STAGE 8: Run injfind on the xml document # STAGE 9: Convert back to SQL stage1_outputs = {} stage2_outputs = {} stage3_outputs = {} stage4_outputs = {} stage5_outputs = {} stage6_outputs = {} stage7_outputs = {} stage8_outputs = {} stage9_outputs = {} final_outputs = FileList([]) # Do for all injection runs and zero lag for inj_tag in [inj_less_tag] + injection_tags: curr_tags = tags + [inj_tag, veto_tag] trig_inp_files = stage0_outputs[inj_tag] stage1_job = pycbc_compllhood_exe(workflow.cp, pycbc_compllhood_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE1']+curr_tags) stage2_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE2'] + curr_tags) stage3_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE3'] + curr_tags) stage4_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE4'] + curr_tags) stage5_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE5'] + curr_tags) if inj_tag == inj_less_tag: # For zero-lag we stop here, so use the FINAL tag to indicate this stage6_zl_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['FINAL'] + curr_tags) else: stage6_job = run_sqlite_exe(workflow.cp, run_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE6'] + curr_tags) stage7_job = sql_to_xml_exe(workflow.cp, sql_to_xml_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE7'] + curr_tags) stage8_job = inspinjfind_exe(workflow.cp, inspinjfind_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['STAGE8'] + curr_tags) stage9_job = ligolw_sqlite_exe(workflow.cp, ligolw_sqlite_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['FINAL'] + curr_tags) stage1_outputs[inj_tag] = FileList([]) stage2_outputs[inj_tag] = FileList([]) assert len(trig_inp_files) > 0 for file in trig_inp_files: stage1_node = stage1_job.create_node(file.segment, file, likelihood_file, horizon_dist_file) workflow.add_node(stage1_node) # Node has only one output file stage1_out = stage1_node.output_files[0] stage1_outputs[inj_tag].append(stage1_out) stage2_node = stage2_job.create_node(stage1_out.segment, stage1_out) workflow.add_node(stage2_node) # Node has only one output file stage2_out = stage2_node.output_files[0] stage2_outputs[inj_tag].append(stage2_out) stage3_node = stage3_job.create_node(workflow.analysis_time, stage2_outputs[inj_tag], workflow=workflow) workflow.add_node(stage3_node) # Node has only one output file stage3_out = stage3_node.output_files[0] stage3_outputs[inj_tag] = stage3_out stage4_node = stage4_job.create_node(workflow.analysis_time, stage3_out) workflow.add_node(stage4_node) # Node has only one output file stage4_out = stage4_node.output_files[0] stage4_outputs[inj_tag] = stage4_out stage5_inputs = [stage4_out] stage5_inputs.append(summary_xml_files[0]) stage5_inputs.append(dq_seg_file) if inj_tag != inj_less_tag: inj_file = injection_files.find_output_with_tag(inj_tag) assert 
(len(inj_file) == 1) stage5_inputs.append(inj_file[0]) stage5_node = stage5_job.create_node(workflow.analysis_time, stage5_inputs) workflow.add_node(stage5_node) # Node has only one output file stage5_out = stage5_node.output_files[0] stage5_outputs[inj_tag] = stage5_out if inj_tag == inj_less_tag: stage6_node = stage6_zl_job.create_node(workflow.analysis_time, stage5_out) workflow.add_node(stage6_node) stage6_out = stage6_node.output_files[0] stage6_outputs[inj_tag] = stage6_out final_outputs.append(stage6_out) else: stage6_node = stage6_job.create_node(workflow.analysis_time, stage5_out) workflow.add_node(stage6_node) stage6_out = stage6_node.output_files[0] stage6_outputs[inj_tag] = stage6_out stage7_node = stage7_job.create_node(workflow.analysis_time, stage6_out) workflow.add_node(stage7_node) stage7_out = stage7_node.output_files[0] stage7_outputs[inj_tag] = stage7_out stage8_node = stage8_job.create_node(workflow.analysis_time, stage7_out) workflow.add_node(stage8_node) stage8_out = stage8_node.output_files[0] stage8_outputs[inj_tag] = stage8_out stage9_node = stage9_job.create_node(workflow.analysis_time, [stage8_out]) workflow.add_node(stage9_node) stage9_out = stage9_node.output_files[0] stage9_outputs[inj_tag] = stage9_out final_outputs.append(stage9_out) # Next we run the compute FAR from snr_chisq histograms job far_gstlal_outputs = {} for inj_tag in [inj_less_tag] + injection_tags: curr_tags = tags + [inj_tag, veto_tag] far_gstlal_job = far_gstlal_exe(workflow.cp, far_gstlal_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) trig_veto_inp_files = \ final_outputs.find_output_with_tag(veto_tag) trig_inp_files = trig_veto_inp_files.find_output_with_tag(inj_tag) assert len(trig_inp_files) == 1 input_database = trig_inp_files[0] if inj_tag != inj_less_tag: no_inj_db = trig_veto_inp_files.find_output_with_tag(inj_less_tag) assert len(no_inj_db) == 1 no_inj_db = no_inj_db[0] write_background = False else: # Here I don't want to provide the same file as a dependancy # twice. Therefore I just give non-injection DB and the code # assumes this is also the input-database if it is not given. # Also, I only want the background file once no_inj_db = input_database input_database = None write_background = True far_gstlal_node = far_gstlal_job.create_node(workflow.analysis_time, no_inj_db, marg_likelihood_file_2, inj_database=input_database, write_background_bins=write_background) workflow.add_node(far_gstlal_node) outputs = far_gstlal_node.output_files if inj_tag != inj_less_tag: assert len(outputs) == 1 far_gstlal_outputs[inj_tag] = outputs[0] else: assert len(outputs) == 2 sql_out = outputs.find_output_without_tag('POSTMARG')[0] xml_out = outputs.find_output_with_tag('POSTMARG')[0] far_gstlal_outputs[inj_tag] = sql_out post_marginalized_file = xml_out # Finally some plotting. # FIXME: These are given explicit output directories and pegasus does not # know about output files. 
Would be nice if this was done "better" curr_tags = tags + [veto_tag] plot_summary_job = plot_summary_exe(workflow.cp, plot_summary_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) plot_sensitivity_job = plot_sensitivity_exe(workflow.cp, plot_sensitivity_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) plot_background_job = plot_background_exe(workflow.cp, plot_background_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) inj_dbs = [] for inj_tag in injection_tags: inj_dbs.append(far_gstlal_outputs[inj_tag]) non_inj_db = far_gstlal_outputs[inj_less_tag] plot_summary_node = plot_summary_job.create_node(non_inj_db, inj_dbs) plot_background_node = plot_background_job.create_node(non_inj_db, post_marginalized_file) plot_sensitivity_node = plot_sensitivity_job.create_node(non_inj_db, inj_dbs) workflow.add_node(plot_summary_node) workflow.add_node(plot_background_node) workflow.add_node(plot_sensitivity_node) # And make the html pages parents = [plot_summary_node, plot_background_node, plot_sensitivity_node] closed_summarypage_job = summary_page_exe(workflow.cp, summary_page_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['CLOSEDBOX'] + curr_tags) open_summarypage_job = summary_page_exe(workflow.cp, summary_page_exe_name, ifo=workflow.ifo_string, out_dir=output_dir, tags=['OPENBOX'] + curr_tags) closed_summarypage_node = closed_summarypage_job.create_and_add_node(\ workflow, parents) open_summarypage_node = open_summarypage_job.create_and_add_node(workflow, parents) # FIXME: Maybe contatenate and return all other outputs if needed elsewhere # FIXME: Move to pp utils and return the FAR files. return final_outputs
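
# Illustrative sketch (assumed names, not part of the original module): a
# typical invocation of setup_postprocprep_gstlal_workflow. The coincidence,
# veto, injection, likelihood and summary-XML FileLists are assumed to have
# been produced by earlier modules of the same workflow.
#
#     final_files = setup_postprocprep_gstlal_workflow(
#         wflow, coinc_files, "postprocprep",
#         tags=["POSTPROC1"],
#         injection_files=inj_files,
#         veto_files=veto_files,
#         inj_less_tag="FULL_DATA",
#         injection_tags=inj_tags,
#         veto_cat=1,
#         summary_xml_files=summ_xml_files,
#         likelihood_files=likelihood_files)
#
# Note that, despite the four FileLists described in the docstring, the
# function currently returns only the FINAL-tagged databases (see the FIXME
# above about moving the FAR files into pp utils).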
def setup_postprocprep_pipedown_workflow(workflow, coincFiles, output_dir,
                                         tags=[], do_repop=False,
                                         injectionFiles=None, vetoFiles=None,
                                         injLessTag=None, injectionTags=[],
                                         veto_cats=[]):
    """
    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    coincFiles : pycbc.workflow.core.FileList
        A FileList of the coincident trigger files that are used as input at
        this stage.
    output_dir : path
        The directory in which output files will be stored.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. An example might be ['POSTPROC1'] or
        ['DENTYSNEWPOSTPROC']. This will be used in output names.
    do_repop : Boolean
        If False, use the 'coinc_inspiral.snr' column from the coincident
        trigger files as the clustering and ranking statistic; if True, run a
        repop_coinc job before clustering to calculate a different ranking
        statistic and store it in the coinc_inspiral table for later use.
    injectionFiles : pycbc.workflow.core.FileList (optional, default=None)
        The injection files to be used in this stage. An empty list (or any
        other input that evaluates as false) is valid and will imply that no
        injections are being done.
    vetoFiles : pycbc.workflow.core.FileList (required)
        The data quality files to be used in this stage. This is required and
        will be used to determine the analysed times when doing
        post-processing.
    injLessTag : string (required)
        The tag that identifies files that do not have simulations in them,
        i.e. the primary search results.
    injectionTags : list of strings (optional, default = [])
        Each injection file has a unique tag. If used in the method, this
        tells the post-processing preparation code which injection tags it
        should include when creating the combined output.
    veto_cats : list of integers (optional, default = [])
        Decide which set of veto files should be used in the post-processing
        preparation. For example, tell the workflow to only generate results
        at cumulative categories 2, 3 and 4 by supplying [2, 3, 4] here.

    Returns
    -------
    finalFiles : pycbc.workflow.core.FileList
        A list of the single SQL database storing the clustered, injection
        found, triggers for all injections, time slid and zero lag analyses.
    initialSqlFiles : pycbc.workflow.core.FileList
        The SQL files before clustering is applied and injection finding
        performed.
    clusteredSqlFiles : pycbc.workflow.core.FileList
        The clustered SQL files before injection finding is performed.
    combinedSqlFiles : pycbc.workflow.core.FileList
        A combined file containing all triggers after clustering, including
        the injection and veto tables, but before injection finding is
        performed. Probably there is no need to ever keep this file and it
        will be a temporary file in most cases.
""" if not veto_cats: raise ValueError("A non-empty list of veto categories is required.") # Setup needed exe classes sqliteCombine1ExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-combiner1-exe", tags) sqliteCombine1Exe = select_generic_executable(workflow, sqliteCombine1ExeTag) sqliteCombine2ExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-combiner2-exe", tags) sqliteCombine2Exe = select_generic_executable(workflow, sqliteCombine2ExeTag) clusterCoincsExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-cluster-exe", tags) clusterCoincsExe = select_generic_executable(workflow, clusterCoincsExeTag) injFindExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-injfind-exe", tags) injFindExe = select_generic_executable(workflow, injFindExeTag) sqliteCombine1Outs = FileList([]) clusterCoincsOuts = FileList([]) injFindOuts = FileList([]) sqliteCombine2Outs = FileList([]) if do_repop: repopCoincExeTag = workflow.cp.get_opt_tags("workflow-postprocprep", "postprocprep-repop-exe", tags) repopCoincExe = select_generic_executable(workflow, repopCoincExeTag) repopCoincOuts = FileList([]) for cat in veto_cats: # FIXME: Some hacking is still needed while we support pipedown # FIXME: There are currently 3 names to say cumulative cat_3 vetoTag = 'CUMULATIVE_CAT_%d' %(cat) dqSegFile = vetoFiles.find_output_with_tag(vetoTag) if not len(dqSegFile) == 1: errMsg = "Did not find exactly 1 data quality file." raise ValueError(errMsg) # Don't think this is used here, this is the tag *in* the file dqVetoName = 'VETO_CAT%d_CUMULATIVE' %(cat) # FIXME: Here we set the dqVetoName to be compatible with pipedown pipedownDQVetoName = 'CAT_%d_VETO' %(cat) sqliteCombine2Inputs = FileList([]) # Do injection-less jobs first. # Choose a label for clustering the jobs job_label = get_random_label() # Combine trig files first currTags = tags + [injLessTag, vetoTag] trigVetoInpFiles = coincFiles.find_output_with_tag(pipedownDQVetoName) trigInpFiles = trigVetoInpFiles.find_output_with_tag(injLessTag) if len(trigInpFiles) == 0: err_msg = "No input files found. Workflow would fail." 
raise ValueError(err_msg) trigInpFiles.append(dqSegFile[0]) sqliteCombine1Job = sqliteCombine1Exe(workflow.cp, sqliteCombine1ExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) sqliteCombine1Node = sqliteCombine1Job.create_node( workflow.analysis_time, trigInpFiles, workflow=workflow) sqliteCombine1Node.add_profile('pegasus', 'label', job_label) workflow.add_node(sqliteCombine1Node) # Node has only one output file sqliteCombine1Out = sqliteCombine1Node.output_files[0] sqliteCombine1Outs.append(sqliteCombine1Out) if do_repop: repopCoincJob = repopCoincExe(workflow.cp, repopCoincExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) repopCoincNode = repopCoincJob.create_node(workflow.analysis_time, sqliteCombine1Out) repopCoincNode.add_profile('pegasus', 'label', job_label) workflow.add_node(repopCoincNode) # Node has only one output file repopCoincOut = repopCoincNode.output_files[0] repopCoincOuts.append(repopCoincOut) # Input file plumbing allowing for possible repop_coinc job clusterCoincsIn = repopCoincOut if do_repop else sqliteCombine1Out # Cluster coincidences clusterCoincsJob = clusterCoincsExe(workflow.cp, clusterCoincsExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) clusterCoincsNode = clusterCoincsJob.create_node( workflow.analysis_time, clusterCoincsIn) clusterCoincsNode.add_profile('pegasus', 'label', job_label) workflow.add_node(clusterCoincsNode) # Node has only one output file clusterCoincsOut = clusterCoincsNode.output_files[0] clusterCoincsOuts.append(clusterCoincsOut) sqliteCombine2Inputs.append(clusterCoincsOut) # Do injection jobs for injTag in injectionTags: # Choose a label for clustering the jobs job_label = get_random_label() # Combine trig files first currTags = tags + [injTag, vetoTag] trigInpFiles = trigVetoInpFiles.find_output_with_tag(injTag) trigInpFiles.append(dqSegFile[0]) injFile = injectionFiles.find_output_with_tag(injTag) assert (len(injFile) == 1) sqliteCombine1Job = sqliteCombine1Exe(workflow.cp, sqliteCombine1ExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) sqliteCombine1Node = sqliteCombine1Job.create_node( workflow.analysis_time, trigInpFiles, injFile=injFile[0], injString=injTag, workflow=workflow) sqliteCombine1Node.add_profile('pegasus', 'label', job_label) workflow.add_node(sqliteCombine1Node) # Node has only one output file sqliteCombine1Out = sqliteCombine1Node.output_files[0] sqliteCombine1Outs.append(sqliteCombine1Out) if do_repop: repopCoincJob = repopCoincExe(workflow.cp, repopCoincExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) repopCoincNode = repopCoincJob.create_node( workflow.analysis_time, sqliteCombine1Out) repopCoincNode.add_profile('pegasus', 'label', job_label) workflow.add_node(repopCoincNode) # Node has only one output file repopCoincOut = repopCoincNode.output_files[0] repopCoincOuts.append(repopCoincOut) # Input file plumbing allowing for possible repop_coinc job clusterCoincsIn = repopCoincOut if do_repop else sqliteCombine1Out # Cluster coincidences clusterCoincsJob = clusterCoincsExe(workflow.cp, clusterCoincsExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) clusterCoincsNode = clusterCoincsJob.create_node( workflow.analysis_time, clusterCoincsIn) clusterCoincsNode.add_profile('pegasus', 'label', job_label) workflow.add_node(clusterCoincsNode) # Node has only one output file clusterCoincsOut = clusterCoincsNode.output_files[0] clusterCoincsOuts.append(clusterCoincsOut) sqliteCombine2Inputs.append(clusterCoincsOut) # 
Choose a new label for pegasus-clustering the jobs job_label = get_random_label() # Combine everything together and add veto file currTags = tags + [vetoTag] sqliteCombine2Job = sqliteCombine2Exe(workflow.cp, sqliteCombine2ExeTag, ifo=workflow.ifo_string, out_dir=output_dir, tags=currTags) sqliteCombine2Node = sqliteCombine2Job.create_node( workflow.analysis_time, sqliteCombine2Inputs) sqliteCombine2Node.add_profile('pegasus', 'label', job_label) workflow.add_node(sqliteCombine2Node) sqliteCombine2Out = sqliteCombine2Node.output_files[0] sqliteCombine2Outs.append(sqliteCombine2Out) # Inj finding injFindJob = injFindExe(workflow.cp, injFindExeTag, ifo=workflow.ifo_string, out_dir=output_dir,tags=currTags) injFindNode = injFindJob.create_node(workflow.analysis_time, sqliteCombine2Out) injFindNode.add_profile('pegasus', 'label', job_label) workflow.add_node(injFindNode) injFindOut = injFindNode.output_files[0] injFindOuts.append(injFindOut) return injFindOuts, sqliteCombine1Outs, clusterCoincsOuts,\ sqliteCombine2Outs
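
# Illustrative sketch (assumed names, not part of the original module):
# calling the pipedown-style post-processing preparation for cumulative veto
# categories 2-4 and a set of injection runs. The inputs are assumed to come
# from the coincidence, segment and injection modules of the same workflow.
#
#     inj_find_outs, initial_sql, clustered_sql, combined_sql = \
#         setup_postprocprep_pipedown_workflow(
#             wflow, coinc_files, "postprocprep",
#             tags=["PIPEDOWN"],
#             do_repop=False,
#             injectionFiles=inj_files,
#             vetoFiles=veto_files,
#             injLessTag="FULL_DATA",
#             injectionTags=inj_tags,
#             veto_cats=[2, 3, 4])
#
# The injection-finding databases are returned first, followed by the
# intermediate combined, clustered and veto-added SQL file lists.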
def setup_postproc_coh_PTF_workflow(workflow, trig_files, trig_cache, inj_trig_files, inj_files, inj_trig_caches, inj_caches, config_file, output_dir, html_dir, segment_dir, ifos, inj_tags=[], tags=[]): """ This module sets up the post-processing stage in the workflow, using a coh_PTF style set up. This consists of running trig_combiner to find coherent triggers, and injfinder to look for injections. It then runs a horizon_dist job, trig_cluster to cluster triggers, and injcombiner to calculate injection statistics. Finally, efficiency and sbv_plotter jobs calculate efficiency and signal based veto statistics and make plots. workflow : pycbc.workflow.core.Workflow The Workflow instance that the jobs will be added to. trig_files : pycbc.workflow.core.FileList A FileList containing the combined databases. Returns -------- """ cp = workflow.cp full_segment = trig_files[0].segment trig_name = cp.get("workflow", "trigger-name") grb_string = "GRB" + trig_name num_trials = int(cp.get("trig_combiner", "num-trials")) pp_outs = FileList([]) pp_nodes = [] # Set up needed exe classes trig_combiner_exe = os.path.basename(cp.get("executables", "trig_combiner")) trig_combiner_class = select_generic_executable(workflow, "trig_combiner") trig_cluster_exe = os.path.basename(cp.get("executables", "trig_cluster")) trig_cluster_class = select_generic_executable(workflow, "trig_cluster") sbv_plotter_exe = os.path.basename(cp.get("executables", "sbv_plotter")) sbv_plotter_class = select_generic_executable(workflow, "sbv_plotter") efficiency_exe = os.path.basename(cp.get("executables", "efficiency")) efficiency_class = select_generic_executable(workflow, "efficiency") """ horizon_dist_exe = os.path.basename(cp.get("executables", "horizon_dist")) horizon_dist_class = select_generic_executable(workflow, "horizon_dist") """ html_summary_exe = os.path.basename(cp.get("executables", "html_summary")) html_summary_class = select_generic_executable(workflow, "html_summary") # Set up trig_combiner job trig_combiner_out_tags = ["OFFSOURCE", "ONSOURCE", "ALL_TIMES"] trig_combiner_jobs = trig_combiner_class(cp, "trig_combiner", ifo=ifos, out_dir=output_dir, tags=tags) trig_combiner_node, trig_combiner_outs = trig_combiner_jobs.create_node(\ trig_files, segment_dir, out_tags=trig_combiner_out_tags, tags=tags) pp_nodes.append(trig_combiner_node) workflow.add_node(trig_combiner_node) pp_outs.extend(trig_combiner_outs) # Initialise trig_cluster class trig_cluster_outs = FileList([]) trig_cluster_jobs = trig_cluster_class(cp, "trig_cluster", ifo=ifos, out_dir=output_dir, tags=tags) # Set up injfinder jobs if cp.has_section("workflow-injections"): injfinder_nodes = [] injcombiner_parent_nodes = [] injfinder_exe = os.path.basename(cp.get("executables", "injfinder")) injfinder_class = select_generic_executable(workflow, "injfinder") injfinder_jobs = injfinder_class(cp, "injfinder", ifo=ifos, out_dir=output_dir, tags=tags) injcombiner_exe = os.path.basename(cp.get("executables", "injcombiner")) injcombiner_class = select_generic_executable(workflow, "injcombiner") injcombiner_jobs = injcombiner_class(cp, "injcombiner", ifo=ifos, out_dir=output_dir, tags=tags) injfinder_outs = FileList([]) for inj_tag in inj_tags: triggers = FileList([file for file in inj_trig_files \ if inj_tag in file.tag_str]) injections = FileList([file for file in inj_files \ if inj_tag in file.tag_str]) trig_cache = [file for file in inj_trig_caches \ if inj_tag in file.tag_str][0] inj_cache = [file for file in inj_caches \ if inj_tag in file.tag_str][0] 
injfinder_node, curr_outs = injfinder_jobs.create_node(\ triggers, injections, segment_dir, tags=[inj_tag]) injfinder_nodes.append(injfinder_node) pp_nodes.append(injfinder_node) workflow.add_node(injfinder_node) injfinder_outs.extend(curr_outs) if "DETECTION" not in curr_outs[0].tag_str: injcombiner_parent_nodes.append(injfinder_node) pp_outs.extend(injfinder_outs) # Make injfinder output cache fm_cache = File(ifos, "foundmissed", full_segment, extension="lcf", directory=output_dir) fm_cache.PFN(fm_cache.cache_entry.path, site="local") injfinder_outs.convert_to_lal_cache().tofile(\ open(fm_cache.storage_path, "w")) pp_outs.extend(FileList([fm_cache])) # Set up injcombiner jobs injcombiner_outs = FileList([file for file in injfinder_outs \ if "DETECTION" in file.tag_str]) injcombiner_tags = [inj_tag for inj_tag in inj_tags \ if "DETECTION" not in inj_tag] injcombiner_out_tags = [injcombiner_outs[0].tag_str.rsplit('_', 1)[0]] injcombiner_nodes = [] for injcombiner_tag in injcombiner_tags: max_inc = cp.get_opt_tags("injections", "max-inc", [injcombiner_tag]) inj_str = injcombiner_tag[:4] inputs = FileList([file for file in injfinder_outs \ if injcombiner_tag in file.tagged_description]) # if any(tag in file.tagged_description \ # for tag in injcombiner_tags)]) injcombiner_node, curr_outs = injcombiner_jobs.create_node(\ fm_cache, inputs, inj_str, max_inc, workflow.analysis_time) injcombiner_nodes.append(injcombiner_node) injcombiner_out_tags.append("%s_FILTERED_%s" % (inj_str, max_inc)) injcombiner_outs.extend(curr_outs) pp_outs.extend(curr_outs) pp_nodes.append(injcombiner_node) workflow.add_node(injcombiner_node) for parent_node in injcombiner_parent_nodes: dep = dax.Dependency(parent=parent_node._dax_node, child=injcombiner_node._dax_node) workflow._adag.addDependency(dep) # Initialise injection_efficiency class inj_efficiency_jobs = efficiency_class(cp, "inj_efficiency", ifo=ifos, out_dir=output_dir, tags=tags) # Initialise sbv_plotter class sbv_plotter_outs = FileList([]) sbv_plotter_jobs = sbv_plotter_class(cp, "sbv_plotter", ifo=ifos, out_dir=output_dir, tags=tags) # Initialise efficiency class efficiency_outs = FileList([]) efficiency_jobs = efficiency_class(cp, "efficiency", ifo=ifos, out_dir=output_dir, tags=tags) # Initialise html_summary class html_summary_jobs = html_summary_class(cp, "html_summary", ifo=ifos, out_dir=output_dir, tags=tags) # Add trig_cluster jobs and their corresponding plotting jobs for out_tag in trig_combiner_out_tags: unclust_file = [file for file in trig_combiner_outs \ if out_tag in file.tag_str][0] trig_cluster_node, curr_outs = trig_cluster_jobs.create_node(\ unclust_file) trig_cluster_outs.extend(curr_outs) clust_file = curr_outs[0] if out_tag != "ONSOURCE": # Add memory requirememnt for jobs with potentially large files trig_cluster_node.set_memory(1300) pp_nodes.append(trig_cluster_node) workflow.add_node(trig_cluster_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=trig_cluster_node._dax_node) workflow._adag.addDependency(dep) # Add sbv_plotter job sbv_out_tags = [out_tag, "_clustered"] sbv_plotter_node = sbv_plotter_jobs.create_node(clust_file, segment_dir, tags=sbv_out_tags) pp_nodes.append(sbv_plotter_node) workflow.add_node(sbv_plotter_node) dep = dax.Dependency(parent=trig_cluster_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) if out_tag == "OFFSOURCE": offsource_clustered = clust_file off_node = sbv_plotter_node found_inj_files = FileList([file for file in injcombiner_outs \ if "FOUND" 
in file.tag_str]) for curr_injs in found_inj_files: curr_tags = [tag for tag in injcombiner_out_tags \ if tag in curr_injs.name] curr_tags.append("_clustered") sbv_plotter_node = sbv_plotter_jobs.create_node(clust_file, segment_dir, inj_file=curr_injs, tags=curr_tags) pp_nodes.append(sbv_plotter_node) workflow.add_node(sbv_plotter_node) dep = dax.Dependency(parent=trig_cluster_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) for parent_node in injcombiner_nodes: dep = dax.Dependency(parent=parent_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) # Also add sbv_plotter job for unclustered triggers sbv_plotter_node = sbv_plotter_jobs.create_node(unclust_file, segment_dir, tags=[out_tag, "_unclustered"]) sbv_plotter_node.set_memory(1300) pp_nodes.append(sbv_plotter_node) workflow.add_node(sbv_plotter_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) else: pp_nodes.append(trig_cluster_node) workflow.add_node(trig_cluster_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=trig_cluster_node._dax_node) workflow._adag.addDependency(dep) # Add efficiency job for on/off efficiency_node = efficiency_jobs.create_node(clust_file, offsource_clustered, segment_dir, tags=[out_tag]) pp_nodes.append(efficiency_node) workflow.add_node(efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=efficiency_node._dax_node) workflow._adag.addDependency(dep) if cp.has_section("workflow-injections"): for tag in injcombiner_out_tags: found_file = [file for file in injcombiner_outs \ if tag + "_FOUND" in file.tag_str][0] missed_file = [file for file in injcombiner_outs \ if tag + "_MISSED" in file.tag_str][0] inj_efficiency_node = inj_efficiency_jobs.create_node(\ clust_file, offsource_clustered, segment_dir, found_file, missed_file, tags=[out_tag, tag]) pp_nodes.append(inj_efficiency_node) workflow.add_node(inj_efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injcombiner_node in injcombiner_nodes: dep = dax.Dependency(parent=injcombiner_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injfinder_node in injfinder_nodes: dep = dax.Dependency(parent=injfinder_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) # Add further trig_cluster jobs for trials trial = 1 while trial <= num_trials: trial_tag = "OFFTRIAL_%d" % trial unclust_file = [file for file in trig_combiner_outs \ if trial_tag in file.tag_str][0] trig_cluster_node, clust_outs = trig_cluster_jobs.create_node(\ unclust_file) clust_file = clust_outs[0] trig_cluster_outs.extend(clust_outs) pp_nodes.append(trig_cluster_node) workflow.add_node(trig_cluster_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=trig_cluster_node._dax_node) workflow._adag.addDependency(dep) # Add efficiency job efficiency_node = efficiency_jobs.create_node(clust_file, offsource_clustered, segment_dir, tags=[trial_tag]) pp_nodes.append(efficiency_node) workflow.add_node(efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=efficiency_node._dax_node) workflow._adag.addDependency(dep) dep = dax.Dependency(parent=trig_cluster_node._dax_node, child=efficiency_node._dax_node) workflow._adag.addDependency(dep) # Adding inj_efficiency job if cp.has_section("workflow-injections"): for tag in injcombiner_out_tags: 
found_file = [file for file in injcombiner_outs \ if tag + "_FOUND" in file.tag_str][0] missed_file = [file for file in injcombiner_outs \ if tag + "_MISSED" in file.tag_str][0] inj_efficiency_node = inj_efficiency_jobs.create_node(\ clust_file, offsource_clustered, segment_dir, found_file, missed_file, tags=[trial_tag, tag]) pp_nodes.append(inj_efficiency_node) workflow.add_node(inj_efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injcombiner_node in injcombiner_nodes: dep = dax.Dependency(parent=injcombiner_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injfinder_node in injfinder_nodes: dep = dax.Dependency(parent=injfinder_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) trial += 1 # Initialise html_summary class and set up job #FIXME: We may want this job to run even if some jobs fail html_summary_jobs = html_summary_class(cp, "html_summary", ifo=ifos, out_dir=output_dir, tags=tags) if cp.has_section("workflow-injections"): tuning_tags = [inj_tag for inj_tag in injcombiner_out_tags \ if "DETECTION" in inj_tag] exclusion_tags = [inj_tag for inj_tag in injcombiner_out_tags \ if "DETECTION" not in inj_tag] html_summary_node = html_summary_jobs.create_node(c_file=config_file, tuning_tags=tuning_tags, exclusion_tags=exclusion_tags, html_dir=html_dir) else: html_summary_node = html_summary_jobs.create_node(c_file=config_file, html_dir=html_dir) workflow.add_node(html_summary_node) for pp_node in pp_nodes: dep = dax.Dependency(parent=pp_node._dax_node, child=html_summary_node._dax_node) workflow._adag.addDependency(dep) # Make the open box shell script open_box_cmd = ' '.join(html_summary_node.get_command_line()) open_box_cmd += "--open-box" open_box_path = "%s/open_the_box.sh" % output_dir f = open(open_box_path, "w") f.write("#!/bin/sh\n%s" % open_box_cmd) f.close() os.chmod(open_box_path, 0500) pp_outs.extend(trig_cluster_outs) return pp_outs
def setup_timeslides_workflow(workflow, output_dir=None, tags=[],
                              timeSlideSectionName='ligolw_tisi'):
    '''
    Setup generation of time_slide input files in the workflow.
    Currently used only with ligolw_tisi to generate files containing the
    list of slides to be performed in each time slide job.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
    output_dir : path
        The directory in which output files will be stored.
    tags : list of strings (optional, default = [])
        A list of the tagging strings that will be used for all jobs created
        by this call to the workflow. This will be used in output names.
    timeSlideSectionName : string (optional, default='ligolw_tisi')
        The string that corresponds to the option describing the exe location
        in the [executables] section of the .ini file and that corresponds to
        the section (and sub-sections) giving the options that will be given
        to the code at run time.

    Returns
    -------
    timeSlideOuts : pycbc.workflow.core.FileList
        The list of time slide files created by this call.
    '''
    logging.info("Entering time slides setup module.")
    make_analysis_dir(output_dir)
    # Get ifo list and full analysis segment for output file naming
    ifoList = workflow.ifos
    ifo_string = workflow.ifo_string
    fullSegment = workflow.analysis_time

    # Identify which time-slides to do by presence of sub-sections in the
    # configuration file
    all_sec = workflow.cp.sections()
    timeSlideSections = [sec for sec in all_sec if sec.startswith('tisi-')]
    timeSlideTags = [(sec.split('-')[-1]).upper()
                     for sec in timeSlideSections]

    timeSlideOuts = FileList([])

    # FIXME: Add ability to specify different exes

    # Make the timeSlideFiles
    for timeSlideTag in timeSlideTags:
        currTags = tags + [timeSlideTag]

        timeSlideMethod = workflow.cp.get_opt_tags("workflow-timeslides",
                                                   "timeslides-method",
                                                   currTags)

        if timeSlideMethod in ["IN_WORKFLOW", "AT_RUNTIME"]:
            timeSlideExeTag = workflow.cp.get_opt_tags("workflow-timeslides",
                                                       "timeslides-exe",
                                                       currTags)
            timeSlideExe = select_generic_executable(workflow,
                                                     timeSlideExeTag)
            timeSlideJob = timeSlideExe(workflow.cp, timeSlideExeTag,
                                        ifos=ifo_string, tags=currTags,
                                        out_dir=output_dir)
            timeSlideNode = timeSlideJob.create_node(fullSegment)
            if timeSlideMethod == "AT_RUNTIME":
                workflow.execute_node(timeSlideNode)
            else:
                workflow.add_node(timeSlideNode)
            tisiOutFile = timeSlideNode.output_files[0]
        elif timeSlideMethod == "PREGENERATED":
            timeSlideFilePath = workflow.cp.get_opt_tags(
                "workflow-timeslides", "timeslides-pregenerated-file",
                currTags)
            file_url = urlparse.urljoin(
                'file:', urllib.pathname2url(timeSlideFilePath))
            tisiOutFile = File(ifo_string, 'PREGEN_TIMESLIDES', fullSegment,
                               file_url, tags=currTags)

        timeSlideOuts.append(tisiOutFile)

    return timeSlideOuts
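
# Illustrative sketch (assumed option values, not part of the original
# module): the configuration that setup_timeslides_workflow reads. Each
# [tisi-XYZ] sub-section produces one time-slide file tagged XYZ, and the
# [workflow-timeslides] options select how that file is made.
#
#     [workflow-timeslides]
#     timeslides-method = IN_WORKFLOW
#     timeslides-exe = tisi
#
#     [tisi-zerolag]
#     ; options handed to the time-slide executable at run time
#
# With timeslides-method = PREGENERATED, timeslides-pregenerated-file must
# instead point at an existing time-slide XML file, which is wrapped in a
# PREGEN_TIMESLIDES File object.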
def setup_postproc_coh_PTF_workflow(workflow, trig_files, trig_cache, inj_trig_files, inj_files, inj_trig_caches, inj_caches, config_file, output_dir, html_dir, segment_dir, ifos, inj_tags=[], tags=[]): """ This module sets up the post-processing stage in the workflow, using a coh_PTF style set up. This consists of running trig_combiner to find coherent triggers, and injfinder to look for injections. It then runs a horizon_dist job, trig_cluster to cluster triggers, and injcombiner to calculate injection statistics. Finally, efficiency and sbv_plotter jobs calculate efficiency and signal based veto statistics and make plots. workflow : pycbc.workflow.core.Workflow The Workflow instance that the jobs will be added to. trig_files : pycbc.workflow.core.FileList A FileList containing the combined databases. Returns -------- """ cp = workflow.cp full_segment = trig_files[0].segment trig_name = cp.get("workflow", "trigger-name") grb_string = "GRB" + trig_name num_trials = int(cp.get("trig_combiner", "num-trials")) pp_outs = FileList([]) pp_nodes = [] # Set up needed exe classes trig_combiner_exe = os.path.basename(cp.get("executables", "trig_combiner")) trig_combiner_class = select_generic_executable(workflow, "trig_combiner") trig_cluster_exe = os.path.basename(cp.get("executables", "trig_cluster")) trig_cluster_class = select_generic_executable(workflow, "trig_cluster") sbv_plotter_exe = os.path.basename(cp.get("executables", "sbv_plotter")) sbv_plotter_class = select_generic_executable(workflow, "sbv_plotter") efficiency_exe = os.path.basename(cp.get("executables", "efficiency")) efficiency_class = select_generic_executable(workflow, "efficiency") """ horizon_dist_exe = os.path.basename(cp.get("executables", "horizon_dist")) horizon_dist_class = select_generic_executable(workflow, "horizon_dist") """ html_summary_exe = os.path.basename(cp.get("executables", "html_summary")) html_summary_class = select_generic_executable(workflow, "html_summary") # Set up trig_combiner job trig_combiner_out_tags = ["OFFSOURCE", "ONSOURCE", "ALL_TIMES"] trig_combiner_jobs = trig_combiner_class(cp, "trig_combiner", ifo=ifos, out_dir=output_dir, tags=tags) trig_combiner_node, trig_combiner_outs = trig_combiner_jobs.create_node(\ trig_files, segment_dir, out_tags=trig_combiner_out_tags, tags=tags) pp_nodes.append(trig_combiner_node) workflow.add_node(trig_combiner_node) pp_outs.extend(trig_combiner_outs) # Initialise trig_cluster class trig_cluster_outs = FileList([]) trig_cluster_jobs = trig_cluster_class(cp, "trig_cluster", ifo=ifos, out_dir=output_dir, tags=tags) # Set up injfinder jobs if cp.has_section("workflow-injections"): injfinder_nodes = [] injcombiner_parent_nodes = [] inj_sbv_plotter_parent_nodes = [] injfinder_exe = os.path.basename(cp.get("executables", "injfinder")) injfinder_class = select_generic_executable(workflow, "injfinder") injfinder_jobs = injfinder_class(cp, "injfinder", ifo=ifos, out_dir=output_dir, tags=tags) injcombiner_exe = os.path.basename(cp.get("executables", "injcombiner")) injcombiner_class = select_generic_executable(workflow, "injcombiner") injcombiner_jobs = injcombiner_class(cp, "injcombiner", ifo=ifos, out_dir=output_dir, tags=tags) injfinder_outs = FileList([]) for inj_tag in inj_tags: triggers = FileList([file for file in inj_trig_files \ if inj_tag in file.tag_str]) injections = FileList([file for file in inj_files \ if inj_tag in file.tag_str]) trig_cache = [file for file in inj_trig_caches \ if inj_tag in file.tag_str][0] inj_cache = [file for file in inj_caches \ if 
inj_tag in file.tag_str][0] injfinder_node, curr_outs = injfinder_jobs.create_node(\ triggers, injections, segment_dir, tags=[inj_tag]) injfinder_nodes.append(injfinder_node) pp_nodes.append(injfinder_node) workflow.add_node(injfinder_node) injfinder_outs.extend(curr_outs) if "DETECTION" not in curr_outs[0].tagged_description: injcombiner_parent_nodes.append(injfinder_node) else: inj_sbv_plotter_parent_nodes.append(injfinder_node) pp_outs.extend(injfinder_outs) # Make injfinder output cache fm_cache = File(ifos, "foundmissed", full_segment, extension="lcf", directory=output_dir) fm_cache.PFN(fm_cache.cache_entry.path, site="local") injfinder_outs.convert_to_lal_cache().tofile(\ open(fm_cache.storage_path, "w")) pp_outs.extend(FileList([fm_cache])) # Set up injcombiner jobs injcombiner_outs = FileList([file for file in injfinder_outs \ if "DETECTION" in file.tag_str]) injcombiner_tags = [inj_tag for inj_tag in inj_tags \ if "DETECTION" not in inj_tag] injcombiner_out_tags = [injcombiner_outs[0].tag_str.rsplit('_', 1)[0]] injcombiner_nodes = [] for injcombiner_tag in injcombiner_tags: max_inc = cp.get_opt_tags("injections", "max-inc", [injcombiner_tag]) inj_str = injcombiner_tag[:4] inputs = FileList([file for file in injfinder_outs \ if injcombiner_tag in file.tagged_description]) # if any(tag in file.tagged_description \ # for tag in injcombiner_tags)]) injcombiner_node, curr_outs = injcombiner_jobs.create_node(\ fm_cache, inputs, inj_str, max_inc, workflow.analysis_time) injcombiner_nodes.append(injcombiner_node) injcombiner_out_tags.append("%s_FILTERED_%s" % (inj_str, max_inc)) injcombiner_outs.extend(curr_outs) pp_outs.extend(curr_outs) pp_nodes.append(injcombiner_node) workflow.add_node(injcombiner_node) for parent_node in injcombiner_parent_nodes: dep = dax.Dependency(parent=parent_node._dax_node, child=injcombiner_node._dax_node) workflow._adag.addDependency(dep) # Initialise injection_efficiency class inj_efficiency_jobs = efficiency_class(cp, "inj_efficiency", ifo=ifos, out_dir=output_dir, tags=tags) # Initialise sbv_plotter class sbv_plotter_outs = FileList([]) sbv_plotter_jobs = sbv_plotter_class(cp, "sbv_plotter", ifo=ifos, out_dir=output_dir, tags=tags) # Initialise efficiency class efficiency_outs = FileList([]) efficiency_jobs = efficiency_class(cp, "efficiency", ifo=ifos, out_dir=output_dir, tags=tags) # Add trig_cluster jobs and their corresponding plotting jobs for out_tag in trig_combiner_out_tags: unclust_file = [file for file in trig_combiner_outs \ if out_tag in file.tag_str][0] trig_cluster_node, curr_outs = trig_cluster_jobs.create_node(\ unclust_file) trig_cluster_outs.extend(curr_outs) clust_file = curr_outs[0] if out_tag != "ONSOURCE": # Add memory requirememnt for jobs with potentially large files trig_cluster_node.set_memory(1300) pp_nodes.append(trig_cluster_node) workflow.add_node(trig_cluster_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=trig_cluster_node._dax_node) workflow._adag.addDependency(dep) # Add sbv_plotter job sbv_out_tags = [out_tag, "_clustered"] sbv_plotter_node = sbv_plotter_jobs.create_node(clust_file, segment_dir, tags=sbv_out_tags) pp_nodes.append(sbv_plotter_node) workflow.add_node(sbv_plotter_node) dep = dax.Dependency(parent=trig_cluster_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) # Add injection sbv_plotter nodes if appropriate if out_tag == "OFFSOURCE" and \ cp.has_section("workflow-injections"): offsource_clustered = clust_file off_node = sbv_plotter_node found_inj_files = 
FileList([file for file in injcombiner_outs \ if "FOUND" in file.tag_str]) for curr_injs in found_inj_files: curr_tags = [tag for tag in injcombiner_out_tags \ if tag in curr_injs.name] curr_tags.append("_clustered") sbv_plotter_node = sbv_plotter_jobs.create_node( clust_file, segment_dir, inj_file=curr_injs, tags=curr_tags) pp_nodes.append(sbv_plotter_node) workflow.add_node(sbv_plotter_node) dep = dax.Dependency(parent=trig_cluster_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) if "DETECTION" in curr_injs.tagged_description: for parent_node in inj_sbv_plotter_parent_nodes: dep = dax.Dependency( parent=parent_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) else: for parent_node in injcombiner_nodes: dep = dax.Dependency( parent=parent_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) # Also add sbv_plotter job for unclustered triggers sbv_plotter_node = sbv_plotter_jobs.create_node( unclust_file, segment_dir, tags=[out_tag, "_unclustered"]) sbv_plotter_node.set_memory(1300) pp_nodes.append(sbv_plotter_node) workflow.add_node(sbv_plotter_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=sbv_plotter_node._dax_node) workflow._adag.addDependency(dep) else: pp_nodes.append(trig_cluster_node) workflow.add_node(trig_cluster_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=trig_cluster_node._dax_node) workflow._adag.addDependency(dep) # Add efficiency job for on/off efficiency_node = efficiency_jobs.create_node(clust_file, offsource_clustered, segment_dir, tags=[out_tag]) pp_nodes.append(efficiency_node) workflow.add_node(efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=efficiency_node._dax_node) workflow._adag.addDependency(dep) if cp.has_section("workflow-injections"): for tag in injcombiner_out_tags: if "_FILTERED_" in tag: inj_set_tag = [t for t in inj_tags if \ str(tag).replace("_FILTERED_", "") \ in t][0] else: inj_set_tag = str(tag) found_file = [file for file in injcombiner_outs \ if tag + "_FOUND" in file.tag_str][0] missed_file = [file for file in injcombiner_outs \ if tag + "_MISSED" in file.tag_str][0] inj_efficiency_node = inj_efficiency_jobs.create_node(\ clust_file, offsource_clustered, segment_dir, found_file, missed_file, tags=[out_tag, tag, inj_set_tag]) pp_nodes.append(inj_efficiency_node) workflow.add_node(inj_efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injcombiner_node in injcombiner_nodes: dep = dax.Dependency( parent=injcombiner_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injfinder_node in injfinder_nodes: dep = dax.Dependency( parent=injfinder_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) # Add further trig_cluster jobs for trials trial = 1 while trial <= num_trials: trial_tag = "OFFTRIAL_%d" % trial unclust_file = [file for file in trig_combiner_outs \ if trial_tag in file.tag_str][0] trig_cluster_node, clust_outs = trig_cluster_jobs.create_node(\ unclust_file) clust_file = clust_outs[0] trig_cluster_outs.extend(clust_outs) pp_nodes.append(trig_cluster_node) workflow.add_node(trig_cluster_node) dep = dax.Dependency(parent=trig_combiner_node._dax_node, child=trig_cluster_node._dax_node) workflow._adag.addDependency(dep) # Add efficiency job efficiency_node = efficiency_jobs.create_node(clust_file, offsource_clustered, 
segment_dir, tags=[trial_tag]) pp_nodes.append(efficiency_node) workflow.add_node(efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=efficiency_node._dax_node) workflow._adag.addDependency(dep) dep = dax.Dependency(parent=trig_cluster_node._dax_node, child=efficiency_node._dax_node) workflow._adag.addDependency(dep) # Adding inj_efficiency job if cp.has_section("workflow-injections"): for tag in injcombiner_out_tags: if "_FILTERED_" in tag: inj_set_tag = [t for t in inj_tags if \ str(tag).replace("_FILTERED_", "") in t][0] else: inj_set_tag = str(tag) found_file = [file for file in injcombiner_outs \ if tag + "_FOUND" in file.tag_str][0] missed_file = [file for file in injcombiner_outs \ if tag + "_MISSED" in file.tag_str][0] inj_efficiency_node = inj_efficiency_jobs.create_node(\ clust_file, offsource_clustered, segment_dir, found_file, missed_file, tags=[trial_tag, tag, inj_set_tag]) pp_nodes.append(inj_efficiency_node) workflow.add_node(inj_efficiency_node) dep = dax.Dependency(parent=off_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injcombiner_node in injcombiner_nodes: dep = dax.Dependency(parent=injcombiner_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) for injfinder_node in injfinder_nodes: dep = dax.Dependency(parent=injfinder_node._dax_node, child=inj_efficiency_node._dax_node) workflow._adag.addDependency(dep) trial += 1 # Initialise html_summary class and set up job #FIXME: We may want this job to run even if some jobs fail html_summary_jobs = html_summary_class(cp, "html_summary", ifo=ifos, out_dir=output_dir, tags=tags) if cp.has_section("workflow-injections"): tuning_tags = [inj_tag for inj_tag in injcombiner_out_tags \ if "DETECTION" in inj_tag] exclusion_tags = [inj_tag for inj_tag in injcombiner_out_tags \ if "DETECTION" not in inj_tag] html_summary_node = html_summary_jobs.create_node( c_file=config_file, tuning_tags=tuning_tags, exclusion_tags=exclusion_tags, html_dir=html_dir) else: html_summary_node = html_summary_jobs.create_node(c_file=config_file, html_dir=html_dir) workflow.add_node(html_summary_node) for pp_node in pp_nodes: dep = dax.Dependency(parent=pp_node._dax_node, child=html_summary_node._dax_node) workflow._adag.addDependency(dep) # Make the open box shell script open_box_cmd = html_summary_node.executable.get_pfn() + " " open_box_cmd += ' '.join(html_summary_node._args + \ html_summary_node._options) open_box_cmd += " --open-box" open_box_path = "%s/open_the_box.sh" % output_dir f = open(open_box_path, "w") f.write("#!/bin/sh\n%s" % open_box_cmd) f.close() os.chmod(open_box_path, 0500) pp_outs.extend(trig_cluster_outs) return pp_outs
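
# Illustrative sketch (assumed names, not part of the original module):
# wiring the coh_PTF post-processing into a PyGRB workflow. The trigger and
# injection FileLists and caches are assumed to come from the matched-filter
# and injection stages of the same workflow.
#
#     pp_files = setup_postproc_coh_PTF_workflow(
#         wflow, trig_files, trig_cache,
#         inj_trig_files, inj_files, inj_trig_caches, inj_caches,
#         config_file, "post_processing", "html", "segments",
#         wflow.ifo_string, inj_tags=inj_tags, tags=[])
#
# Besides the returned FileList, the function writes an open_the_box.sh
# script into the output directory that re-runs the html_summary command
# with --open-box appended.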
def setup_injection_workflow(workflow, output_dir=None, inj_section_name='injections', exttrig_file=None, tags=None): """ This function is the gateway for setting up injection-generation jobs in a workflow. It should be possible for this function to support a number of different codes that could be used for doing this; however, as this will presumably remain a single call to a single code (which need not be inspinj), there are currently no subfunctions in this module. Parameters ----------- workflow : pycbc.workflow.core.Workflow The Workflow instance that the coincidence jobs will be added to. output_dir : path The directory in which injection files will be stored. inj_section_name : string (optional, default='injections') The string that corresponds to the option describing the exe location in the [executables] section of the .ini file and that corresponds to the section (and sub-sections) giving the options that will be given to the code at run time. exttrig_file : pycbc.workflow.core.File (optional, default=None) If given, this file is passed to the injection job when one of the COH_PTF injection methods is used. tags : list of strings (optional, default = None) A list of the tagging strings that will be used for all jobs created by this call to the workflow. This will be used in output names. Returns -------- inj_files : pycbc.workflow.core.FileList The list of injection files created by this call. inj_tags : list of strings The tag corresponding to each injection file and used to uniquely identify them. The FileList class contains functions to search based on tags. """ if tags is None: tags = [] logging.info("Entering injection module.") make_analysis_dir(output_dir) # Get full analysis segment for output file naming full_segment = workflow.analysis_time ifos = workflow.ifos # Identify which injections to do by presence of sub-sections in # the configuration file inj_tags = [] inj_files = FileList([]) for section in workflow.cp.get_subsections(inj_section_name): inj_tag = section.upper() curr_tags = tags + [inj_tag] # Parse for options in ini file injection_method = workflow.cp.get_opt_tags("workflow-injections", "injections-method", curr_tags) if injection_method in ["IN_WORKFLOW", "AT_RUNTIME"]: exe = select_generic_executable(workflow, 'injections') inj_job = exe(workflow.cp, inj_section_name, out_dir=output_dir, ifos='HL', tags=curr_tags) node = inj_job.create_node(full_segment) if injection_method == "AT_RUNTIME": workflow.execute_node(node) else: workflow.add_node(node) inj_file = node.output_files[0] inj_files.append(inj_file) elif injection_method == "PREGENERATED": file_attrs = { 'ifos': ['HL'], 'segs': full_segment, 'tags': curr_tags } injection_path = workflow.cp.get_opt_tags( "workflow-injections", "injections-pregenerated-file", curr_tags) curr_file = resolve_url_to_file(injection_path, attrs=file_attrs) inj_files.append(curr_file) elif injection_method in ["IN_COH_PTF_WORKFLOW", "AT_COH_PTF_RUNTIME"]: inj_job = LalappsInspinjExecutable(workflow.cp, inj_section_name, out_dir=output_dir, ifos=ifos, tags=curr_tags) node = inj_job.create_node(full_segment, exttrig_file) if injection_method == "AT_COH_PTF_RUNTIME": workflow.execute_node(node) else: workflow.add_node(node) inj_file = node.output_files[0] if workflow.cp.has_option("workflow-injections", "em-bright-only"): em_filter_job = PycbcDarkVsBrightInjectionsExecutable( workflow.cp, 'em_bright_filter', tags=curr_tags, out_dir=output_dir, ifos=ifos) node = em_filter_job.create_node(inj_file, full_segment, curr_tags) if injection_method == "AT_COH_PTF_RUNTIME": workflow.execute_node(node) else: workflow.add_node(node) inj_file = node.output_files[0] if 
workflow.cp.has_option("workflow-injections", "do-jitter-skyloc"): jitter_job = LigolwCBCJitterSkylocExecutable( workflow.cp, 'jitter_skyloc', tags=curr_tags, out_dir=output_dir, ifos=ifos) node = jitter_job.create_node(inj_file, full_segment, curr_tags) if injection_method == "AT_COH_PTF_RUNTIME": workflow.execute_node(node) else: workflow.add_node(node) inj_file = node.output_files[0] if workflow.cp.has_option("workflow-injections", "do-align-total-spin"): align_job = LigolwCBCAlignTotalSpinExecutable( workflow.cp, 'align_total_spin', tags=curr_tags, out_dir=output_dir, ifos=ifos) node = align_job.create_node(inj_file, full_segment, curr_tags) if injection_method == "AT_COH_PTF_RUNTIME": workflow.execute_node(node) else: workflow.add_node(node) inj_file = node.output_files[0] inj_files.append(inj_file) else: err = "Injection method must be one of IN_WORKFLOW, " err += "AT_RUNTIME or PREGENERATED. Got %s." % (injection_method) raise ValueError(err) inj_tags.append(inj_tag) logging.info("Leaving injection module.") return inj_files, inj_tags
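# Hypothetical usage sketch for setup_injection_workflow (illustrative only;
# `wflow` and the section names below are assumptions, not defined in this
# module). Each [injections-XYZ] sub-section of the configuration file yields
# one injection set, with its method read from the tagged workflow-injections
# options, e.g.
#
#     [injections-bnslininj]
#     ; options passed to the injection executable
#
#     [workflow-injections]
#     injections-method = IN_WORKFLOW
#
# after which the module would typically be driven as:
#
#     inj_files, inj_tags = setup_injection_workflow(
#         wflow, output_dir='inj_files', tags=['PYGRB'])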
def setup_postproc_pipedown_workflow(workflow, trigger_files, summary_xml_files, output_dir, tags=[], veto_cats=[]): """ This function sets up the post-processing stage in the workflow, using a pipedown style set up. This consists of running compute_durations to determine and store the analysis time (foreground and background). It then runs cfar jobs to determine the false alarm rate for all triggers (simulations or otherwise) in the input database. Pipedown expects to take as input (at this stage) a single database containing all triggers. This function follows that same idea, so len(trigger_files) must equal 1 (for every DQ category that we will run). Parameters ---------- workflow : pycbc.workflow.core.Workflow The Workflow instance that the coincidence jobs will be added to. trigger_files : pycbc.workflow.core.FileList A FileList containing the combined databases at CAT_1,2,3... that will be used to calculate FARs summary_xml_files : pycbc.workflow.core.FileList (required) A FileList of the output of the analysislogging_utils module. For pipedown-style post-processing this should be one file containing a segment table holding the single detector analysed times. output_dir : path The directory in which output files will be stored. tags : list of strings (optional, default = []) A list of the tagging strings that will be used for all jobs created by this call to the workflow. An example might be ['POSTPROC1'] or ['DENTYSNEWPOSTPROC']. This will be used in output names. veto_cats : list of integers (default = [], non-empty list required) Decide which veto category levels should be used in post-processing. For example tell the workflow to only generate results at cumulative categories 2, 3 and 4 by supplying [2,3,4] here. Returns -------- final_files : pycbc.workflow.core.FileList A list of the final SQL databases containing computed FARs. """ if not veto_cats: raise ValueError("A non-empty list of veto categories is required.") if not len(summary_xml_files) == 1: errMsg = "I need exactly one summaryXML file, got %d." \ %(len(summary_xml_files),) raise ValueError(errMsg) # Setup needed exe classes compute_durations_exe_tag = workflow.cp.get_opt_tags( "workflow-postproc", "postproc-computedurations-exe", tags) compute_durations_exe = select_generic_executable( workflow, compute_durations_exe_tag) cfar_exe_tag = workflow.cp.get_opt_tags("workflow-postproc", "postproc-cfar-exe", tags) cfar_exe = select_generic_executable(workflow, cfar_exe_tag) comp_durations_outs = FileList([]) cfar_outs = FileList([]) for cat in veto_cats: veto_tag = 'CUMULATIVE_CAT_%d' % (cat) trig_input_files = trigger_files.find_output_with_tag(veto_tag) if not len(trig_input_files) == 1: err_msg = "Did not find exactly 1 database input file." 
raise ValueError(err_msg) curr_tags = tags + [veto_tag] # Choose a label for clustering the jobs job_label = get_random_label() # Start with compute durations computeDurationsJob = compute_durations_exe(workflow.cp, compute_durations_exe_tag, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) compute_durations_node = computeDurationsJob.create_node( workflow.analysis_time, trig_input_files[0], summary_xml_files[0]) compute_durations_node.add_profile('pegasus', 'label', job_label) workflow.add_node(compute_durations_node) # Node has only one output file compute_durations_out = compute_durations_node.output_files[0] comp_durations_outs.append(compute_durations_out) # Add the calculate FAR (cfar) job cfar_job = cfar_exe(workflow.cp, cfar_exe_tag, ifo=workflow.ifo_string, out_dir=output_dir, tags=curr_tags) cfar_node = cfar_job.create_node(workflow.analysis_time, compute_durations_out) cfar_node.add_profile('pegasus', 'label', job_label) workflow.add_node(cfar_node) # Node has only one output file cfar_out = cfar_node.output_files[0] cfar_outs.append(cfar_out) return cfar_outs
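# Hypothetical usage sketch for setup_postproc_pipedown_workflow
# (illustrative only; `wflow`, `combined_dbs` and `summ_files` are
# assumptions, not defined in this module). Given combined trigger databases
# tagged CUMULATIVE_CAT_N, post-processing at categories 2-4 could be
# requested as:
#
#     cfar_dbs = setup_postproc_pipedown_workflow(
#         wflow, combined_dbs, summ_files, 'postprocessing',
#         tags=['POSTPROC1'], veto_cats=[2, 3, 4])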