def setup_psd_calculate(workflow, frame_files, ifo, segments,
                        segment_name, out_dir, tags=None):
    """Set up the PSD estimation jobs, optionally splitting the segments
    over a number of parallel jobs and merging the resulting PSD files.
    """
    make_analysis_dir(out_dir)
    tags = [] if not tags else tags
    if workflow.cp.has_option_tags('workflow-psd',
                                   'parallelization-factor', tags=tags):
        num_parts = int(workflow.cp.get_opt_tags('workflow-psd',
                                                 'parallelization-factor',
                                                 tags=tags))
    else:
        num_parts = 1

    # get rid of duplicate segments which happen when splitting the bank
    segments = segmentlist(frozenset(segments))

    segment_lists = list(chunks(segments, num_parts))

    psd_files = FileList([])
    for i, segs in enumerate(segment_lists):
        seg_file = SegFile.from_segment_list('%s_%s' % (segment_name, i),
                                             segmentlist(segs),
                                             segment_name, ifo,
                                             valid_segment=workflow.analysis_time,
                                             extension='xml',
                                             directory=out_dir)

        psd_files += [make_psd_file(workflow, frame_files, seg_file,
                                    segment_name, out_dir,
                                    tags=tags + ['PART%s' % i])]

    if num_parts > 1:
        return merge_psds(workflow, psd_files, ifo, out_dir, tags=tags)
    else:
        return psd_files[0]
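def _example_setup_psd_calculate(workflow, frame_files, science_segs):
    # Hedged usage sketch (illustrative, not part of the original module):
    # estimate a PSD for one ifo over its analysable segments. Assumes the
    # ini file has a [workflow-psd] section (and, optionally, a
    # parallelization-factor option); the 'psds' directory, segment name and
    # tag below are placeholders.
    psd_file = setup_psd_calculate(workflow, frame_files['H1'], 'H1',
                                   science_segs['H1'], 'ANALYSABLE',
                                   'psds', tags=['H1'])
    return psd_file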
def render_default(path, cp):
    """ This is the default function that will render a template to a string
    of HTML. The string will be for a drop-down tab that contains a link to
    the file. If the file extension requires information to be read, then it
    is passed to the content variable (eg. a segmentlistdict).
    """

    # define filename and slug from path
    filename = os.path.basename(path)
    slug = filename.replace('.', '_')

    # initializations
    content = None

    if path.endswith('.xml') or path.endswith('.xml.gz'):
        # a segment or veto file; read it into the content variable
        try:
            wf_file = SegFile.from_segment_xml(path)
            # FIXME: This is a dictionary, but the code wants a segmentlist
            #        for now I just coalesce.
            content = wf_file.return_union_seglist()
        except Exception as e:
            print('No segment table found in', path, ':', e)

    # render template
    template_dir = pycbc.results.__path__[0] + '/templates/files'
    env = Environment(loader=FileSystemLoader(template_dir))
    env.globals.update(abs=abs)
    env.globals.update(open=open)
    env.globals.update(path_exists=os.path.exists)
    template = env.get_template('file_default.html')
    context = {'path': path,
               'filename': filename,
               'slug': slug,
               'cp': cp,
               'content': content}
    output = template.render(context)

    return output
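def _example_render_default():
    # Hedged usage sketch (illustrative, not part of the original module):
    # render the drop-down HTML for a segment XML file. The file path is a
    # placeholder, and the empty ConfigParser stands in for the workflow's
    # configuration object.
    import configparser
    cp = configparser.ConfigParser()
    html = render_default('H1L1-SCIENCE_SEGMENTS.xml', cp)
    return html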
def setup_datafind_workflow(workflow, scienceSegs, outputDir, seg_file=None,
                            tags=None):
    """
    Setup datafind section of the workflow. This section is responsible for
    generating, or setting up the workflow to generate, a list of files that
    record the location of the frame files needed to perform the analysis.
    There could be multiple options here, the datafind jobs could be done at
    run time or could be put into a dag. The subsequent jobs will know
    what was done here from the OutFileList containing the datafind jobs
    (and the Dagman nodes if appropriate). For now the only implemented
    option is to generate the datafind files at runtime. This module can
    also check if the frameFiles actually exist, check whether the obtained
    segments line up with the original ones and update the science segments
    to reflect missing data files.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        The workflow class that stores the jobs that will be run.
    scienceSegs : Dictionary of ifo keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse.
    outputDir : path
        All output files written by datafind processes will be written to
        this directory.
    seg_file : SegFile, optional (default=None)
        The file returned by get_science_segments containing the science
        segments and the associated segment_summary. This will be used for
        the segment_summary test and is required if, and only if,
        performing that test.
    tags : list of string, optional (default=None)
        Use this to specify tags. This can be used if this module is being
        called more than once to give call specific configuration (by
        setting options in [workflow-datafind-${TAG}] rather than
        [workflow-datafind]). This is also used to tag the Files returned
        by the class to uniqueify the Files and uniqueify the actual
        filename.
        FIXME: Filenames may not be unique with current codes!

    Returns
    --------
    datafindOuts : OutGroupList
        List of all the datafind output files for use later in the pipeline.
    sci_avlble_file : SegFile
        SegFile containing the analysable time after checks in the datafind
        module are applied to the input segment list. For production runs
        this is expected to be equal to the input segment list.
    scienceSegs : Dictionary of ifo keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse. If
        the updateSegmentTimes kwarg is given this will be updated to
        reflect any instances of missing data.
    sci_avlble_name : string
        The name with which the analysable time is stored in the
        sci_avlble_file.
""" if tags is None: tags = [] logging.info("Entering datafind module") make_analysis_dir(outputDir) cp = workflow.cp # Parse for options in ini file datafindMethod = cp.get_opt_tags("workflow-datafind", "datafind-method", tags) if cp.has_option_tags("workflow-datafind", "datafind-check-segment-gaps", tags): checkSegmentGaps = cp.get_opt_tags("workflow-datafind", "datafind-check-segment-gaps", tags) else: checkSegmentGaps = "no_test" if cp.has_option_tags("workflow-datafind", "datafind-check-frames-exist", tags): checkFramesExist = cp.get_opt_tags("workflow-datafind", "datafind-check-frames-exist", tags) else: checkFramesExist = "no_test" if cp.has_option_tags("workflow-datafind", "datafind-check-segment-summary", tags): checkSegmentSummary = cp.get_opt_tags("workflow-datafind", "datafind-check-segment-summary", tags) else: checkSegmentSummary = "no_test" logging.info("Starting datafind with setup_datafind_runtime_generated") if datafindMethod == "AT_RUNTIME_MULTIPLE_CACHES": datafindcaches, datafindouts = \ setup_datafind_runtime_cache_multi_calls_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafindMethod == "AT_RUNTIME_SINGLE_CACHES": datafindcaches, datafindouts = \ setup_datafind_runtime_cache_single_call_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES": datafindcaches, datafindouts = \ setup_datafind_runtime_frames_multi_calls_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafindMethod == "AT_RUNTIME_SINGLE_FRAMES": datafindcaches, datafindouts = \ setup_datafind_runtime_frames_single_call_perifo(cp, scienceSegs, outputDir, tags=tags) elif datafindMethod == "FROM_PREGENERATED_LCF_FILES": ifos = scienceSegs.keys() datafindcaches, datafindouts = \ setup_datafind_from_pregenerated_lcf_files(cp, ifos, outputDir, tags=tags) else: msg = "Entry datafind-method in [workflow-datafind] does not have " msg += "expected value. Valid values are " msg += "AT_RUNTIME_MULTIPLE_FRAMES, AT_RUNTIME_SINGLE_FRAMES " msg += "AT_RUNTIME_MULTIPLE_CACHES or AT_RUNTIME_SINGLE_CACHES. " msg += "Consult the documentation for more info." raise ValueError(msg) using_backup_server = False if datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES" or \ datafindMethod == "AT_RUNTIME_SINGLE_FRAMES": if cp.has_option_tags("workflow-datafind", "datafind-backup-datafind-server", tags): using_backup_server = True backup_server = cp.get_opt_tags("workflow-datafind", "datafind-backup-datafind-server", tags) cp_new = copy.deepcopy(cp) cp_new.set("workflow-datafind", "datafind-ligo-datafind-server", backup_server) cp_new.set('datafind', 'urltype', 'gsiftp') backup_datafindcaches, backup_datafindouts =\ setup_datafind_runtime_frames_single_call_perifo(cp_new, scienceSegs, outputDir, tags=tags) backup_datafindouts = datafind_keep_unique_backups(\ backup_datafindouts, datafindouts) datafindcaches.extend(backup_datafindcaches) datafindouts.extend(backup_datafindouts) logging.info("setup_datafind_runtime_generated completed") # If we don't have frame files covering all times we can update the science # segments. 
    if checkSegmentGaps in ['warn', 'update_times', 'raise_error']:
        logging.info("Checking science segments against datafind output....")
        newScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        logging.info("New segments calculated from data find output.....")
        missingData = False
        for ifo in scienceSegs.keys():
            # If no science segments in input then do nothing
            if not scienceSegs[ifo]:
                msg = "No science segments are present for ifo %s, " % (ifo)
                msg += "the segment metadata indicates there is no analyzable"
                msg += " strain data between the selected GPS start and end "
                msg += "times."
                logging.warning(msg)
                continue
            if ifo not in newScienceSegs:
                msg = "No data frames were found corresponding to the "
                msg += "science segments for ifo %s" % (ifo)
                logging.error(msg)
                missingData = True
                if checkSegmentGaps == 'update_times':
                    scienceSegs[ifo] = segments.segmentlist()
                continue
            missing = scienceSegs[ifo] - newScienceSegs[ifo]
            if abs(missing):
                msg = "From ifo %s we are missing frames covering:" % (ifo)
                msg += "\n%s" % "\n".join(map(str, missing))
                missingData = True
                logging.error(msg)
                if checkSegmentGaps == 'update_times':
                    # Remove missing time, so that we can carry on if desired
                    logging.info("Updating science segments for ifo %s."
                                 % (ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missing

        if checkSegmentGaps == 'raise_error' and missingData:
            raise ValueError("Workflow cannot find needed data, exiting.")
        logging.info("Done checking, any discrepancies are reported above.")
    elif checkSegmentGaps == 'no_test':
        pass
    else:
        errMsg = "checkSegmentGaps kwarg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Do all of the frame files that were returned actually exist?
    if checkFramesExist in ['warn', 'update_times', 'raise_error']:
        logging.info("Verifying that all frames exist on disk.")
        missingFrSegs, missingFrames = \
            get_missing_segs_from_frame_file_cache(datafindcaches)
        missingFlag = False
        for ifo in missingFrames.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                continue
            # If using a backup server, does the frame exist remotely?
            if using_backup_server:
                # WARNING: This will be slow, but hopefully it will not occur
                #          for too many frames. This could be optimized if
                #          it becomes necessary.
                new_list = []
                for frame in missingFrames[ifo]:
                    for dfout in datafindouts:
                        dfout_pfns = list(dfout.pfns)
                        dfout_urls = [a.url for a in dfout_pfns]
                        if frame.url in dfout_urls:
                            pfn = dfout_pfns[dfout_urls.index(frame.url)]
                            dfout.removePFN(pfn)
                            if len(dfout.pfns) == 0:
                                new_list.append(frame)
                            else:
                                msg = "Frame %s not found locally. "\
                                    % (frame.url,)
                                msg += "Replacing with remote url(s) %s." \
                                    % (str([a.url for a in dfout.pfns]),)
                                logging.info(msg)
                            break
                    else:
                        new_list.append(frame)
                missingFrames[ifo] = new_list

            if missingFrames[ifo]:
                msg = "From ifo %s we are missing the following frames:" \
                    % (ifo)
                msg += '\n' + '\n'.join([a.url for a in missingFrames[ifo]])
                missingFlag = True
                logging.error(msg)
            if checkFramesExist == 'update_times':
                # Remove missing times, so that we can carry on if desired
                logging.info("Updating science times for ifo %s." % (ifo))
                scienceSegs[ifo] = scienceSegs[ifo] - missingFrSegs[ifo]

        if checkFramesExist == 'raise_error' and missingFlag:
            raise ValueError("Workflow cannot find all frames, exiting.")
        logging.info("Finished checking frames.")
    elif checkFramesExist == 'no_test':
        pass
    else:
        errMsg = "checkFramesExist kwarg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Check if there are cases where frames exist, but no entry in the
    # segment summary table is present.
    if checkSegmentSummary in ['warn', 'raise_error']:
        logging.info("Checking the segment summary table against frames.")
        dfScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        missingFlag = False
        # NOTE: Should this be overrideable in the config file?
        sci_seg_name = "SCIENCE"
        if seg_file is None:
            err_msg = "You must provide the science segments SegFile object "
            err_msg += "if using the datafind-check-segment-summary option."
            raise ValueError(err_msg)
        if seg_file.seg_summ_dict is None:
            err_msg = "The provided science segments SegFile object must "
            err_msg += "contain a valid segment_summary table if using the "
            err_msg += "datafind-check-segment-summary option."
            raise ValueError(err_msg)
        seg_summary_times = seg_file.seg_summ_dict
        for ifo in dfScienceSegs.keys():
            curr_seg_summ_times = seg_summary_times[ifo + ":" + sci_seg_name]
            missing = (dfScienceSegs[ifo] & seg_file.valid_segments)
            missing.coalesce()
            missing = missing - curr_seg_summ_times
            missing.coalesce()
            scienceButNotFrame = scienceSegs[ifo] - dfScienceSegs[ifo]
            scienceButNotFrame.coalesce()
            missing2 = scienceSegs[ifo] - scienceButNotFrame
            missing2.coalesce()
            missing2 = missing2 - curr_seg_summ_times
            missing2.coalesce()
            if abs(missing):
                msg = "From ifo %s the following times have frames, " % (ifo)
                msg += "but are not covered in the segment summary table."
                msg += "\n%s" % "\n".join(map(str, missing))
                logging.error(msg)
                missingFlag = True
            if abs(missing2):
                msg = "From ifo %s the following times have frames, " % (ifo)
                msg += "are science, and are not covered in the segment "
                msg += "summary table."
                msg += "\n%s" % "\n".join(map(str, missing2))
                logging.error(msg)
                missingFlag = True
        if checkSegmentSummary == 'raise_error' and missingFlag:
            errMsg = "Segment_summary discrepancy detected, exiting."
            raise ValueError(errMsg)
    elif checkSegmentSummary == 'no_test':
        pass
    else:
        errMsg = "checkSegmentSummary kwarg must take a value from "
        errMsg += "'no_test', 'warn', or 'raise_error'."
        raise ValueError(errMsg)

    # Now need to create the file for SCIENCE_AVAILABLE
    sci_avlble_dict = segments.segmentlistdict()
    # NOTE: Should this be overrideable in the config file?
    sci_avlble_name = "SCIENCE_AVAILABLE"
    for ifo in scienceSegs.keys():
        sci_avlble_dict[ifo + ':' + sci_avlble_name] = scienceSegs[ifo]

    sci_avlble_file = SegFile.from_segment_list_dict('SCIENCE_AVAILABLE',
                                                     sci_avlble_dict,
                                                     ifo_list=scienceSegs.keys(),
                                                     valid_segment=workflow.analysis_time,
                                                     extension='.xml',
                                                     tags=tags,
                                                     directory=outputDir)

    logging.info("Leaving datafind module")

    return FileList(datafindouts), sci_avlble_file, scienceSegs, \
        sci_avlble_name
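def _example_setup_datafind(workflow, science_segs, science_seg_file):
    # Hedged usage sketch (illustrative, not part of the original module):
    # run the datafind step after the segment step of a workflow. The
    # argument names and the 'datafind' output directory are assumptions;
    # the ini file must set datafind-method in [workflow-datafind], and
    # seg_file is only required if datafind-check-segment-summary is used.
    datafind_files, sci_avlble_file, science_segs, sci_avlble_name = \
        setup_datafind_workflow(workflow, science_segs, 'datafind',
                                seg_file=science_seg_file)
    return datafind_files, science_segs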
def get_segments_file(workflow, name, option_name, out_dir):
    """Get cumulative segments from option name syntax for each ifo.

    Use syntax of configparser string to define the resulting segment_file,
    e.g. option_name = +up_flag1,+up_flag2,+up_flag3,-down_flag1,-down_flag2
    Each ifo may have a different string and is stored separately in the
    file. Flags which add time must precede flags which subtract time.

    Parameters
    ----------
    workflow: pycbc.workflow.Workflow
    name: string
        Name of the segment list being created
    option_name: str
        Name of option in the associated config parser to get the flag list
    out_dir: str
        Path of the directory to store the segment file in

    Returns
    -------
    seg_file: pycbc.workflow.SegFile
        SegFile instance that points to the segment xml file on disk.
    """
    from pycbc.dq import query_str
    make_analysis_dir(out_dir)
    cp = workflow.cp
    start = workflow.analysis_time[0]
    end = workflow.analysis_time[1]

    # Check for veto definer file
    veto_definer = None
    if cp.has_option("workflow-segments", "segments-veto-definer-url"):
        veto_definer = save_veto_definer(workflow.cp, out_dir, [])

    # Check for provided server
    server = "https://segments.ligo.org"
    if cp.has_option("workflow-segments", "segments-database-url"):
        server = cp.get("workflow-segments", "segments-database-url")

    source = "any"
    if cp.has_option("workflow-segments", "segments-source"):
        source = cp.get("workflow-segments", "segments-source")

    if source == "file":
        local_file_path = \
            resolve_url(cp.get("workflow-segments", option_name + "-file"))
        pfn = os.path.join(out_dir, os.path.basename(local_file_path))
        shutil.move(local_file_path, pfn)
        return SegFile.from_segment_xml(pfn)

    segs = {}
    for ifo in workflow.ifos:
        flag_str = cp.get_opt_tags("workflow-segments", option_name, [ifo])
        key = ifo + ':' + name
        segs[key] = query_str(ifo, flag_str, start, end,
                              source=source, server=server,
                              veto_definer=veto_definer)
        logging.info("%s: got %s flags", ifo, option_name)

    return SegFile.from_segment_list_dict(name, segs,
                                          extension='.xml',
                                          valid_segment=workflow.analysis_time,
                                          directory=out_dir)
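def _example_get_segments_file(workflow):
    # Hedged usage sketch (illustrative, not part of the original module):
    # build a science segment file from a flag-string option. Assumes the
    # ini file has, per ifo, something like
    #   [workflow-segments]
    #   segments-science = +H1:DMT-ANALYSIS_READY:1
    # where the option name 'segments-science', the flag string and the
    # 'segments' output directory are placeholders for this example.
    science_seg_file = get_segments_file(workflow, 'science',
                                         'segments-science', 'segments')
    return science_seg_file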