def setup_datafind_workflow(workflow, scienceSegs, outputDir, segFilesList,
                            tag=None):
    """
    Setup the datafind section of the workflow. This section is responsible
    for generating, or setting up the workflow to generate, a list of files
    that record the location of the frame files needed to perform the
    analysis. There could be multiple options here: the datafind jobs could
    be done at run time or could be put into a dag. The subsequent jobs will
    know what was done here from the OutFileList containing the datafind
    jobs (and the Dagman nodes if appropriate). For now the only implemented
    option is to generate the datafind files at runtime. This module can
    also check if the frame files actually exist, check whether the obtained
    segments line up with the original ones, and update the science segments
    to reflect missing data files.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The workflow class that stores the jobs that will be run.
    scienceSegs : dictionary of ifo-keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse.
    outputDir : path
        All output files written by datafind processes will be written to
        this directory.
    segFilesList : list of the files returned by segment_utils
        This contains representations of the various segment files that were
        constructed at the segment generation stage of the workflow. This
        will be used for the segment_summary test, or if any of the other
        tests are given "update_times" (and can be given a value of None
        otherwise).
    tag : string, optional (default=None)
        Use this to specify a tag. This can be used if this module is being
        called more than once to give call specific configuration (by
        setting options in [workflow-datafind-${TAG}] rather than
        [workflow-datafind]). This is also used to tag the Files returned by
        the class to uniqueify the Files and uniqueify the actual filename.
        FIXME: Filenames may not be unique with current codes!

    Returns
    -------
    datafindOuts : OutGroupList
        List of all the datafind output files for use later in the pipeline.
    scienceSegs : dictionary of ifo-keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse. If
        the updateSegmentTimes kwarg is given this will be updated to
        reflect any instances of missing data.
    """
    logging.info("Entering datafind module")
    make_analysis_dir(outputDir)
    cp = workflow.cp

    # Parse for options in ini file
    datafindMethod = cp.get_opt_tags("workflow-datafind",
                                     "datafind-method", [tag])
    if cp.has_option_tags("workflow-datafind",
                          "datafind-check-segment-gaps", [tag]):
        checkSegmentGaps = cp.get_opt_tags("workflow-datafind",
                                     "datafind-check-segment-gaps", [tag])
    else:
        checkSegmentGaps = "no_test"
    if cp.has_option_tags("workflow-datafind",
                          "datafind-check-frames-exist", [tag]):
        checkFramesExist = cp.get_opt_tags("workflow-datafind",
                                     "datafind-check-frames-exist", [tag])
    else:
        checkFramesExist = "no_test"
    if cp.has_option_tags("workflow-datafind",
                          "datafind-check-segment-summary", [tag]):
        checkSegmentSummary = cp.get_opt_tags("workflow-datafind",
                                     "datafind-check-segment-summary", [tag])
    else:
        checkSegmentSummary = "no_test"

    logging.info("Starting datafind with setup_datafind_runtime_generated")
    if datafindMethod == "AT_RUNTIME_MULTIPLE_CACHES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_cache_multi_calls_perifo(cp, scienceSegs,
                                                         outputDir, tag=tag)
    elif datafindMethod == "AT_RUNTIME_SINGLE_CACHES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_cache_single_call_perifo(cp, scienceSegs,
                                                         outputDir, tag=tag)
    elif datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_frames_multi_calls_perifo(cp, scienceSegs,
                                                         outputDir, tag=tag)
    elif datafindMethod == "AT_RUNTIME_SINGLE_FRAMES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_frames_single_call_perifo(cp, scienceSegs,
                                                         outputDir, tag=tag)
    elif datafindMethod == "FROM_PREGENERATED_LCF_FILES":
        ifos = scienceSegs.keys()
        datafindcaches, datafindouts = \
            setup_datafind_from_pregenerated_lcf_files(cp, ifos, outputDir,
                                                       tag=tag)
    else:
        msg = "Entry datafind-method in [workflow-datafind] does not have "
        msg += "expected value. Valid values are "
        msg += "AT_RUNTIME_MULTIPLE_FRAMES, AT_RUNTIME_SINGLE_FRAMES, "
        msg += "AT_RUNTIME_MULTIPLE_CACHES, AT_RUNTIME_SINGLE_CACHES or "
        msg += "FROM_PREGENERATED_LCF_FILES. "
        msg += "Consult the documentation for more info."
        raise ValueError(msg)

    using_backup_server = False
    if datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES" or \
            datafindMethod == "AT_RUNTIME_SINGLE_FRAMES":
        if cp.has_option_tags("workflow-datafind",
                              "datafind-backup-datafind-server", [tag]):
            using_backup_server = True
            backup_server = cp.get_opt_tags("workflow-datafind",
                                   "datafind-backup-datafind-server", [tag])
            cp_new = copy.deepcopy(cp)
            cp_new.set("workflow-datafind",
                       "datafind-ligo-datafind-server", backup_server)
            cp_new.set("datafind", "urltype", "gsiftp")
            backup_datafindcaches, backup_datafindouts = \
                setup_datafind_runtime_frames_single_call_perifo(cp_new,
                                            scienceSegs, outputDir, tag=tag)
            backup_datafindouts = datafind_keep_unique_backups(
                                         backup_datafindouts, datafindouts)
            datafindcaches.extend(backup_datafindcaches)
            datafindouts.extend(backup_datafindouts)

    logging.info("setup_datafind_runtime_generated completed")

    # If we don't have frame files covering all times we can update the
    # science segments.
    if checkSegmentGaps in ["warn", "update_times", "raise_error"]:
        logging.info("Checking science segments against datafind output...")
        newScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        logging.info("Datafind segments calculated.")
        missingData = False
        msg = "Any errors directly following this message refer to times "
        msg += "that the segment server says are science, but datafind "
        msg += "cannot find frames for:"
        logging.info(msg)
        for ifo in scienceSegs.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                msg = "No input science segments for ifo %s " % (ifo)
                msg += "so, surprisingly, no data has been found. "
                msg += "Was this expected?"
                logging.warning(msg)
                continue
            if ifo not in newScienceSegs:
                msg = "IFO %s's science segments " % (ifo)
                msg += "are completely missing."
                logging.error(msg)
                missingData = True
                if checkSegmentGaps == "update_times":
                    scienceSegs[ifo] = segments.segmentlist()
                continue
            missing = scienceSegs[ifo] - newScienceSegs[ifo]
            if abs(missing):
                msg = "From ifo %s we are missing frames covering:" % (ifo)
                msg += "\n%s" % "\n".join(map(str, missing))
                missingData = True
                logging.error(msg)
                if checkSegmentGaps == "update_times":
                    # Remove missing time, so that we can carry on if
                    # desired
                    logging.info("Updating science times for ifo %s."
                                 % (ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missing
        if checkSegmentGaps == "raise_error" and missingData:
            raise ValueError("Workflow cannot find needed data, exiting.")
        logging.info("Done checking, any discrepancies are reported above.")
    elif checkSegmentGaps == "no_test":
        # Do nothing
        pass
    else:
        errMsg = "checkSegmentGaps kwArg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Do all of the frame files that were returned actually exist?
    if checkFramesExist in ["warn", "update_times", "raise_error"]:
        logging.info("Verifying that all frames exist on disk.")
        missingFrSegs, missingFrames = \
            get_missing_segs_from_frame_file_cache(datafindcaches)
        missingFlag = False
        for ifo in missingFrames.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                continue
            # If using a backup server, does the frame exist remotely?
            if using_backup_server:
                # WARNING: This will be slow, but hopefully it will not
                #          occur for too many frames. This could be
                #          optimized if it becomes necessary.
                new_list = []
                for frame in missingFrames[ifo]:
                    for dfout in datafindouts:
                        dfout_pfns = list(dfout.pfns)
                        dfout_urls = [a.url for a in dfout_pfns]
                        if frame.url in dfout_urls:
                            pfn = dfout_pfns[dfout_urls.index(frame.url)]
                            dfout.removePFN(pfn)
                            if len(dfout.pfns) == 0:
                                new_list.append(frame)
                            else:
                                msg = "Frame %s not found locally. " \
                                      % (frame.url,)
                                msg += "Replacing with remote url(s) "
                                msg += "%s." \
                                       % (str([a.url for a in dfout.pfns]),)
                                logging.info(msg)
                            break
                    else:
                        new_list.append(frame)
                missingFrames[ifo] = new_list
            if missingFrames[ifo]:
                msg = "From ifo %s we are missing the following frames:" \
                      % (ifo)
                msg += "\n%s" % "\n".join([a.url for a in
                                           missingFrames[ifo]])
                missingFlag = True
                logging.error(msg)
                if checkFramesExist == "update_times":
                    # Remove missing times, so that we can carry on if
                    # desired
                    logging.info("Updating science times for ifo %s."
                                 % (ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missingFrSegs[ifo]
        if checkFramesExist == "raise_error" and missingFlag:
            raise ValueError("Workflow cannot find all frames, exiting.")
        logging.info("Finished checking frames.")
    elif checkFramesExist == "no_test":
        # Do nothing
        pass
    else:
        errMsg = "checkFramesExist kwArg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Check if there are cases where frames exist, but no entry in the
    # segment summary table is present.
    if checkSegmentSummary in ["warn", "raise_error"]:
        logging.info("Checking the segment summary table against frames.")
        dfScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        missingFlag = False
        for ifo in dfScienceSegs.keys():
            scienceFile = segFilesList.find_output_with_ifo(ifo)
            scienceFile = scienceFile.find_output_with_tag("SCIENCE")
            if not len(scienceFile) == 1:
                errMsg = "Did not find exactly 1 science file."
                raise ValueError(errMsg)
            scienceFile = scienceFile[0]
            scienceChannel = cp.get("workflow-segments",
                                "segments-%s-science-name" % (ifo.lower()))
            segSummaryTimes = get_segment_summary_times(scienceFile,
                                                        scienceChannel)
            missing = dfScienceSegs[ifo] - segSummaryTimes
            scienceButNotFrame = scienceSegs[ifo] - dfScienceSegs[ifo]
            missing2 = scienceSegs[ifo] - scienceButNotFrame
            missing2 = missing2 - segSummaryTimes
            if abs(missing):
                msg = "From ifo %s the following times have frames, " \
                      % (ifo)
                msg += "but are not covered in the segment summary table."
                msg += "\n%s" % "\n".join(map(str, missing))
                logging.error(msg)
                missingFlag = True
            if abs(missing2):
                msg = "From ifo %s the following times have frames, " \
                      % (ifo)
                msg += "are science, and are not covered in the segment "
                msg += "summary table."
                msg += "\n%s" % "\n".join(map(str, missing2))
                logging.error(msg)
                missingFlag = True
        if checkSegmentSummary == "raise_error" and missingFlag:
            errMsg = "Segment_summary discrepancy detected, exiting."
            raise ValueError(errMsg)
    elif checkSegmentSummary == "no_test":
        # Do nothing
        pass
    else:
        errMsg = "checkSegmentSummary kwArg must take a value from "
        errMsg += "'no_test', 'warn', or 'raise_error'."
        raise ValueError(errMsg)

    # Now need to create the file for SCIENCE_AVAILABLE
    for ifo in scienceSegs.keys():
        availableSegsFile = os.path.abspath(os.path.join(outputDir,
            "%s-SCIENCE_AVAILABLE_SEGMENTS.xml" % (ifo.upper())))
        currUrl = urlparse.urlunparse(["file", "localhost",
                                       availableSegsFile, None, None, None])
        if tag:
            currTags = [tag, "SCIENCE_AVAILABLE"]
        else:
            currTags = ["SCIENCE_AVAILABLE"]
        currFile = OutSegFile(ifo, "SEGMENTS", workflow.analysis_time,
                              currUrl, segment_list=scienceSegs[ifo],
                              tags=currTags)
        currFile.PFN(availableSegsFile, site="local")
        segFilesList.append(currFile)
        currFile.toSegmentXml()

    logging.info("Leaving datafind module")
    return FileList(datafindouts), scienceSegs
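
# A minimal, illustrative sketch of how this function is typically driven.
# The [workflow-datafind] options shown are exactly the ones parsed above;
# the surrounding script (Workflow construction, prior segment setup and the
# 'datafind' directory name) is an assumption, not part of this module:
#
#   [workflow-datafind]
#   datafind-method = AT_RUNTIME_SINGLE_FRAMES
#   datafind-check-segment-gaps = update_times
#   datafind-check-frames-exist = warn
#   datafind-check-segment-summary = no_test
#
#   datafind_files, science_segs = setup_datafind_workflow(
#       workflow, science_segs, 'datafind', seg_files_list)
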
def setup_segment_gen_mixed(workflow, veto_categories, out_dir,
                            maxVetoAtRunTime, tag=None,
                            generate_coincident_segs=True):
    """
    This function will generate veto files for each ifo and for each veto
    category. It can generate these vetoes at run-time or in the workflow
    (or do some at run-time and some in the workflow). However, the CAT_1
    vetoes and science time must be generated at run time as they are
    needed to plan the workflow. CATs 2 and higher *may* be needed for
    other workflow construction. It can also combine these files to create
    a set of cumulative, multi-detector veto files, which can be used in
    ligolw_thinca and in pipedown. Again these can be created at run time
    or within the workflow.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
        This instance also contains the ifos for which to attempt to obtain
        segments for this analysis and the start and end times to search
        for segments over.
    veto_categories : list of ints
        List of veto categories to generate segments for. If this stops
        being integers, this can be changed here.
    out_dir : path
        The directory in which output will be stored.
    maxVetoAtRunTime : int
        Generate veto files at run time up to this category. Veto
        categories beyond this in veto_categories will be generated in the
        workflow. If we move to a model where veto categories are not
        explicitly cumulative, this will be rethought.
    tag : string, optional (default=None)
        Use this to specify a tag. This can be used if this module is being
        called more than once to give call specific configuration (by
        setting options in [workflow-datafind-${TAG}] rather than
        [workflow-datafind]). This is also used to tag the Files returned
        by the class to uniqueify the Files and uniqueify the actual
        filename.
        FIXME: Filenames may not be unique with current codes!
    generate_coincident_segs : boolean, optional (default = True)
        If True, this module will generate a set of coincident, cumulative
        veto files that can be used with ligolw_thinca and pipedown.

    Returns
    -------
    segFilesList : dictionary of pycbc.workflow.core.SegFile instances
        These are representations of the various segment files that were
        constructed at this stage of the workflow and may be needed at
        later stages of the analysis (e.g. for performing DQ vetoes). If
        the file was generated at run-time the segment lists contained
        within these files will be an attribute of the instance. (If it
        will be generated in the workflow it will not be, because I am not
        psychic.)
    """
    cp = workflow.cp
    segFilesList = FileList([])
    start_time = workflow.analysis_time[0]
    end_time = workflow.analysis_time[1]
    segValidSeg = workflow.analysis_time
    # Will I need to add some jobs to the workflow?
    vetoGenJob = create_segs_from_cats_job(cp, out_dir, workflow.ifo_string)

    for ifo in workflow.ifos:
        logging.info("Generating science segments for ifo %s" % (ifo))
        currSciSegs, currSciXmlFile = get_science_segments(ifo, cp,
                                          start_time, end_time, out_dir,
                                          tag=tag)
        segFilesList.append(currSciXmlFile)

        for category in veto_categories:
            if category > maxVetoAtRunTime:
                msg = "Adding creation of CAT_%d segments " % (category)
                msg += "for ifo %s to workflow." % (ifo)
                logging.info(msg)
                execute_status = False
            else:
                logging.info("Generating CAT_%d segments for ifo %s."
                             % (category, ifo))
                execute_status = True
            currVetoXmlFile = get_veto_segs(workflow, ifo, category,
                                            start_time, end_time, out_dir,
                                            vetoGenJob,
                                            execute_now=execute_status)
            segFilesList.append(currVetoXmlFile)
            # Store the CAT_1 veto segs for use below
            if category == 1:
                # Yes, it's yucky to generate a file and then read it back
                # in. This will be fixed when the new API for segment
                # generation is ready.
                vetoXmlFP = open(currVetoXmlFile.storage_path, 'r')
                cat1Segs = fromsegmentxml(vetoXmlFP)
                vetoXmlFP.close()

        analysedSegs = currSciSegs - cat1Segs
        analysedSegs.coalesce()
        analysedXmlFile = os.path.join(out_dir,
                              "%s-SCIENCE_OK_SEGMENTS.xml" % (ifo.upper()))
        currUrl = urlparse.urlunparse(['file', 'localhost', analysedXmlFile,
                                       None, None, None])
        if tag:
            currTags = [tag, 'SCIENCE_OK']
        else:
            currTags = ['SCIENCE_OK']
        currFile = OutSegFile(ifo, 'SEGMENTS', segValidSeg, currUrl,
                              segment_list=analysedSegs, tags=currTags)
        segFilesList.append(currFile)
        currFile.toSegmentXml()

    if generate_coincident_segs:
        # Need to make some combined category veto files to use when
        # vetoing segments and triggers.
        ifo_string = workflow.ifo_string
        categories = []
        cum_cat_files = []
        for category in veto_categories:
            categories.append(category)
            # Set file name in workflow standard
            if tag:
                currTags = [tag, 'CUMULATIVE_CAT_%d' % (category)]
            else:
                currTags = ['CUMULATIVE_CAT_%d' % (category)]
            cumulativeVetoFile = os.path.join(out_dir,
                '%s-CUMULATIVE_CAT_%d_VETO_SEGMENTS.xml'
                % (ifo_string, category))
            currUrl = urlparse.urlunparse(['file', 'localhost',
                                           cumulativeVetoFile, None, None,
                                           None])
            currSegFile = OutSegFile(ifo_string, 'SEGMENTS', segValidSeg,
                                     currUrl, tags=currTags)
            # And actually make the file (or queue it in the workflow)
            logging.info("Generating combined, cumulative CAT_%d segments."
                         % (category))
            if category <= maxVetoAtRunTime:
                execute_status = True
            else:
                execute_status = False
            get_cumulative_segs(workflow, currSegFile, categories,
                                segFilesList, out_dir,
                                execute_now=execute_status)
            segFilesList.append(currSegFile)
            cum_cat_files.append(currSegFile)

        # Create a combined file
        # Set file tag in workflow standard
        if tag:
            currTags = [tag, 'COMBINED_CUMULATIVE_SEGMENTS']
        else:
            currTags = ['COMBINED_CUMULATIVE_SEGMENTS']
        combined_veto_file = os.path.join(out_dir,
            '%s-CUMULATIVE_ALL_CATS_SEGMENTS.xml' % (ifo_string))
        curr_url = urlparse.urlunparse(['file', 'localhost',
                                        combined_veto_file, None, None,
                                        None])
        curr_file = OutSegFile(ifo_string, 'SEGMENTS', segValidSeg,
                               curr_url, tags=currTags)
        # Run now only if at least one category is generated at run time
        for category in veto_categories:
            if category <= maxVetoAtRunTime:
                execute_status = True
                break
        else:
            execute_status = False
        add_cumulative_files(workflow, curr_file, cum_cat_files, out_dir,
                             execute_now=execute_status)
        segFilesList.append(curr_file)

    return segFilesList
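
# Hedged usage sketch: generating CAT_1 and CAT_2 vetoes at run time and
# leaving higher categories to the workflow. The Workflow construction and
# the 'segments' directory name are assumptions; only the call signature
# comes from this module:
#
#   seg_files = setup_segment_gen_mixed(workflow, [1, 2, 3, 4], 'segments',
#                                       maxVetoAtRunTime=2)
#
# With maxVetoAtRunTime=2, the CAT_1/CAT_2 files are generated immediately
# (execute_now=True), while CAT_3/CAT_4 generation is added to the workflow
# as nodes to be executed later.
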
def get_triggered_coherent_segment(workflow, out_dir, sciencesegs, tag=None):
    """
    Construct the coherent network on and off source segments.

    Parameters
    ----------
    workflow : pycbc.workflow.core.Workflow
        The workflow instance that the coincidence jobs will be added to.
        This instance also contains the ifos for which to attempt to obtain
        segments for this analysis and the start and end times to search
        for segments over.
    out_dir : path
        The directory in which output will be stored.
    sciencesegs : dictionary
        Dictionary of science segments produced by
        ahope.setup_segment_generation()
    tag : string, optional (default=None)
        Use this to specify a tag.

    Returns
    -------
    onsource : glue.segments.segmentlistdict
        A dictionary containing the on source segments for network IFOs
    offsource : glue.segments.segmentlistdict
        A dictionary containing the off source segments for network IFOs
    """
    logging.info("Calculating optimal coherent segment.")

    # Load parsed workflow config options
    cp = workflow.cp
    ra = float(os.path.basename(cp.get('workflow', 'ra')))
    dec = float(os.path.basename(cp.get('workflow', 'dec')))
    triggertime = int(os.path.basename(cp.get('workflow', 'trigger-time')))
    minbefore = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                            'min-before')))
    minafter = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                           'min-after')))
    minduration = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                              'min-duration')))
    maxduration = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                              'max-duration')))
    onbefore = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                           'on-before')))
    onafter = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                          'on-after')))
    padding = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                          'pad-data')))
    quanta = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                         'quanta')))

    # Check that the available data segments meet the criteria specified
    # in the options above
    sciencesegs = segments.segmentlistdict(sciencesegs)
    sciencesegs = sciencesegs.extract_common(sciencesegs.keys())
    if triggertime not in sciencesegs[sciencesegs.keys()[0]]:
        logging.error("Trigger is not contained within any available "
                      "segment. Exiting.")
        sys.exit()

    offsrclist = sciencesegs[sciencesegs.keys()[0]]
    if len(offsrclist) > 1:
        logging.info("Removing network segments that do not contain trigger "
                     "time")
        for seg in offsrclist:
            if triggertime in seg:
                offsrc = seg
    else:
        offsrc = offsrclist[0]

    if (triggertime - minbefore - padding not in offsrc) or (
            triggertime + minafter + padding not in offsrc):
        logging.error("Not enough data either side of trigger time. "
                      "Exiting.")
        sys.exit()

    if abs(offsrc) < minduration + 2 * padding:
        logging.error("Available network segment shorter than minimum "
                      "allowed duration. Exiting.")
        sys.exit()

    # Will segment duration be the maximum desired length or not?
    if abs(offsrc) >= maxduration + 2 * padding:
        logging.info("Available network science segment duration (%ds) is "
                     "greater than the maximum allowed segment length "
                     "(%ds). Truncating..." % (abs(offsrc), maxduration))
    else:
        logging.info("Available network science segment duration (%ds) is "
                     "less than the maximum allowed segment length (%ds)."
                     % (abs(offsrc), maxduration))

    logging.info("%ds of padding applied at beginning and end of segment."
                 % padding)

    # Maximal, centred coherent network segment
    idealsegment = segments.segment(
        int(triggertime - padding - 0.5 * maxduration),
        int(triggertime + padding + 0.5 * maxduration))

    # Construct off-source
    if idealsegment in offsrc:
        offsrc = idealsegment
    elif idealsegment[1] not in offsrc:
        offsrc &= segments.segment(offsrc[1] - maxduration - 2 * padding,
                                   offsrc[1])
    elif idealsegment[0] not in offsrc:
        offsrc &= segments.segment(offsrc[0],
                                   offsrc[0] + maxduration + 2 * padding)

    # Trimming off-source
    excess = abs(offsrc) % quanta - 2 * padding
    if excess != 0:
        logging.info("Trimming %ds excess time to make OFF-SOURCE duration "
                     "a multiple of %ds" % (excess, quanta))
        offset = (offsrc[0] + abs(offsrc) / 2.) - triggertime
        if 2 * abs(offset) > excess:
            if offset < 0:
                offsrc &= segments.segment(offsrc[0] + excess, offsrc[1])
            elif offset > 0:
                offsrc &= segments.segment(offsrc[0], offsrc[1] - excess)
            assert abs(offsrc) % quanta == 2 * padding
        else:
            logging.info("This will make OFF-SOURCE symmetrical about "
                         "trigger time.")
            offsrc = segments.segment(offsrc[0] - offset + excess / 2,
                                      offsrc[1] - offset - excess / 2)
            assert abs(offsrc) % quanta == 2 * padding

    logging.info("Constructed OFF-SOURCE: duration %ds (%ds before to %ds "
                 "after trigger)."
                 % (abs(offsrc) - 2 * padding,
                    triggertime - offsrc[0] - padding,
                    offsrc[1] - triggertime - padding))
    offsrc = segments.segmentlist([offsrc])

    # Construct on-source
    onsrc = segments.segment(triggertime - onbefore,
                             triggertime + onafter)
    logging.info("Constructed ON-SOURCE: duration %ds (%ds before to %ds "
                 "after trigger)."
                 % (abs(onsrc), triggertime - onsrc[0],
                    onsrc[1] - triggertime))
    onsrc = segments.segmentlist([onsrc])

    # Put segments into segmentlistdicts. These must be two distinct
    # objects: assigning onsource = offsource = segmentlistdict() would
    # alias them, and the off-source assignments below would overwrite
    # the on-source ones.
    onsource = segments.segmentlistdict()
    offsource = segments.segmentlistdict()
    ifos = ''
    for iifo in sciencesegs.keys():
        ifos += str(iifo)
        onsource[iifo] = onsrc
        offsource[iifo] = offsrc

    # Write off-source to xml file
    XmlFile = os.path.join(out_dir,
                           "%s-COH_OFFSOURCE_SEGMENT.xml" % ifos.upper())
    currUrl = urlparse.urlunparse(['file', 'localhost', XmlFile, None,
                                   None, None])
    currFile = OutSegFile(ifos, 'COH-OFFSOURCE', offsource[iifo], currUrl,
                          offsource[iifo])
    currFile.toSegmentXml()
    logging.info("Optimal coherent segment calculated.")

    return onsource, offsource
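
# Worked example of the off-source trimming arithmetic above (illustrative
# numbers, not defaults): with maxduration=600, padding=8 and quanta=64,
# the ideal segment is 600 + 2*8 = 616 s long. If it fits inside science
# time, excess = 616 % 64 - 2*8 = 24 s. For a segment centred on the
# trigger (offset = 0) the symmetric branch trims excess/2 = 12 s from each
# end, leaving 592 s with 592 % 64 == 16 == 2 * padding, satisfying the
# assertion above.
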