Example #1
def setup_datafind_workflow(workflow, scienceSegs, outputDir, segFilesList, tag=None):
    """
    Set up the datafind section of the workflow. This section is responsible
    for generating, or setting up the workflow to generate, a list of files
    that record the location of the frame files needed to perform the
    analysis. There are multiple options here: the datafind jobs can be run
    at workflow construction time or added to the DAG. Subsequent jobs will
    know what was done here from the OutFileList containing the datafind
    jobs (and the Dagman nodes if appropriate).
    For now the only implemented option is to generate the datafind files at
    runtime. This module can also check whether the frame files actually
    exist, check whether the obtained segments line up with the original
    ones, and update the science segments to reflect missing data files.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        The workflow class that stores the jobs that will be run.
    scienceSegs : Dictionary of ifo keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse.
    outputDir : path
        All output files written by datafind processes will be written to this
        directory.
    segFilesList : List of the files returned by segment_utils
        This contains representations of the various segment files that were
        constructed at the segment generation stage of the workflow. This will
        be used for the segment_summary test, or if any of the other tests are
        given "update_times" (and can be given a value of None otherwise).
    tag : string, optional (default=None)
        Use this to specify a tag. This can be used if this module is being
        called more than once to give call-specific configuration (by setting
        options in [workflow-datafind-${TAG}] rather than [workflow-datafind]).
        This is also used to tag the Files returned by the class, making both
        the Files and the actual filenames unique.
        FIXME: Filenames may not be unique with current codes!

    Returns
    --------
    datafindOuts : OutGroupList
        List of all the datafind output files for use later in the pipeline.
    scienceSegs : Dictionary of ifo keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse. If
        any of the check options are given "update_times" this will be
        updated to reflect any instances of missing data.
    """
    logging.info("Entering datafind module")
    make_analysis_dir(outputDir)
    cp = workflow.cp

    # Parse for options in ini file
    datafindMethod = cp.get_opt_tags("workflow-datafind", "datafind-method", [tag])

    if cp.has_option_tags("workflow-datafind", "datafind-check-segment-gaps", [tag]):
        checkSegmentGaps = cp.get_opt_tags("workflow-datafind", "datafind-check-segment-gaps", [tag])
    else:
        checkSegmentGaps = "no_test"
    if cp.has_option_tags("workflow-datafind", "datafind-check-frames-exist", [tag]):
        checkFramesExist = cp.get_opt_tags("workflow-datafind", "datafind-check-frames-exist", [tag])
    else:
        checkFramesExist = "no_test"
    if cp.has_option_tags("workflow-datafind", "datafind-check-segment-summary", [tag]):
        checkSegmentSummary = cp.get_opt_tags("workflow-datafind", "datafind-check-segment-summary", [tag])
    else:
        checkSegmentSummary = "no_test"

    logging.info("Starting datafind with setup_datafind_runtime_generated")
    if datafindMethod == "AT_RUNTIME_MULTIPLE_CACHES":
        datafindcaches, datafindouts = setup_datafind_runtime_cache_multi_calls_perifo(
            cp, scienceSegs, outputDir, tag=tag
        )
    elif datafindMethod == "AT_RUNTIME_SINGLE_CACHES":
        datafindcaches, datafindouts = setup_datafind_runtime_cache_single_call_perifo(
            cp, scienceSegs, outputDir, tag=tag
        )
    elif datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES":
        datafindcaches, datafindouts = setup_datafind_runtime_frames_multi_calls_perifo(
            cp, scienceSegs, outputDir, tag=tag
        )
    elif datafindMethod == "AT_RUNTIME_SINGLE_FRAMES":
        datafindcaches, datafindouts = setup_datafind_runtime_frames_single_call_perifo(
            cp, scienceSegs, outputDir, tag=tag
        )

    elif datafindMethod == "FROM_PREGENERATED_LCF_FILES":
        ifos = scienceSegs.keys()
        datafindcaches, datafindouts = setup_datafind_from_pregenerated_lcf_files(cp, ifos, outputDir, tag=tag)
    else:
        msg = "Entry datafind-method in [workflow-datafind] does not have "
        msg += "expected value. Valid values are "
        msg += "AT_RUNTIME_MULTIPLE_FRAMES, AT_RUNTIME_SINGLE_FRAMES "
        msg += "AT_RUNTIME_MULTIPLE_CACHES or AT_RUNTIME_SINGLE_CACHES. "
        msg += "Consult the documentation for more info."
        raise ValueError(msg)

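    # Optionally query a backup datafind server for the frame-file methods.
    # Any unique remote (gsiftp) URLs it returns are kept alongside the
    # primary results, so the frames-exist check below can fall back to a
    # remote copy when a frame is missing locally.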
    using_backup_server = False
    if datafindMethod in ["AT_RUNTIME_MULTIPLE_FRAMES", "AT_RUNTIME_SINGLE_FRAMES"]:
        if cp.has_option_tags("workflow-datafind", "datafind-backup-datafind-server", [tag]):
            using_backup_server = True
            backup_server = cp.get_opt_tags("workflow-datafind", "datafind-backup-datafind-server", [tag])
            cp_new = copy.deepcopy(cp)
            cp_new.set("workflow-datafind", "datafind-ligo-datafind-server", backup_server)
            cp_new.set("datafind", "urltype", "gsiftp")
            backup_datafindcaches, backup_datafindouts = setup_datafind_runtime_frames_single_call_perifo(
                cp_new, scienceSegs, outputDir, tag=tag
            )
            backup_datafindouts = datafind_keep_unique_backups(backup_datafindouts, datafindouts)
            datafindcaches.extend(backup_datafindcaches)
            datafindouts.extend(backup_datafindouts)

    logging.info("setup_datafind_runtime_generated completed")
    # If we don't have frame files covering all times we can update the science
    # segments.
    if checkSegmentGaps in ["warn", "update_times", "raise_error"]:
        logging.info("Checking science segments against datafind output...")
        newScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        logging.info("Datafind segments calculated...")
        missingData = False
        msg = "Any errors directly following this message refer to times that"
        msg += " the segment server says are science, but datafind cannot"
        msg += " find frames for:"
        logging.info(msg)
        for ifo in scienceSegs.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                msg = "No input science segments for ifo %s " % (ifo)
                msg += "so, surprisingly, no data has been found. "
                msg += "Was this expected?"
                logging.warning(msg)
                continue
            if ifo not in newScienceSegs:
                msg = "IFO %s's science segments " % (ifo)
                msg += "are completely missing."
                logging.error(msg)
                missingData = True
                if checkSegmentGaps == "update_times":
                    scienceSegs[ifo] = segments.segmentlist()
                continue
            missing = scienceSegs[ifo] - newScienceSegs[ifo]
            if abs(missing):
                msg = "From ifo %s we are missing frames covering:" % (ifo)
                msg += "\n%s" % "\n".join(map(str, missing))
                missingData = True
                logging.error(msg)
                if checkSegmentGaps == "update_times":
                    # Remove missing time, so that we can carry on if desired
                    logging.info("Updating science times for ifo %s." % (ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missing

        if checkSegmentGaps == "raise_error" and missingData:
            raise ValueError("Workflow cannot find needed data, exiting.")
        logging.info("Done checking, any discrepancies are reported above.")
    elif checkSegmentGaps == "no_test":
        # Do nothing
        pass
    else:
        errMsg = "checkSegmentGaps kwArg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Do all of the frame files that were returned actually exist?
    if checkFramesExist in ["warn", "update_times", "raise_error"]:
        logging.info("Verifying that all frames exist on disk.")
        missingFrSegs, missingFrames = get_missing_segs_from_frame_file_cache(datafindcaches)
        missingFlag = False
        for ifo in missingFrames.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                continue
            # If using a backup server, does the frame exist remotely?
            if using_backup_server:
                # WARNING: This will be slow, but hopefully it will not occur
                #          for too many frames. This could be optimized if
                #          it becomes necessary.
                new_list = []
                for frame in missingFrames[ifo]:
                    for dfout in datafindouts:
                        dfout_pfns = list(dfout.pfns)
                        dfout_urls = [a.url for a in dfout_pfns]
                        if frame.url in dfout_urls:
                            pfn = dfout_pfns[dfout_urls.index(frame.url)]
                            dfout.removePFN(pfn)
                            if len(dfout.pfns) == 0:
                                new_list.append(frame)
                            else:
                                msg = "Frame %s not found locally. " % (frame.url,)
                                msg += "Replacing with remote url(s) "
                                msg += "%s." % (str([a.url for a in dfout.pfns]),)
                                logging.info(msg)
                            break
                    else:
                        new_list.append(frame)
                missingFrames[ifo] = new_list
            if missingFrames[ifo]:
                msg = "From ifo %s we are missing the following frames:" % (ifo)
                msg += "\n".join([a.url for a in missingFrames[ifo]])
                missingFlag = True
                logging.error(msg)
            if checkFramesExist == "update_times":
                # Remove missing times, so that we can carry on if desired
                logging.info("Updating science times for ifo %s." % (ifo))
                scienceSegs[ifo] = scienceSegs[ifo] - missingFrSegs[ifo]

        if checkFramesExist == "raise_error" and missingFlag:
            raise ValueError("Workflow cannot find all frames, exiting.")
        logging.info("Finished checking frames.")
    elif checkFramesExist == "no_test":
        # Do nothing
        pass
    else:
        errMsg = "checkFramesExist kwArg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Check if there are cases where frames exist, but no entry in the
    # segment summary table is present.
    if checkSegmentSummary in ["warn", "raise_error"]:
        logging.info("Checking the segment summary table against frames.")
        dfScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        missingFlag = False
        for ifo in dfScienceSegs.keys():
            scienceFile = segFilesList.find_output_with_ifo(ifo)
            scienceFile = scienceFile.find_output_with_tag("SCIENCE")
            if not len(scienceFile) == 1:
                errMsg = "Did not find exactly 1 science file."
                raise ValueError(errMsg)
            scienceFile = scienceFile[0]

            scienceChannel = cp.get("workflow-segments", "segments-%s-science-name" % (ifo.lower()))
            segSummaryTimes = get_segment_summary_times(scienceFile, scienceChannel)
            # Times for which there are frames but no segment summary entry
            missing = dfScienceSegs[ifo] - segSummaryTimes
            # Times that are science and have frames but no segment summary
            # entry
            scienceButNotFrame = scienceSegs[ifo] - dfScienceSegs[ifo]
            missing2 = scienceSegs[ifo] - scienceButNotFrame
            missing2 = missing2 - segSummaryTimes
            if abs(missing):
                msg = "From ifo %s the following times have frames, " % (ifo)
                msg += "but are not covered in the segment summary table."
                msg += "\n%s" % "\n".join(map(str, missing))
                logging.error(msg)
                missingFlag = True
            if abs(missing2):
                msg = "From ifo %s the following times have frames, " % (ifo)
                msg += "are science, and are not covered in the segment "
                msg += "summary table."
                msg += "\n%s" % "\n".join(map(str, missing2))
                logging.error(msg)
                missingFlag = True
        if checkSegmentSummary == "raise_error" and missingFlag:
            errMsg = "Segment_summary discrepancy detected, exiting."
            raise ValueError(errMsg)
    elif checkSegmentSummary == "no_test":
        # Do nothing
        pass
    else:
        errMsg = "checkSegmentSummary kwArg must take a value from 'no_test', "
        errMsg += "'warn', or 'raise_error'."
        raise ValueError(errMsg)

    # Now need to create the file for SCIENCE_AVAILABLE
    for ifo in scienceSegs.keys():
        availableSegsFile = os.path.abspath(
            os.path.join(outputDir, "%s-SCIENCE_AVAILABLE_SEGMENTS.xml" % (ifo.upper()))
        )
        currUrl = urlparse.urlunparse(["file", "localhost", availableSegsFile, None, None, None])
        if tag:
            currTags = [tag, "SCIENCE_AVAILABLE"]
        else:
            currTags = ["SCIENCE_AVAILABLE"]
        currFile = OutSegFile(
            ifo, "SEGMENTS", workflow.analysis_time, currUrl, segment_list=scienceSegs[ifo], tags=currTags
        )
        currFile.PFN(availableSegsFile, site="local")
        segFilesList.append(currFile)
        currFile.toSegmentXml()

    logging.info("Leaving datafind module")
    return FileList(datafindouts), scienceSegs
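
For reference, the options read by this function live in the workflow ini file. A minimal sketch of the relevant section is shown below; the method and check values are ones the code above accepts, and the backup server entry (with its placeholder address) is only read for the AT_RUNTIME_*_FRAMES methods:

[workflow-datafind]
datafind-method = AT_RUNTIME_SINGLE_FRAMES
datafind-check-segment-gaps = update_times
datafind-check-frames-exist = warn
datafind-check-segment-summary = no_test
; datafind-backup-datafind-server = <backup server address>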
Example #2
def setup_segment_gen_mixed(workflow, veto_categories, out_dir, 
                            maxVetoAtRunTime, tag=None,
                            generate_coincident_segs=True):
    """
    This function will generate veto files for each ifo and for each veto
    category.
    It can generate these vetoes at run-time or in the workflow (or do some at
    run-time and some in the workflow). However, the CAT_1 vetoes and science
    time must be generated at run time as they are needed to plan the workflow.
    CATs 2 and higher *may* be needed for other workflow construction.
    It can also combine these files to create a set of cumulative,
    multi-detector veto files, which can be used in ligolw_thinca and in
    pipedown. Again these can be created at run time or within the workflow.

    Parameters
    -----------
    workflow : pycbc.workflow.core.Workflow
        The Workflow instance that the coincidence jobs will be added to.
        This instance also contains the ifos for which to attempt to obtain
        segments for this analysis and the start and end times to search for
        segments over.
    veto_categories : list of ints
        List of veto categories to generate segments for. These are assumed
        to be integers; if that ever stops being the case, this is the place
        to change it.
    out_dir : path
        The directory in which output will be stored.    
    maxVetoAtRunTime : int
        Generate veto files at run time up to this category. Veto categories
        beyond this in veto_categories will be generated in the workflow.
        If we move to a model where veto categories are not explicitly
        cumulative, this will be rethought.
    tag : string, optional (default=None)
        Use this to specify a tag. This can be used if this module is being
        called more than once to give call-specific configuration (by setting
        options in [workflow-datafind-${TAG}] rather than [workflow-datafind]).
        This is also used to tag the Files returned by the class, making both
        the Files and the actual filenames unique.
        FIXME: Filenames may not be unique with current codes!
    generate_coincident_segs : boolean, optional (default = True)
        If given this module will generate a set of coincident, cumulative veto
        files that can be used with ligolw_thinca and pipedown.

    Returns
    -------
    segFilesList : pycbc.workflow.core.FileList of SegFile instances
        These are representations of the various segment files that were
        constructed at this stage of the workflow and may be needed at later
        stages of the analysis (e.g. for performing DQ vetoes). If the file
        was generated at run-time the segment lists contained within these
        files will be an attribute of the instance. (If it will be generated
        in the workflow it will not be, because I am not psychic.)
    """
    cp = workflow.cp
    segFilesList = FileList([])
    start_time = workflow.analysis_time[0]
    end_time = workflow.analysis_time[1]
    segValidSeg = workflow.analysis_time
    # Will I need to add some jobs to the workflow?
    vetoGenJob = create_segs_from_cats_job(cp, out_dir, workflow.ifo_string)
    
    for ifo in workflow.ifos:
        logging.info("Generating science segments for ifo %s" %(ifo))
        currSciSegs, currSciXmlFile = get_science_segments(ifo, cp, start_time,
                                                    end_time, out_dir, tag=tag)
        segFilesList.append(currSciXmlFile)

        for category in veto_categories:
            if category > maxVetoAtRunTime:
                msg = "Adding creation of CAT_%d segments " % (category)
                msg += "for ifo %s to workflow." % (ifo)
                logging.info(msg)
                execute_status = False
            else:
                logging.info("Generating CAT_%d segments for ifo %s."
                             % (category, ifo))
                execute_status = True

            currVetoXmlFile = get_veto_segs(workflow, ifo, category,
                                            start_time, end_time, out_dir,
                                            vetoGenJob,
                                            execute_now=execute_status)

            segFilesList.append(currVetoXmlFile) 
            # Store the CAT_1 veto segs for use below
            if category == 1:
                # Yes, it's yucky to generate a file and then read it back
                # in. This will be fixed when the new API for segment
                # generation is ready.
                with open(currVetoXmlFile.storage_path, 'r') as vetoXmlFP:
                    cat1Segs = fromsegmentxml(vetoXmlFP)
                
        analysedSegs = currSciSegs - cat1Segs
        analysedSegs.coalesce()
        analysedXmlFile = os.path.join(out_dir,
                             "%s-SCIENCE_OK_SEGMENTS.xml" %(ifo.upper()) )
        currUrl = urlparse.urlunparse(['file', 'localhost', analysedXmlFile,
                          None, None, None])
        if tag:
            currTags = [tag, 'SCIENCE_OK']
        else:
            currTags = ['SCIENCE_OK']
        currFile = OutSegFile(ifo, 'SEGMENTS',
                              segValidSeg, currUrl, segment_list=analysedSegs,
                              tags=currTags)
        segFilesList.append(currFile)
        currFile.toSegmentXml()


    if generate_coincident_segs:
        # Need to make some combined category veto files to use when vetoing
        # segments and triggers.
        ifo_string = workflow.ifo_string
        categories = []
        cum_cat_files = []
        for category in veto_categories:
            categories.append(category)
            # Set file name in workflow standard
            if tag:
                currTags = [tag, 'CUMULATIVE_CAT_%d' %(category)]
            else:
                currTags = ['CUMULATIVE_CAT_%d' %(category)]

            cumulativeVetoFile = os.path.join(out_dir,
                                   '%s-CUMULATIVE_CAT_%d_VETO_SEGMENTS.xml' \
                                   %(ifo_string, category) )
            currUrl = urlparse.urlunparse(['file', 'localhost',
                                         cumulativeVetoFile, None, None, None])
            currSegFile = OutSegFile(ifo_string, 'SEGMENTS',
                                   segValidSeg, currUrl, tags=currTags)
            # And actually make the file (or queue it in the workflow)
            logging.info("Generating combined, cumulative CAT_%d segments."\
                             %(category))
            if category <= maxVetoAtRunTime:
                execute_status = True
            else:
                execute_status = False
            get_cumulative_segs(workflow, currSegFile, categories,
                                segFilesList, out_dir,
                                execute_now=execute_status)

            segFilesList.append(currSegFile)
            cum_cat_files.append(currSegFile)
        # Create a combined file
        # Set file tag in workflow standard
        if tag:
            currTags = [tag, 'COMBINED_CUMULATIVE_SEGMENTS']
        else:
            currTags = ['COMBINED_CUMULATIVE_SEGMENTS']

        combined_veto_file = os.path.join(out_dir,
                               '%s-CUMULATIVE_ALL_CATS_SEGMENTS.xml' \
                               %(ifo_string) )
        curr_url = urlparse.urlunparse(['file', 'localhost',
                                       combined_veto_file, None, None, None])
        curr_file = OutSegFile(ifo_string, 'SEGMENTS',
                               segValidSeg, curr_url, tags=currTags)

        # Execute now only if at least one category is generated at run
        # time. Note the for/else: the else clause runs only when the loop
        # completes without hitting break.
        for category in veto_categories:
            if category <= maxVetoAtRunTime:
                execute_status = True
                break
        else:
            execute_status = False
        add_cumulative_files(workflow, curr_file, cum_cat_files, out_dir,
                             execute_now=execute_status)
        segFilesList.append(curr_file)

    return segFilesList
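
A hypothetical usage sketch, assuming an already-configured Workflow instance wf; the argument values are illustrative, not taken from a real run:

# Hypothetical call: CAT_1 and CAT_2 vetoes are generated immediately
# (science and CAT_1 segments are needed to plan the workflow), while
# CAT_3 and CAT_4 generation is added to the workflow as jobs.
seg_files = setup_segment_gen_mixed(wf, veto_categories=[1, 2, 3, 4],
                                    out_dir='segments',
                                    maxVetoAtRunTime=2,
                                    generate_coincident_segs=True)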
Example #3
def get_triggered_coherent_segment(workflow, out_dir, sciencesegs, tag=None):
    """
    Construct the coherent network on and off source segments.

    Parameters
    -----------
    workflow : pycbc.workflow.core.Workflow
        The workflow instance that the coincidence jobs will be added to.
        This instance also contains the ifos for which to attempt to obtain
        segments for this analysis and the start and end times to search for
        segments over.
    out_dir : path
        The directory in which output will be stored.
    sciencesegs : dictionary
        Dictionary of science segments produced by
        ahope.setup_segment_generation()
    tag : string, optional (default=None)
        Use this to specify a tag.

    Returns
    --------
    onsource : glue.segments.segmentlistdict
        A dictionary containing the on source segments for network IFOs

    offsource : glue.segments.segmentlistdict
        A dictionary containing the off source segments for network IFOs
    """
    logging.info("Calculating optimal coherent segment.")

    # Load parsed workflow config options
    cp = workflow.cp
    ra = float(os.path.basename(cp.get('workflow', 'ra')))
    dec = float(os.path.basename(cp.get('workflow', 'dec')))
    triggertime = int(os.path.basename(cp.get('workflow', 'trigger-time')))
    
    minbefore = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                            'min-before')))
    minafter = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                           'min-after')))
    minduration = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                              'min-duration')))
    maxduration = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                              'max-duration')))
    onbefore = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                           'on-before')))
    onafter = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                          'on-after')))
    padding = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                          'pad-data')))
    quanta = int(os.path.basename(cp.get('workflow-exttrig_segments',
                                         'quanta')))

    # Check available data segments meet criteria specified in arguments
    sciencesegs = segments.segmentlistdict(sciencesegs)
    sciencesegs = sciencesegs.extract_common(sciencesegs.keys())
    if triggertime not in sciencesegs[sciencesegs.keys()[0]]:
        logging.error("Trigger is not contained within any available segment."
                      " Exiting.")
        sys.exit()

    offsrclist = sciencesegs[sciencesegs.keys()[0]]
    if len(offsrclist) > 1:
        logging.info("Removing network segments that do not contain trigger "
                     "time")
        for seg in offsrclist:
            if triggertime in seg:
                offsrc = seg
                break
    else:
        offsrc = offsrclist[0]

    if (triggertime - minbefore - padding not in offsrc) or (
            triggertime + minafter + padding not in offsrc):
        logging.error("Not enough data either side of trigger time. Exiting.")
        sys.exit()

    if abs(offsrc) < minduration + 2 * padding:
        logging.error("Available network segment shorter than minimum allowed "
                      "duration. Exiting.")
        sys.exit()

    # Will segment duration be the maximum desired length or not?
    if abs(offsrc) >= maxduration + 2 * padding:
        logging.info("Available network science segment duration (%ds) is "
                     "greater than the maximum allowed segment length (%ds). "
                     "Truncating..." % (abs(offsrc), maxduration))
    else:
        logging.info("Available network science segment duration (%ds) is "
                     "less than the maximum allowed segment length (%ds)."
                     % (abs(offsrc), maxduration))

    logging.info("%ds of padding applied at beginning and end of segment."
                 % padding)

    # Maximal, centred coherent network segment
    idealsegment = segments.segment(int(triggertime - padding -
                                    0.5 * maxduration),
                                    int(triggertime + padding +
                                    0.5 * maxduration))

    # Construct off-source
    if (idealsegment in offsrc):
        offsrc = idealsegment

    elif idealsegment[1] not in offsrc:
        offsrc &= segments.segment(offsrc[1] - maxduration - 2 * padding,
                                   offsrc[1])

    elif idealsegment[0] not in offsrc:
        offsrc &= segments.segment(offsrc[0],
                                   offsrc[0] + maxduration + 2 * padding)

    # Trimming off-source
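    # Remove any excess so that the analysable part of the off-source
    # segment is an exact multiple of `quanta`. The trim comes off the side
    # further from the trigger or, if the excess allows, the segment is
    # recentred symmetrically about the trigger time.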
    excess = abs(offsrc) % quanta - 2 * padding
    if excess != 0:
        logging.info("Trimming %ds excess time to make OFF-SOURCE duration a "
                     "multiple of %ds" % (excess, quanta))
        offset = (offsrc[0] + abs(offsrc) / 2.) - triggertime
        if 2 * abs(offset) > excess:
            if offset < 0:
                offsrc &= segments.segment(offsrc[0] + excess,
                                           offsrc[1])
            elif offset > 0:
                offsrc &= segments.segment(offsrc[0],
                                           offsrc[1] - excess)
            assert abs(offsrc) % quanta == 2 * padding
        else:
            logging.info("This will make OFF-SOURCE symmetrical about trigger "
                         "time.")
            offsrc = segments.segment(offsrc[0] - offset + excess / 2,
                                      offsrc[1] - offset - excess / 2)
            assert abs(offsrc) % quanta == 2 * padding

    logging.info("Constructed OFF-SOURCE: duration %ds (%ds before to %ds "
                 "after trigger)."
                 % (abs(offsrc) - 2 * padding,
                    triggertime - offsrc[0] - padding,
                    offsrc[1] - triggertime - padding))
    offsrc = segments.segmentlist([offsrc])

    # Construct on-source
    onsrc = segments.segment(triggertime - onbefore,
                             triggertime + onafter)
    logging.info("Constructed ON-SOURCE: duration %ds (%ds before to %ds after"
                 " trigger)."
                 % (abs(onsrc), triggertime - onsrc[0],
                    onsrc[1] - triggertime))
    onsrc = segments.segmentlist([onsrc])

    # Put segments into segmentlistdicts
    # Use two separate dicts: a chained assignment (a = b = dict()) would
    # make onsource and offsource the same object.
    onsource = segments.segmentlistdict()
    offsource = segments.segmentlistdict()
    ifos = ''
    for iifo in sciencesegs.keys():
        ifos += str(iifo)
        onsource[iifo] = onsrc
        offsource[iifo] = offsrc

    # Write off-source to xml file
    XmlFile = os.path.join(out_dir,
                           "%s-COH_OFFSOURCE_SEGMENT.xml" % ifos.upper())
    currUrl = urlparse.urlunparse(['file', 'localhost', XmlFile, None, None,
                                   None])
    currFile = OutSegFile(ifos, 'COH-OFFSOURCE', offsource[iifo], currUrl,
                          segment_list=offsource[iifo])
    currFile.toSegmentXml()
    logging.info("Optimal coherent segment calculated.")

    return onsource, offsource
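
To make the off-source trimming arithmetic concrete, here is a small worked sketch with made-up numbers (300 s of available off-source data, 8 s of padding at each end, 64 s quanta); it reproduces only the excess calculation and the invariant asserted in the code above:

# Hypothetical numbers, not from a real configuration.
quanta, padding, duration = 64, 8, 300
excess = duration % quanta - 2 * padding       # 300 % 64 = 44; 44 - 16 = 28
trimmed = duration - excess                    # 272 s remain after trimming
assert trimmed % quanta == 2 * padding         # 272 % 64 == 16, as asserted
assert (trimmed - 2 * padding) % quanta == 0   # 256 s = 4 * 64 s analysable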