Example 1
def setup_psd_calculate(workflow, frame_files, ifo, segments,
                        segment_name, out_dir, tags=None):
    make_analysis_dir(out_dir)
    tags = [] if not tags else tags
    if workflow.cp.has_option_tags('workflow-psd', 'parallelization-factor', tags=tags):
        num_parts = int(workflow.cp.get_opt_tags('workflow-psd',
                                                 'parallelization-factor',
                                                 tags=tags))
    else:
        num_parts = 1

    # get rid of duplicate segments which happen when splitting the bank
    segments = segmentlist(frozenset(segments))

    segment_lists = list(chunks(segments, num_parts))

    psd_files = FileList([])
    for i, segs in enumerate(segment_lists):
        seg_file = SegFile.from_segment_list('%s_%s' %(segment_name, i),
                         segmentlist(segs), segment_name, ifo,
                         valid_segment=workflow.analysis_time,
                         extension='xml', directory=out_dir)

        psd_files += [make_psd_file(workflow, frame_files, seg_file,
                                    segment_name, out_dir,
                                    tags=tags + ['PART%s' % i])]

    if num_parts > 1:
        return merge_psds(workflow, psd_files, ifo, out_dir, tags=tags)
    else:
        return psd_files[0]
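
The chunks helper used above is not shown in this listing. A minimal sketch, assuming it simply splits a sequence into n roughly equal consecutive slices:

# A sketch (an assumption, not taken from this listing) of the chunks helper:
# yield n successive, roughly equal-sized slices of a sequence.
def chunks(seq, n):
    size, rem = divmod(len(seq), n)
    start = 0
    for i in range(n):
        # the first 'rem' slices get one extra element
        stop = start + size + (1 if i < rem else 0)
        yield seq[start:stop]
        start = stop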
Example 2
def render_default(path, cp):
    """ This is the default function that will render a template to a string of HTML. The
    string will be for a drop-down tab that contains a link to the file.

    If the file extension requires information to be read, then that is passed to the
    content variable (e.g. a segmentlistdict).
    """

    # define filename and slug from path
    filename = os.path.basename(path)
    slug = filename.replace('.', '_')

    # initializations
    content = None

    if path.endswith('.xml') or path.endswith('.xml.gz'):
        # segment or veto files return a segmentlistdict instance
        try:
            wf_file = SegFile.from_segment_xml(path)
            # FIXME: This is a dictionary, but the code wants a segmentlist
            #        for now I just coalesce.
            seg_dict = wf_file.return_union_seglist()
        except Exception as e:
            print('No segment table found in %s: %s' % (path, e))

    # render template
    template_dir = pycbc.results.__path__[0] + '/templates/files'
    env = Environment(loader=FileSystemLoader(template_dir))
    env.globals.update(abs=abs)
    env.globals.update(open=open)
    env.globals.update(path_exists=os.path.exists)
    template = env.get_template('file_default.html')
    context = {
        'path': path,
        'filename': filename,
        'slug': slug,
        'cp': cp,
        'content': content
    }
    output = template.render(context)

    return output
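
A hypothetical usage sketch for render_default; the file name below is a placeholder, and cp is only forwarded to the template context, so any ConfigParser-like object serves for illustration:

# Hypothetical usage sketch: 'H1-SCIENCE_SEGMENTS.xml' is a placeholder path
# and cp is simply passed through to the Jinja2 template context.
try:
    from configparser import ConfigParser   # Python 3
except ImportError:
    from ConfigParser import ConfigParser   # Python 2

cp = ConfigParser()
html_snippet = render_default('H1-SCIENCE_SEGMENTS.xml', cp)
with open('file_tab.html', 'w') as fp:
    fp.write(html_snippet)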
Example 3
def setup_datafind_workflow(workflow, scienceSegs, outputDir, seg_file=None,
                            tags=None):
    """
    Setup datafind section of the workflow. This section is responsible for
    generating, or setting up the workflow to generate, a list of files that
    record the location of the frame files needed to perform the analysis.
    There are several possible approaches: the datafind jobs could be run at
    workflow generation time or could be added to the dag. The subsequent jobs
    will know what was done here from the OutFileList containing the datafind
    jobs (and the Dagman nodes if appropriate).
    For now the only implemented option is to generate the datafind files at
    runtime. This module can also check whether the frame files actually exist,
    check whether the obtained segments line up with the original ones, and
    update the science segments to reflect missing data files.

    Parameters
    ----------
    workflow: pycbc.workflow.core.Workflow
        The workflow class that stores the jobs that will be run.
    scienceSegs : Dictionary of ifo keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse.
    outputDir : path
        All output files written by datafind processes will be written to this
        directory.
    seg_file : SegFile, optional (default=None)
        The file returned by get_science_segments containing the science
        segments and the associated segment_summary. This will
        be used for the segment_summary test and is required if, and only if,
        performing that test.
    tags : list of string, optional (default=None)
        Use this to specify tags. This can be used if this module is being
        called more than once to give call specific configuration (by setting
        options in [workflow-datafind-${TAG}] rather than [workflow-datafind]).
        This is also used to tag the returned Files so that both the File
        objects and the actual filenames are unique.
        FIXME: Filenames may not be unique with current codes!

    Returns
    --------
    datafindOuts : OutGroupList
        List of all the datafind output files for use later in the pipeline.
    sci_avlble_file : SegFile
        SegFile containing the analysable time after checks in the datafind
        module are applied to the input segment list. For production runs this
        is expected to be equal to the input segment list.
    scienceSegs : Dictionary of ifo keyed glue.segment.segmentlist instances
        This contains the times that the workflow is expected to analyse. If
        any of the datafind-check-* options are set to update_times this will
        be updated to reflect any instances of missing data.
    sci_avlble_name : string
        The name with which the analysable time is stored in the
        sci_avlble_file.
    """
    if tags is None:
        tags = []
    logging.info("Entering datafind module")
    make_analysis_dir(outputDir)
    cp = workflow.cp

    # Parse for options in ini file
    datafindMethod = cp.get_opt_tags("workflow-datafind",
                                     "datafind-method", tags)

    if cp.has_option_tags("workflow-datafind",
                          "datafind-check-segment-gaps", tags):
        checkSegmentGaps = cp.get_opt_tags("workflow-datafind",
                                          "datafind-check-segment-gaps", tags)
    else:
        checkSegmentGaps = "no_test"
    if cp.has_option_tags("workflow-datafind",
                          "datafind-check-frames-exist", tags):
        checkFramesExist = cp.get_opt_tags("workflow-datafind",
                                          "datafind-check-frames-exist", tags)
    else:
        checkFramesExist = "no_test"
    if cp.has_option_tags("workflow-datafind",
                          "datafind-check-segment-summary", tags):
        checkSegmentSummary = cp.get_opt_tags("workflow-datafind",
                                       "datafind-check-segment-summary", tags)
    else:
        checkSegmentSummary = "no_test"

    logging.info("Starting datafind with setup_datafind_runtime_generated")
    if datafindMethod == "AT_RUNTIME_MULTIPLE_CACHES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_cache_multi_calls_perifo(cp, scienceSegs,
                                                          outputDir, tags=tags)
    elif datafindMethod == "AT_RUNTIME_SINGLE_CACHES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_cache_single_call_perifo(cp, scienceSegs,
                                                          outputDir, tags=tags)
    elif datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_frames_multi_calls_perifo(cp, scienceSegs,
                                                          outputDir, tags=tags)
    elif datafindMethod == "AT_RUNTIME_SINGLE_FRAMES":
        datafindcaches, datafindouts = \
            setup_datafind_runtime_frames_single_call_perifo(cp, scienceSegs,
                                                          outputDir, tags=tags)

    elif datafindMethod == "FROM_PREGENERATED_LCF_FILES":
        ifos = scienceSegs.keys()
        datafindcaches, datafindouts = \
            setup_datafind_from_pregenerated_lcf_files(cp, ifos,
                                                       outputDir, tags=tags)
    else:
        msg = "Entry datafind-method in [workflow-datafind] does not have "
        msg += "expected value. Valid values are "
        msg += "AT_RUNTIME_MULTIPLE_FRAMES, AT_RUNTIME_SINGLE_FRAMES "
        msg += "AT_RUNTIME_MULTIPLE_CACHES or AT_RUNTIME_SINGLE_CACHES. "
        msg += "Consult the documentation for more info."
        raise ValueError(msg)

    using_backup_server = False
    if datafindMethod == "AT_RUNTIME_MULTIPLE_FRAMES" or \
                                  datafindMethod == "AT_RUNTIME_SINGLE_FRAMES":
        if cp.has_option_tags("workflow-datafind",
                          "datafind-backup-datafind-server", tags):
            using_backup_server = True
            backup_server = cp.get_opt_tags("workflow-datafind",
                                      "datafind-backup-datafind-server", tags)
            cp_new = copy.deepcopy(cp)
            cp_new.set("workflow-datafind",
                                "datafind-ligo-datafind-server", backup_server)
            cp_new.set('datafind', 'urltype', 'gsiftp')
            backup_datafindcaches, backup_datafindouts =\
                setup_datafind_runtime_frames_single_call_perifo(cp_new,
                                             scienceSegs, outputDir, tags=tags)
            backup_datafindouts = datafind_keep_unique_backups(\
                                             backup_datafindouts, datafindouts)
            datafindcaches.extend(backup_datafindcaches)
            datafindouts.extend(backup_datafindouts)

    logging.info("setup_datafind_runtime_generated completed")
    # If we don't have frame files covering all times we can update the science
    # segments.
    if checkSegmentGaps in ['warn','update_times','raise_error']:
        logging.info("Checking science segments against datafind output....")
        newScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        logging.info("New segments calculated from data find output.....")
        missingData = False
        for ifo in scienceSegs.keys():
            # If no science segments in input then do nothing
            if not scienceSegs[ifo]:
                msg = "No science segments are present for ifo %s, " %(ifo)
                msg += "the segment metadata indicates there is no analyzable"
                msg += " strain data between the selected GPS start and end "
                msg += "times."
                logging.warning(msg)
                continue
            if ifo not in newScienceSegs:
                msg = "No data frames were found corresponding to the science "
                msg += "segments for ifo %s" %(ifo)
                logging.error(msg)
                missingData = True
                if checkSegmentGaps == 'update_times':
                    scienceSegs[ifo] = segments.segmentlist()
                continue
            missing = scienceSegs[ifo] - newScienceSegs[ifo]
            if abs(missing):
                msg = "From ifo %s we are missing frames covering:" %(ifo)
                msg += "\n%s" % "\n".join(map(str, missing))
                missingData = True
                logging.error(msg)
                if checkSegmentGaps == 'update_times':
                    # Remove missing time, so that we can carry on if desired
                    logging.info("Updating science segments for ifo %s."
                                 %(ifo))
                    scienceSegs[ifo] = scienceSegs[ifo] - missing

        if checkSegmentGaps == 'raise_error' and missingData:
            raise ValueError("Workflow cannot find needed data, exiting.")
        logging.info("Done checking, any discrepancies are reported above.")
    elif checkSegmentGaps == 'no_test':
        pass
    else:
        errMsg = "checkSegmentGaps kwarg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Do all of the frame files that were returned actually exist?
    if checkFramesExist in ['warn','update_times','raise_error']:
        logging.info("Verifying that all frames exist on disk.")
        missingFrSegs, missingFrames = \
                          get_missing_segs_from_frame_file_cache(datafindcaches)
        missingFlag = False
        for ifo in missingFrames.keys():
            # If no data in the input then do nothing
            if not scienceSegs[ifo]:
                continue
            # If using a backup server, does the frame exist remotely?
            if using_backup_server:
                # WARNING: This will be slow, but hopefully it will not occur
                #          for too many frames. This could be optimized if
                #          it becomes necessary.
                new_list = []
                for frame in missingFrames[ifo]:
                    for dfout in datafindouts:
                        dfout_pfns = list(dfout.pfns)
                        dfout_urls = [a.url for a in dfout_pfns]
                        if frame.url in dfout_urls:
                            pfn = dfout_pfns[dfout_urls.index(frame.url)]
                            dfout.removePFN(pfn)
                            if len(dfout.pfns) == 0:
                                new_list.append(frame)
                            else:
                                msg = "Frame %s not found locally. "\
                                                                  %(frame.url,)
                                msg += "Replacing with remote url(s) %s." \
                                           %(str([a.url for a in dfout.pfns]),)
                                logging.info(msg)
                            break
                    else:
                        new_list.append(frame)
                missingFrames[ifo] = new_list
            if missingFrames[ifo]:
                msg = "From ifo %s we are missing the following frames:" %(ifo)
                msg +='\n'.join([a.url for a in missingFrames[ifo]])
                missingFlag = True
                logging.error(msg)
            if checkFramesExist == 'update_times':
                # Remove missing times, so that we can carry on if desired
                logging.info("Updating science times for ifo %s." %(ifo))
                scienceSegs[ifo] = scienceSegs[ifo] - missingFrSegs[ifo]

        if checkFramesExist == 'raise_error' and missingFlag:
            raise ValueError("Workflow cannot find all frames, exiting.")
        logging.info("Finished checking frames.")
    elif checkFramesExist == 'no_test':
        pass
    else:
        errMsg = "checkFramesExist kwarg must take a value from 'no_test', "
        errMsg += "'warn', 'update_times' or 'raise_error'."
        raise ValueError(errMsg)

    # Check if there are cases where frames exist, but no entry in the segment
    # summary table are present.
    if checkSegmentSummary in ['warn', 'raise_error']:
        logging.info("Checking the segment summary table against frames.")
        dfScienceSegs = get_science_segs_from_datafind_outs(datafindcaches)
        missingFlag = False
        # NOTE: Should this be overrideable in the config file?
        sci_seg_name = "SCIENCE"
        if seg_file is None:
            err_msg = "You must provide the science segments SegFile object "
            err_msg += "if using the datafind-check-segment-summary option."
            raise ValueError(err_msg)
        if seg_file.seg_summ_dict is None:
            err_msg = "The provided science segments SegFile object must "
            err_msg += "contain a valid segment_summary table if using the "
            err_msg += "datafind-check-segment-summary option."
            raise ValueError(err_msg)
        seg_summary_times = seg_file.seg_summ_dict
        for ifo in dfScienceSegs.keys():
            curr_seg_summ_times = seg_summary_times[ifo + ":" + sci_seg_name]
            missing = (dfScienceSegs[ifo] & seg_file.valid_segments)
            missing.coalesce()
            missing = missing - curr_seg_summ_times
            missing.coalesce()
            scienceButNotFrame = scienceSegs[ifo] - dfScienceSegs[ifo]
            scienceButNotFrame.coalesce()
            missing2 = scienceSegs[ifo] - scienceButNotFrame
            missing2.coalesce()
            missing2 = missing2 - curr_seg_summ_times
            missing2.coalesce()
            if abs(missing):
                msg = "From ifo %s the following times have frames, " %(ifo)
                msg += "but are not covered in the segment summary table."
                msg += "\n%s" % "\n".join(map(str, missing))
                logging.error(msg)
                missingFlag = True
            if abs(missing2):
                msg = "From ifo %s the following times have frames, " %(ifo)
                msg += "are science, and are not covered in the segment "
                msg += "summary table."
                msg += "\n%s" % "\n".join(map(str, missing2))
                logging.error(msg)
                missingFlag = True
        if checkSegmentSummary == 'raise_error' and missingFlag:
            errMsg = "Segment_summary discrepancy detected, exiting."
            raise ValueError(errMsg)
    elif checkSegmentSummary == 'no_test':
        pass
    else:
        errMsg = "checkSegmentSummary kwarg must take a value from 'no_test', "
        errMsg += "'warn', or 'raise_error'."
        raise ValueError(errMsg)

    # Now need to create the file for SCIENCE_AVAILABLE
    sci_avlble_dict = segments.segmentlistdict()
    # NOTE: Should this be overrideable in the config file?
    sci_avlble_name = "SCIENCE_AVAILABLE"
    for ifo in scienceSegs.keys():
        sci_avlble_dict[ifo + ':' + sci_avlble_name] = scienceSegs[ifo]

    sci_avlble_file = SegFile.from_segment_list_dict('SCIENCE_AVAILABLE',
                            sci_avlble_dict, ifo_list = scienceSegs.keys(),
                            valid_segment=workflow.analysis_time,
                            extension='.xml', tags=tags, directory=outputDir)

    logging.info("Leaving datafind module")
    return FileList(datafindouts), sci_avlble_file, scienceSegs, sci_avlble_name
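
setup_datafind_workflow is driven by options in the [workflow-datafind] section of the workflow configuration. A minimal sketch of that section; the option names are the ones read above, the values are illustrative only:

# Sketch of the [workflow-datafind] options read above. Option names come from
# the function; the chosen values and the backup server name are illustrative.
DATAFIND_INI_SKETCH = """
[workflow-datafind]
datafind-method = AT_RUNTIME_SINGLE_FRAMES
datafind-check-segment-gaps = update_times
datafind-check-frames-exist = warn
datafind-check-segment-summary = no_test
; optional, only used with the *_FRAMES methods:
; datafind-backup-datafind-server = datafind.backup.example.org
"""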
Example 4
def get_segments_file(workflow, name, option_name, out_dir):
    """Get cumulative segments from option name syntax for each ifo.

    The syntax of the configparser string defines the resulting segment_file,
    e.g. option_name = +up_flag1,+up_flag2,+up_flag3,-down_flag1,-down_flag2
    Each ifo may have a different string and is stored separately in the file.
    Flags which add time must precede flags which subtract time.

    Parameters
    ----------
    workflow: pycbc.workflow.Workflow
    name: string
        Name of the segment list being created
    option_name: str
        Name of option in the associated config parser to get the flag list
    out_dir: path
        Directory in which the output segment file will be stored.

    Returns
    --------
    seg_file: pycbc.workflow.SegFile
        SegFile instance that points to the segment xml file on disk.
    """
    from pycbc.dq import query_str
    make_analysis_dir(out_dir)
    cp = workflow.cp
    start = workflow.analysis_time[0]
    end = workflow.analysis_time[1]

    # Check for veto definer file
    veto_definer = None
    if cp.has_option("workflow-segments", "segments-veto-definer-url"):
        veto_definer = save_veto_definer(workflow.cp, out_dir, [])

    # Check for provided server
    server = "https://segments.ligo.org"
    if cp.has_option("workflow-segments", "segments-database-url"):
        server = cp.get("workflow-segments", "segments-database-url")

    source = "any"
    if cp.has_option("workflow-segments", "segments-source"):
        source = cp.get("workflow-segments", "segments-source")
    if source == "file":
        local_file_path = \
            resolve_url(cp.get("workflow-segments", option_name+"-file"))
        pfn = os.path.join(out_dir, os.path.basename(local_file_path))
        shutil.move(local_file_path, pfn)
        return SegFile.from_segment_xml(pfn)

    segs = {}
    for ifo in workflow.ifos:
        flag_str = cp.get_opt_tags("workflow-segments", option_name, [ifo])
        key = ifo + ':' + name
        segs[key] = query_str(ifo,
                              flag_str,
                              start,
                              end,
                              source=source,
                              server=server,
                              veto_definer=veto_definer)
        logging.info("%s: got %s flags", ifo, option_name)

    return SegFile.from_segment_list_dict(name,
                                          segs,
                                          extension='.xml',
                                          valid_segment=workflow.analysis_time,
                                          directory=out_dir)
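
Similarly, get_segments_file reads its flag strings and optional settings from the [workflow-segments] section. A sketch reusing the placeholder flag names from the docstring; 'segments-science' stands in for option_name and is hypothetical, and per-ifo values are resolved through the tags mechanism (tags=[ifo]):

# Sketch of [workflow-segments] entries consulted above. 'segments-science' is
# a hypothetical option_name; the flag names are the docstring's placeholders,
# and the commented entries mirror the has_option checks in the function.
SEGMENTS_INI_SKETCH = """
[workflow-segments]
segments-science = +up_flag1,+up_flag2,-down_flag1
; segments-veto-definer-url = https://example.org/veto_definer.xml
; segments-database-url = https://segments.ligo.org
; segments-source = any
"""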