Example 1
    def fetch_pipelines(self, protocol):
        """
        Fetch the mapping for a particular protocol, null if unmapped.

        :param str protocol: name/key for the protocol for which to fetch the
            pipeline(s)
        :return str | Iterable[str] | NoneType: pipeline(s) to which the given
            protocol is mapped, otherwise null
        """
        protocol_key = utils.alpha_cased(protocol)
        return self.protocol_mapping.get(protocol_key)
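
A minimal usage sketch (the import path, interface file name, and protocol string below are assumptions for illustration, not taken from the example):

from looper.models import PipelineInterface  # import location is an assumption

piface = PipelineInterface("pipeline_interface.yaml")  # hypothetical file
pipelines = piface.fetch_pipelines("ATAC-seq")  # str, iterable of str, or None
if pipelines is None:
    print("Protocol is not mapped to any pipeline")
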
Example 2
def standardize_protocols(piface):
    """
    Handle casing and punctuation of protocol keys in pipeline interface.

    :param Mapping piface: Pipeline interface data to standardize.
    :return Mapping: Same as the input, but with protocol keys case and
        punctuation handled in a more uniform way for matching later.
    """
    assert PROTOMAP_KEY in piface, "For protocol mapping standardization, " \
        "pipeline interface data must contain key '{}'".format(PROTOMAP_KEY)
    piface[PROTOMAP_KEY] = {utils.alpha_cased(proto): pipekey for
                            proto, pipekey in piface[PROTOMAP_KEY].items()}
    return piface
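
A sketch of the effect on a plain mapping, assuming PROTOMAP_KEY resolves to the protocol-mapping section name and that utils.alpha_cased normalizes case and punctuation; the literal values here are made up:

# Hypothetical input whose keys differ only in case/punctuation from their
# canonical forms; standardize_protocols rewrites them via alpha_cased.
piface = {PROTOMAP_KEY: {"RNA-seq": "rnaseq.py", "ATAC_SEQ": "atacseq.py"}}
piface = standardize_protocols(piface)
# Keys are now in whatever canonical form alpha_cased produces, so that
# e.g. "RNA-seq" and "rna_seq" collapse to the same key for later matching.
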
Example 3
def process_pipeline_interfaces(pipeline_interface_locations):
    """
    Create a PipelineInterface for each pipeline location given.

    :param Iterable[str] pipeline_interface_locations: locations, each of
        which should be either a directory path or a filepath, that specifies
        pipeline interface and protocol mappings information. Each such file
        should have a pipelines section and a protocol mappings section.
    :return Mapping[str, Iterable[PipelineInterface]]: mapping from protocol
        name to interface(s) for which that protocol is mapped
    """
    interface_by_protocol = defaultdict(list)
    for pipe_iface_location in pipeline_interface_locations:
        if not os.path.exists(pipe_iface_location):
            _LOGGER.warning(
                "Ignoring nonexistent pipeline interface "
                "location: '%s'", pipe_iface_location)
            continue
        pipe_iface = PipelineInterface(pipe_iface_location)
        for proto_name in pipe_iface.protomap:
            _LOGGER.log(5, "Adding protocol name: '%s'", proto_name)
            interface_by_protocol[alpha_cased(proto_name)].append(pipe_iface)
    return interface_by_protocol
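
The essential pattern is a multimap from normalized protocol name to the interfaces that declare it; a self-contained sketch of the same indexing idea (illustrative names, with lower() standing in for alpha_cased):

from collections import defaultdict

def index_interfaces_by_protocol(interfaces):
    """Map each normalized protocol name to the interfaces declaring it."""
    index = defaultdict(list)
    for iface in interfaces:
        for proto in iface["protocol_mapping"]:
            index[proto.lower()].append(iface)
    return index

# index_interfaces_by_protocol([{"protocol_mapping": {"ATAC-seq": "atacseq.py"}}])
# -> {"atac-seq": [{"protocol_mapping": {"ATAC-seq": "atacseq.py"}}]}
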
Example 4
    def build_submission_bundles(self, protocol, priority=True):
        """
        Create pipelines to submit for each sample of a particular protocol.

        The priority flag controls whether to submit pipeline(s) from only
        the first of the project's known pipeline locations with a match for
        the protocol, or from all locations with a match.

        :param str protocol: name of the protocol/library for which to
            create pipeline(s)
        :param bool priority: whether to submit pipeline(s) only from the
            first of the pipeline location(s) (indicated in the project
            config file) that has a match for the given protocol; optional,
            default True
        :return Iterable[(PipelineInterface, type, str, str)]: submission
            bundle(s) for the given protocol, each pairing a pipeline
            interface, Sample subtype, strict pipeline key, and pipeline
            command with flags
        :raises AssertionError: if there's a failure in the attempt to
            partition an interface's pipeline scripts into disjoint subsets of
            those already mapped and those not yet mapped
        """

        protocol = alpha_cased(protocol)

        if not priority:
            raise NotImplementedError(
                "Currently, only prioritized protocol mapping is supported "
                "(i.e., pipeline interfaces collection is a prioritized list, "
                "so only the first interface with a protocol match is used.)")

        # Pull out the collection of interfaces (potentially one from each of
        # the locations indicated in the project configuration file) as a
        # sort of pool of information about possible ways in which to submit
        # pipeline(s) for sample(s) of the indicated protocol.
        try:
            pipeline_interfaces = \
                self.interfaces_by_protocol[protocol]
        except KeyError:
            # Messaging can be done by the caller.
            _LOGGER.debug("No interface for protocol: %s", protocol)
            return []

        job_submission_bundles = []
        pipeline_keys_used = set()
        _LOGGER.debug("Building pipelines for {} interface(s)...".format(
            len(pipeline_interfaces)))

        bundle_by_strict_pipe_key = {}

        for pipe_iface in pipeline_interfaces:
            # "Break"-like mechanism for short-circuiting if we care only
            # about the highest-priority match for pipeline submission.
            # That is, if the intent is to submit pipeline(s) from a single
            # location for each sample of the given protocol, we can stop
            # searching the pool of pipeline interface information once we've
            # found a match for the protocol.
            if priority and len(job_submission_bundles) > 0:
                return job_submission_bundles[0]

            this_protocol_pipelines = pipe_iface.fetch_pipelines(protocol)
            if not this_protocol_pipelines:
                _LOGGER.debug("No pipelines; available: {}".format(", ".join(
                    pipe_iface.protomap.keys())))
                continue

            # TODO: update once dependency-encoding logic is in place.
            # The proposed dependency-encoding format uses a semicolon
            # between pipelines for which the dependency relationship is
            # serial. For now, simply treat those as multiple independent
            # pipelines by replacing the semicolon with a comma, which is the
            # way in which multiple independent pipelines for a single protocol
            # are represented in the mapping declaration.
            pipeline_keys = \
                this_protocol_pipelines.replace(";", ",") \
                    .strip(" ()\n") \
                    .split(",")
            # These cleaned pipeline keys are what's used to resolve the path
            # to the pipeline to run.
            pipeline_keys = [pk.strip() for pk in pipeline_keys]

            # Skip over pipelines already mapped by another location.
            already_mapped, new_scripts = \
                partition(pipeline_keys,
                          partial(_is_member, items=pipeline_keys_used))
            pipeline_keys_used |= set(pipeline_keys)

            # Attempt to validate that partition yielded disjoint subsets.
            try:
                disjoint_partition_violation = \
                    set(already_mapped) & set(new_scripts)
            except TypeError:
                _LOGGER.debug("Unable to hash partitions for validation")
            else:
                assert not disjoint_partition_violation, \
                    "Partitioning {} with membership in {} as " \
                    "predicate produced intersection: {}".format(
                        pipeline_keys, pipeline_keys_used,
                        disjoint_partition_violation)

            if len(already_mapped) > 0:
                _LOGGER.debug(
                    "Skipping {} already-mapped script name(s): {}".format(
                        len(already_mapped), already_mapped))
            _LOGGER.debug("{} new scripts for protocol {} from "
                          "pipeline(s) location '{}': {}".format(
                              len(new_scripts), protocol, pipe_iface.source,
                              new_scripts))

            # For each pipeline script to which this protocol will pertain,
            # create the new jobs/submission bundles.
            new_jobs = []
            for pipeline_key in new_scripts:
                # Determine how to reference the pipeline and where it is.
                strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \
                    pipe_iface.finalize_pipeline_key_and_paths(
                        pipeline_key)

                # Skip and warn about nonexistent alleged pipeline path.
                if not (os.path.exists(full_pipe_path)
                        or is_command_callable(full_pipe_path)):
                    _LOGGER.warning("Missing pipeline script: '%s'",
                                    full_pipe_path)
                    continue

                # Determine which interface and Sample subtype to use.
                sample_subtype = \
                    pipe_iface.fetch_sample_subtype(
                        protocol, strict_pipe_key, full_pipe_path)

                # Package the pipeline's interface, subtype, command, and key.
                submission_bundle = SubmissionBundle(
                    pipe_iface, sample_subtype, strict_pipe_key,
                    full_pipe_path_with_flags)

                # Enforce bundle uniqueness for each strict pipeline key.
                maybe_new_bundle = (full_pipe_path_with_flags, sample_subtype,
                                    pipe_iface)
                old_bundle = bundle_by_strict_pipe_key.setdefault(
                    strict_pipe_key, maybe_new_bundle)
                if old_bundle != maybe_new_bundle:
                    errmsg = "Strict pipeline key '{}' maps to more than " \
                             "one combination of pipeline script + flags, " \
                             "sample subtype, and pipeline interface. " \
                             "'{}'\n{}".format(
                        strict_pipe_key, maybe_new_bundle, old_bundle)
                    raise ValueError(errmsg)

                # Add this bundle to the collection of ones relevant for the
                # current PipelineInterface.
                new_jobs.append(submission_bundle)

            job_submission_bundles.append(new_jobs)

        # Repeat logic check of short-circuit conditional to account for
        # edge case in which it's satisfied during the final iteration.
        if priority and len(job_submission_bundles) > 1:
            return job_submission_bundles[0]
        else:
            return list(itertools.chain(*job_submission_bundles))
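
The pipeline-key parsing step above (semicolons treated as commas until dependency encoding lands, enclosing parentheses and whitespace stripped) can be shown in isolation; the pipeline names in the example call are made up:

def parse_pipeline_keys(spec):
    """Split a protocol's pipeline declaration into clean pipeline keys."""
    keys = spec.replace(";", ",").strip(" ()\n").split(",")
    return [k.strip() for k in keys]

# parse_pipeline_keys("(atacseq.py; peakcaller.py)")
# -> ['atacseq.py', 'peakcaller.py']
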
Example 5
    def __call__(self):
        """ Do the summarization. """
        import csv

        columns = []
        stats = []
        objs = _pd.DataFrame()
        
        # First, the generic summarize will pull together all the fits
        # and stats from each sample into project-combined spreadsheets.
        # Create stats_summary file
        for sample in self.prj.samples:
            _LOGGER.info(self.counter.show(sample.sample_name,
                                           sample.protocol))
            sample_output_folder = sample_folder(self.prj, sample)

            # Grab the basic info from the annotation sheet for this sample.
            # This will correspond to a row in the output.
            sample_stats = sample.get_sheet_dict()
            columns.extend(sample_stats.keys())
            # Version 0.3 standardized all stats into a single file
            stats_file = os.path.join(sample_output_folder, "stats.tsv")
            if os.path.isfile(stats_file):
                _LOGGER.info("Using stats file: '%s'", stats_file)
            else:
                _LOGGER.warning("No stats file '%s'", stats_file)
                continue

            t = _pd.read_table(
                stats_file, header=None, names=['key', 'value', 'pl'])
            t.drop_duplicates(subset=['key', 'pl'], keep='last', inplace=True)
            # t.duplicated(subset= ['key'], keep = False)
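            # Keys reported by more than one pipeline are disambiguated below
            # by prefixing them with the pipeline name ("pl:key").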
            t.loc[:, 'plkey'] = t['pl'] + ":" + t['key']
            dupes = t.duplicated(subset=['key'], keep=False)
            t.loc[dupes, 'key'] = t.loc[dupes, 'plkey']
            sample_stats.update(t.set_index('key')['value'].to_dict())
            stats.append(sample_stats)
            columns.extend(t.key.tolist())

        self.counter.reset()

        # Create objects summary file
        for sample in self.prj.samples:
            # Process any reported objects
            _LOGGER.info(self.counter.show(sample.sample_name, sample.protocol))
            sample_output_folder = sample_folder(self.prj, sample)
            objs_file = os.path.join(sample_output_folder, "objects.tsv")
            if os.path.isfile(objs_file):
                _LOGGER.info("Using objects file: '%s'", objs_file)
            else:
                _LOGGER.warning("No objects file '%s'", objs_file)
                continue
            t = _pd.read_table(objs_file, header=None,
                               names=['key', 'filename', 'anchor_text',
                                      'anchor_image', 'annotation'])
            t['sample_name'] = sample.name
            objs = objs.append(t, ignore_index=True)
        
        tsv_outfile_path = os.path.join(self.prj.metadata.output_dir, self.prj.name)
        if hasattr(self.prj, "subproject") and self.prj.subproject:
            tsv_outfile_path += '_' + self.prj.subproject
        tsv_outfile_path += '_stats_summary.tsv'
        tsv_outfile = open(tsv_outfile_path, 'w')
        tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=uniqify(columns),
                                    delimiter='\t', extrasaction='ignore')     
        tsv_writer.writeheader()
        for row in stats:
            tsv_writer.writerow(row)
        tsv_outfile.close()

        _LOGGER.info(
            "Summary (n=" + str(len(stats)) + "): " + tsv_outfile_path)

        # Next, looper can run custom summarizers, if they exist.
        all_protocols = [sample.protocol for sample in self.prj.samples]

        _LOGGER.debug("Protocols: " + str(all_protocols))
        _LOGGER.debug(self.prj.interfaces_by_protocol)
        for protocol in set(all_protocols):
            try:
                ifaces = self.prj.interfaces_by_protocol[alpha_cased(protocol)]
            except KeyError:
                _LOGGER.warning("No interface for protocol '{}', skipping summary".
                             format(protocol))
                continue
            for iface in ifaces:
                _LOGGER.debug(iface)
                pl = iface.fetch_pipelines(protocol)
                summarizers = iface.get_attribute(pl, "summarizers")
                if summarizers is not None:
                    for summarizer in set(summarizers):
                        summarizer_abspath = os.path.join(
                            os.path.dirname(iface.pipe_iface_file), summarizer)
                        _LOGGER.debug([summarizer_abspath, self.prj.config_file])
                        try:
                            subprocess.call([summarizer_abspath, self.prj.config_file])
                        except OSError:
                            _LOGGER.warning("Summarizer was unable to run: " + str(summarizer))

        # Produce HTML report
        report_builder = HTMLReportBuilder(self.prj)
        report_path = report_builder(objs, stats, uniqify(columns))
        _LOGGER.info(
                "HTML Report (n=" + str(len(stats)) + "): " + report_path)
Example 6
    def __call__(self, args, remaining_args):
        """
        Do the Sample submission.

        :param argparse.Namespace args: parsed command-line options and
            arguments, recognized by looper
        :param list remaining_args: command-line options and arguments not
            recognized by looper, germane to samples/pipelines
        """

        protocols = {s.protocol for s in self.prj.samples
                     if hasattr(s, "protocol")}
        failures = defaultdict(list)  # Collect problems by sample.
        processed_samples = set()  # Enforce one-time processing.

        _LOGGER.info("Finding pipelines for protocol(s): {}".
                     format(", ".join(self.prj.protocols)))

        # Job submissions are managed on a per-pipeline basis so that
        # individual commands (samples) may be lumped into a single job.
        submission_conductors = {}
        pipe_keys_by_protocol = defaultdict(list)
        mapped_protos = set()
        for proto in protocols | {GENERIC_PROTOCOL_KEY}:
            proto_key = alpha_cased(proto)
            _LOGGER.debug("Determining sample type, script, and flags for "
                          "pipeline(s) associated with protocol: %s", proto)
            submission_bundles = self.prj.build_submission_bundles(proto_key)
            if not submission_bundles:
                if proto_key != GENERIC_PROTOCOL_KEY:
                    _LOGGER.warning("No mapping for protocol: '%s'", proto)
                continue
            mapped_protos.add(proto)
            for pl_iface, sample_subtype, pl_key, script_with_flags in \
                    submission_bundles:
                _LOGGER.debug("%s: %s", pl_key, sample_subtype.__name__)
                conductor = SubmissionConductor(
                        pl_key, pl_iface, script_with_flags, self.prj,
                        args.dry_run, args.time_delay, sample_subtype,
                        remaining_args, args.ignore_flags,
                        self.prj.compute,
                        max_cmds=args.lumpn, max_size=args.lump)
                submission_conductors[pl_key] = conductor
                pipe_keys_by_protocol[proto_key].append(pl_key)

        # Determine number of samples eligible for processing.
        num_samples = len(self.prj.samples)
        if args.limit is None:
            upper_sample_bound = num_samples
        elif args.limit < 0:
            raise ValueError(
                "Invalid number of samples to run: {}".format(args.limit))
        else:
            upper_sample_bound = min(args.limit, num_samples)
        _LOGGER.debug("Limiting to %d of %d samples",
                      upper_sample_bound, num_samples)

        num_commands_possible = 0
        failed_submission_scripts = []

        for sample in self.prj.samples[:upper_sample_bound]:
            # First, step through the samples and determine whether any
            # should be skipped entirely, based on sample attributes alone
            # and independent of anything about any of its pipelines.

            # Start by displaying the sample index and a fresh collection
            # of sample-skipping reasons.
            _LOGGER.info(self.counter.show(
                    sample.sample_name, sample.protocol))
            skip_reasons = []

            # Don't submit samples with duplicate names unless suppressed.
            if sample.sample_name in processed_samples:
                if args.allow_duplicate_names:
                    _LOGGER.warning("Duplicate name detected, but submitting anyway")
                else:
                    skip_reasons.append("Duplicate sample name")

            # Check if sample should be run.
            if sample.is_dormant():
                skip_reasons.append(
                        "Inactive status (via '{}' column/attribute)".
                        format(SAMPLE_EXECUTION_TOGGLE))

            # Get the base protocol-to-pipeline mappings.
            try:
                protocol = sample.protocol
            except AttributeError:
                skip_reasons.append("Sample has no protocol")
            else:
                if protocol not in mapped_protos and \
                        GENERIC_PROTOCOL_KEY not in mapped_protos:
                    skip_reasons.append("No pipeline for protocol")

            if skip_reasons:
                _LOGGER.warning(
                    "> Not submitted: {}".format(", ".join(skip_reasons)))
                failures[sample.name] = skip_reasons
                continue

            # Processing preconditions have been met.
            # Add this sample to the processed collection.
            processed_samples.add(sample.sample_name)

            # At this point, we have a generic Sample; write that to disk
            # for reuse in case of many jobs (pipelines) using base Sample.
            # Do a single overwrite here, then any subsequent Sample can be sure
            # that the file is fresh, with respect to this run of looper.
            sample.to_yaml(subs_folder_path=self.prj.metadata.submission_subdir)

            pipe_keys = pipe_keys_by_protocol.get(alpha_cased(sample.protocol)) \
                or pipe_keys_by_protocol.get(GENERIC_PROTOCOL_KEY)
            _LOGGER.debug("Considering %d pipeline(s)", len(pipe_keys))

            pl_fails = []
            for pl_key in pipe_keys:
                num_commands_possible += 1
                # TODO: of interest to track failures by pipeline?
                conductor = submission_conductors[pl_key]
                # TODO: check return value from add() to determine whether
                # TODO (cont.) to grow the failures list.
                try:
                    curr_pl_fails = conductor.add_sample(sample)
                except JobSubmissionException as e:
                    failed_submission_scripts.append(e.script)
                else:
                    pl_fails.extend(curr_pl_fails)
            if pl_fails:
                failures[sample.name].extend(pl_fails)

        job_sub_total = 0
        cmd_sub_total = 0
        for conductor in submission_conductors.values():
            conductor.submit(force=True)
            job_sub_total += conductor.num_job_submissions
            cmd_sub_total += conductor.num_cmd_submissions

        # Report what went down.
        max_samples = min(len(self.prj.samples), args.limit or float("inf"))
        _LOGGER.info("\nLooper finished")
        _LOGGER.info("Samples valid for job generation: %d of %d",
                     len(processed_samples), max_samples)
        _LOGGER.info("Successful samples: %d of %d",
                     max_samples - len(failures), max_samples)
        _LOGGER.info("Commands submitted: %d of %d",
                     cmd_sub_total, num_commands_possible)
        _LOGGER.info("Jobs submitted: %d", job_sub_total)
        if args.dry_run:
            _LOGGER.info("Dry run. No jobs were actually submitted.")

        # Restructure sample/failure data for display.
        samples_by_reason = defaultdict(set)
        # Collect names of failed sample(s) by failure reason.
        for sample, sample_fails in failures.items():
            for f in sample_fails:
                samples_by_reason[f].add(sample)
        # Collect samples by pipeline with submission failure.
        failed_samples_by_pipeline = defaultdict(set)
        for pl_key, conductor in submission_conductors.items():
            # Don't add failure key if there are no samples that failed for
            # that reason.
            if conductor.failed_samples:
                fails = set(conductor.failed_samples)
                samples_by_reason[SUBMISSION_FAILURE_MESSAGE] |= fails
                failed_samples_by_pipeline[pl_key] |= fails

        failed_sub_samples = samples_by_reason.get(SUBMISSION_FAILURE_MESSAGE)
        if failed_sub_samples:
            _LOGGER.info("\n{} samples with at least one failed job submission: {}".
                         format(len(failed_sub_samples),
                                ", ".join(failed_sub_samples)))

        # If failure keys are only added when there's at least one sample that
        # failed for that reason, we can display information conditionally,
        # depending on whether there's actually failure(s).
        if samples_by_reason:
            _LOGGER.info("\n{} unique reasons for submission failure: {}".format(
                len(samples_by_reason), ", ".join(samples_by_reason.keys())))
            full_fail_msgs = [create_failure_message(reason, samples)
                              for reason, samples in samples_by_reason.items()]
            _LOGGER.info("\nSummary of failures:\n{}".
                         format("\n".join(full_fail_msgs)))

        """
Example 7
    def fetch_sample_subtype(
            self, protocol, strict_pipe_key, full_pipe_path):
        """
        Determine the Sample subtype to use for a given protocol and pipeline.

        :param str protocol: name of the relevant protocol
        :param str strict_pipe_key: key for specific pipeline in a pipeline
            interface mapping declaration; this must exactly match a key in
            the PipelineInterface (or the Mapping that represents it)
        :param str full_pipe_path: (absolute, expanded) path to the
            pipeline script
        :return type: Sample subtype to use for jobs for the given protocol,
            that use the pipeline indicated
        :raises KeyError: if given a pipeline key that's not mapped in the
            pipelines section of this PipelineInterface
        """

        subtype = None

        this_pipeline_data = self.pipelines[strict_pipe_key]

        try:
            subtypes = this_pipeline_data[SUBTYPE_MAPPING_SECTION]
        except KeyError:
            _LOGGER.debug("Configuration (from %s) doesn't define section '%s' "
                          "for pipeline '%s'", self.source,
                          SUBTYPE_MAPPING_SECTION, strict_pipe_key)
            # Without an explicit subtypes section for this pipeline in the
            # interface, assume that if the pipeline module defines a single
            # Sample subtype, that type is the one to use.
            subtype_name = None
        else:
            if subtypes is None:
                # Designate lack of need for import attempt and provide
                # class with name to format message below.
                subtype = Sample
                _LOGGER.debug("Null %s subtype(s) section specified for "
                              "pipeline: '%s'; using base %s type",
                              subtype.__name__, strict_pipe_key,
                              subtype.__name__)
            elif isinstance(subtypes, str):
                subtype_name = subtypes
                _LOGGER.debug("Single subtype name for pipeline '%s' "
                              "in interface from '%s': '%s'", subtype_name,
                              strict_pipe_key, self.source)
            else:
                temp_subtypes = {
                        utils.alpha_cased(p): st for p, st in subtypes.items()}
                try:
                    subtype_name = temp_subtypes[utils.alpha_cased(protocol)]
                except KeyError:
                    # Designate lack of need for import attempt and provide
                    # class with name to format message below.
                    subtype = Sample
                    _LOGGER.debug("No %s subtype specified in interface from "
                                  "'%s': '%s', '%s'; known: %s",
                                  subtype.__name__, self.source,
                                  strict_pipe_key, protocol,
                                  ", ".join(temp_subtypes.keys()))

        # subtype_name is defined if and only if subtype remained null.
        # The import helper function can return null if the import attempt
        # fails, so provide the base Sample type as a fallback.
        subtype = subtype or \
                  _import_sample_subtype(full_pipe_path, subtype_name) or \
                  Sample
        _LOGGER.debug("Using Sample subtype: %s", subtype.__name__)
        return subtype
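
The subtype resolution above falls back through three cases: an explicit null section means the base Sample type, a single string applies to every protocol, and a mapping is consulted per normalized protocol name. A compact sketch of that precedence (illustrative names; lower() stands in for alpha_cased):

def resolve_subtype_name(subtypes, protocol):
    """Return a subtype name, or None to signal use of the base Sample type."""
    if subtypes is None:
        return None                      # explicit null section -> base type
    if isinstance(subtypes, str):
        return subtypes                  # single name covers all protocols
    normalized = {p.lower(): name for p, name in subtypes.items()}
    return normalized.get(protocol.lower())

# resolve_subtype_name({"ATAC-seq": "ATACseqSample"}, "atac-SEQ") -> "ATACseqSample"
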