Example #1
class BundleMercurialRepository(Task):

    task_namespace = "law.mercurial"

    exclude_files = CSVParameter(
        default=(), description="patterns of files to exclude, default: "
        "()")
    include_files = CSVParameter(
        default=(),
        description="patterns of files to force-include, "
        "takes precedence over .hgignore, default: ()")
    custom_checksum = luigi.Parameter(default=NO_STR,
                                      description="a custom checksum to use, "
                                      "default: NO_STR")

    def __init__(self, *args, **kwargs):
        super(BundleMercurialRepository, self).__init__(*args, **kwargs)

        self._checksum = None

    @abstractmethod
    def get_repo_path(self):
        return

    @property
    def checksum(self):
        if self.custom_checksum != NO_STR:
            return self.custom_checksum

        if self._checksum is None:
            checksum_script = rel_path(__file__, "scripts",
                                       "repository_checksum.sh")
            cmd = [checksum_script, self.get_repo_path()]

            code, out, _ = interruptable_popen(cmd, stdout=subprocess.PIPE)
            if code != 0:
                raise Exception("repository checksum calculation failed")

            self._checksum = out.strip()

        return self._checksum

    def output(self):
        repo_base = os.path.basename(self.get_repo_path())
        return LocalFileTarget("{}_{}.tgz".format(repo_base, self.checksum))

    @log
    def run(self):
        with self.output().localize("w") as tmp:
            self.bundle(tmp.path)

    def bundle(self, dst_path):
        bundle_script = rel_path(__file__, "scripts", "bundle_repository.sh")
        cmd = [bundle_script, self.get_repo_path(), get_path(dst_path)]
        cmd += [" ".join(self.exclude_files)]
        cmd += [" ".join(self.include_files)]

        code = interruptable_popen(cmd, executable="/bin/bash")[0]
        if code != 0:
            raise Exception("repository bundling failed")
Example #2
class GLiteWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = GLiteWorkflowProxy

    glite_workflow_run_decorators = None
    glite_job_manager_defaults = None
    glite_job_file_factory_defaults = None

    glite_ce = CSVParameter(default=(),
                            significant=False,
                            description="target glite computing "
                            "element(s), default: ()")

    glite_job_kwargs = []
    glite_job_kwargs_submit = ["glite_ce"]
    glite_job_kwargs_cancel = None
    glite_job_kwargs_cleanup = None
    glite_job_kwargs_query = None

    exclude_params_branch = {"glite_ce"}

    exclude_index = True

    @abstractmethod
    def glite_output_directory(self):
        return None

    @abstractmethod
    def glite_bootstrap_file(self):
        return None

    def glite_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def glite_stageout_file(self):
        return None

    def glite_workflow_requires(self):
        return OrderedDict()

    def glite_output_postfix(self):
        self.get_branch_map()
        if self.branches:
            return "_" + "_".join(str(b) for b in sorted(self.branches))
        else:
            return "_{}To{}".format(self.start_branch, self.end_branch)

    def glite_output_uri(self):
        return self.glite_output_directory().url()

    def glite_delegate_proxy(self, endpoint):
        return delegate_voms_proxy_glite(endpoint,
                                         stdout=sys.stdout,
                                         stderr=sys.stderr,
                                         cache=True)

    def glite_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.glite_job_manager_defaults, kwargs)
        return GLiteJobManager(**kwargs)

    def glite_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.glite_job_file_factory_defaults, kwargs)
        return GLiteJobFileFactory(**kwargs)

    def glite_job_config(self, config, job_num, branches):
        return config

    def glite_dump_intermediate_submission_data(self):
        return True

    def glite_post_submit_delay(self):
        return self.poll_interval * 60

    def glite_use_local_scheduler(self):
        return True

    def glite_cmdline_args(self):
        return []
Example #3
class GLiteWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = GLiteWorkflowProxy

    glite_workflow_run_decorators = None
    glite_job_manager_defaults = None
    glite_job_file_factory_defaults = None

    glite_ce = CSVParameter(
        default=(),
        significant=False,
        description="target glite computing element(s); default: empty",
    )

    glite_job_kwargs = []
    glite_job_kwargs_submit = ["glite_ce"]
    glite_job_kwargs_cancel = None
    glite_job_kwargs_cleanup = None
    glite_job_kwargs_query = None

    exclude_params_branch = {"glite_ce"}

    exclude_index = True

    @abstractmethod
    def glite_output_directory(self):
        return None

    @abstractmethod
    def glite_bootstrap_file(self):
        return None

    def glite_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def glite_stageout_file(self):
        return None

    def glite_workflow_requires(self):
        return DotDict()

    def glite_output_postfix(self):
        return "_" + self.get_branches_repr()

    def glite_output_uri(self):
        return self.glite_output_directory().url()

    def glite_delegate_proxy(self, endpoint):
        return delegate_voms_proxy_glite(endpoint,
                                         stdout=sys.stdout,
                                         stderr=sys.stderr,
                                         cache=True)

    def glite_job_manager_cls(self):
        return GLiteJobManager

    def glite_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.glite_job_manager_defaults, kwargs)
        return self.glite_job_manager_cls()(**kwargs)

    def glite_job_file_factory_cls(self):
        return GLiteJobFileFactory

    def glite_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.glite_job_file_factory_defaults, kwargs)
        return self.glite_job_file_factory_cls()(**kwargs)

    def glite_job_config(self, config, job_num, branches):
        return config

    def glite_use_local_scheduler(self):
        return True

    def glite_cmdline_args(self):
        return {}

    def glite_destination_info(self, info):
        return info
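Only the two abstract methods above must be implemented by an analysis-specific subclass. A minimal sketch under stated assumptions: the wlcg contrib package of law provides the directory target, and a bootstrap.sh script sits next to the task module; the class name and all paths are hypothetical.

import law

law.contrib.load("wlcg")  # assumption: makes law.wlcg.WLCGDirectoryTarget available


class MyGLiteWorkflow(GLiteWorkflow):

    def glite_output_directory(self):
        # remote directory receiving job outputs; glite_output_uri() calls .url() on it
        return law.wlcg.WLCGDirectoryTarget("/store/user/someuser/outputs")  # hypothetical path

    def glite_bootstrap_file(self):
        # script sourced on the worker node to set up the job environment
        return law.util.rel_path(__file__, "bootstrap.sh")  # hypothetical file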
Example #4
class BaseWorkflow(Task):

    workflow = luigi.Parameter(default=NO_STR, significant=False, description="the type of the "
        "workflow to use")
    acceptance = luigi.FloatParameter(default=1.0, significant=False, description="number of "
        "finished jobs to consider the task successful, relative fraction (<= 1) or absolute value "
        "(> 1), default: 1.0")
    tolerance = luigi.FloatParameter(default=0.0, significant=False, description="number of failed "
        "jobs to still consider the task successful, relative fraction (<= 1) or absolute value "
        "(> 1), default: 0.0")
    pilot = luigi.BoolParameter(significant=False, description="disable requirements of the "
        "workflow to let branch tasks resolve requirements on their own")
    branch = luigi.IntParameter(default=-1, description="the branch number/index to run this "
        "task for, -1 means this task is the workflow, default: -1")
    start_branch = luigi.IntParameter(default=NO_INT, description="the branch to start at, "
        "default: 0")
    end_branch = luigi.IntParameter(default=NO_INT, description="the branch to end at, NO_INT "
        "means end, default: NO_INT")
    branches = CSVParameter(cls=luigi.IntParameter, default=[], significant=False,
        description="branches to use")

    workflow_proxy_cls = BaseWorkflowProxy

    target_collection_cls = None
    outputs_siblings = False
    force_contiguous_branches = False

    workflow_property = None
    cached_workflow_property = None

    exclude_db = True
    exclude_params_branch = {"print_deps", "print_status", "remove_output", "workflow",
        "acceptance", "tolerance", "pilot", "start_branch", "end_branch", "branches"}
    exclude_params_workflow = {"branch"}

    def __init__(self, *args, **kwargs):
        super(BaseWorkflow, self).__init__(*args, **kwargs)

        # determine workflow proxy class to instantiate
        if self.is_workflow():
            classes = self.__class__.mro()
            for cls in classes:
                if not issubclass(cls, BaseWorkflow):
                    continue
                if self.workflow in (NO_STR, cls.workflow_proxy_cls.workflow_type):
                    self.workflow = cls.workflow_proxy_cls.workflow_type
                    self.workflow_proxy = cls.workflow_proxy_cls(task=self)
                    logger.debug("created workflow proxy instance of type '{}'".format(
                        cls.workflow_proxy_cls.workflow_type))
                    break
            else:
                raise ValueError("unknown workflow type {}".format(self.workflow))

            # cached attributes for the workflow
            self._branch_map = None
            self._branch_tasks = None

        else:
            # cached attributes for branches
            self._workflow_task = None

    def _forward_attribute(self, attr):
        return attr in _forward_attributes and self.is_workflow()

    def __getattribute__(self, attr, proxy=True, force=False):
        if proxy and attr != "__class__":
            if force or (attr != "_forward_attribute" and self._forward_attribute(attr)):
                return getattr(self.workflow_proxy, attr)

        return super(BaseWorkflow, self).__getattribute__(attr)

    def cli_args(self, exclude=None, replace=None):
        if exclude is None:
            exclude = set()

        if self.is_branch():
            exclude |= self.exclude_params_branch
        else:
            exclude |= self.exclude_params_workflow

        return super(BaseWorkflow, self).cli_args(exclude=exclude, replace=replace)

    def is_branch(self):
        return self.branch != -1

    def is_workflow(self):
        return not self.is_branch()

    def as_branch(self, branch=0):
        if self.is_branch():
            return self
        else:
            return self.req(self, branch=branch)

    def as_workflow(self):
        if self.is_workflow():
            return self
        else:
            if self._workflow_task is None:
                self._workflow_task = self.req(self, branch=NO_INT)
            return self._workflow_task

    @abstractmethod
    def create_branch_map(self):
        return

    def _reset_branch_boundaries(self, branches=None):
        if self.is_branch():
            raise Exception("calls to _reset_branch_boundaries are forbidden for branch tasks")

        if branches is None:
            branches = list(self._branch_map.keys())

        min_branch = min(branches)
        max_branch = max(branches)

        # reset start_branch
        self.start_branch = max(min_branch, min(max_branch, self.start_branch))

        # reset end_branch
        if self.end_branch < 0:
            self.end_branch = sys.maxsize
        self.end_branch = max(self.start_branch, min(max_branch + 1, self.end_branch))

    def _reduce_branch_map(self):
        if self.is_branch():
            raise Exception("calls to _reduce_branch_map are forbidden for branch tasks")

        # reduce by start/end branch
        for b in list(self._branch_map.keys()):
            if not (self.start_branch <= b < self.end_branch):
                del self._branch_map[b]

        # reduce by branches
        if self.branches:
            for b in list(self._branch_map.keys()):
                if b not in self.branches:
                    del self._branch_map[b]

    def get_branch_map(self, reset_boundaries=True, reduce=True):
        if self.is_branch():
            return self.as_workflow().get_branch_map(reset_boundaries=reset_boundaries,
                reduce=reduce)
        else:
            if self._branch_map is None:
                self._branch_map = self.create_branch_map()

                # some type and sanity checks
                if isinstance(self._branch_map, (list, tuple)):
                    self._branch_map = dict(enumerate(self._branch_map))
                elif self.force_contiguous_branches:
                    n = len(self._branch_map)
                    if set(self._branch_map.keys()) != set(range(n)):
                        raise ValueError("branch map keys must constitute contiguous range "
                            "[0, {})".format(n))
                else:
                    for branch in self._branch_map:
                        if not isinstance(branch, six.integer_types) or branch < 0:
                            raise ValueError("branch map keys must be non-negative integers, got "
                                "'{}' ({})".format(branch, type(branch).__name__))

                # post-process
                if reset_boundaries:
                    self._reset_branch_boundaries()
                if reduce:
                    self._reduce_branch_map()

            return self._branch_map

    @property
    def branch_map(self):
        return self.get_branch_map()

    @property
    def branch_data(self):
        if self.is_workflow():
            raise Exception("calls to branch_data are forbidden for workflow tasks")
        elif self.branch not in self.branch_map:
            raise ValueError("invalid branch '{}', not found in branch map".format(self.branch))

        return self.branch_map[self.branch]

    def get_branch_tasks(self):
        if self.is_branch():
            return self.as_workflow().get_branch_tasks()
        else:
            if self._branch_tasks is None:
                branch_map = self.branch_map
                if branch_map is None:
                    raise AttributeError("workflow task '{}' requires a branch_map".format(self))

                self._branch_tasks = OrderedDict()
                for b in branch_map:
                    self._branch_tasks[b] = self.req(self, branch=b,
                        _exclude=self.exclude_params_branch)

            return self._branch_tasks

    def workflow_requires(self):
        if self.is_branch():
            raise Exception("calls to workflow_requires are forbidden for branch tasks")

        return OrderedDict()

    def workflow_input(self):
        if self.is_branch():
            raise Exception("calls to workflow_input are forbidden for branch tasks")

        return luigi.task.getpaths(self.workflow_proxy.requires())

    def requires_from_branch(self):
        if self.is_branch():
            raise Exception("calls to requires_from_branch are forbidden for branch tasks")

        return self.__class__.requires(self)
Example #5
File: base.py  Project: yrath/law
class BaseWorkflow(Task):
    """
    Base class of all workflows.

    .. py:classattribute:: workflow
       type: luigi.Parameter

       Workflow type that refers to the workflow proxy implementation at instantiation / execution
       time. Empty default value.

    .. py:classattribute:: acceptance
       type: luigi.FloatParameter

       Number of complete tasks to consider the workflow successful. Values larger than one are
       interpreted as absolute numbers, and as fractions otherwise. Defaults to *1.0*.

    .. py:classattribute:: tolerance
       type: luigi.FloatParameter

       Number of failed tasks to still consider the workflow successful. Values larger than one are
       interpreted as absolute numbers, and as fractions otherwise. Defaults to *0.0*.

    .. py:classattribute:: branch
       type: luigi.IntParameter

       The branch number to run this task for. *-1* means that this task is the actual *workflow*,
       rather than a *branch* task. Defaults to *-1*.

    .. py:classattribute:: start_branch
       type: luigi.IntParameter

       First branch to process. Defaults to *0*.

    .. py:classattribute:: end_branch
       type: luigi.IntParameter

       First branch that is *not* processed (pythonic). Defaults to *-1*.

    .. py:classattribute:: branches
       type: law.CSVParameter

       Explicit list of branches to process. Empty default value.

    .. py:classattribute:: workflow_proxy_cls
       type: BaseWorkflowProxy

       Reference to the workflow proxy class associated to this workflow.

    .. py:classattribute:: workflow_complete
       type: None, callable

       Custom completion check that is used by the workflow's proxy when callable.

    .. py:classattribute:: output_collection_cls
       type: TargetCollection

       Configurable target collection class to use, such as
       :py:class:`target.collection.TargetCollection`, :py:class:`target.collection.FileCollection`
       or :py:class:`target.collection.SiblingFileCollection`.

    .. py:classattribute:: force_contiguous_branches
       type: bool

       Flag that denotes whether this workflow is forced to use contiguous branch numbers starting
       from 0. If *True*, an exception is raised when the branch map keys do not form such a range.

    .. py:classattribute:: workflow_property
       type: function

       Reference to :py:func:`workflow_property`.

    .. py:classattribute:: cached_workflow_property
       type: function

       Reference to :py:func:`cached_workflow_property`.

    .. py:classattribute:: workflow_run_decorators
       type: sequence, None

       Sequence of decorator functions that will be conveniently used to decorate the workflow
       proxy's run method. This way, there is no need to subclass and reset the
       :py:attr:`workflow_proxy_cls` just to add a decorator. The value is *None* by default.

    .. py:attribute:: workflow_cls
       type: law.task.Register

       Reference to the class of the realized workflow. This is especially helpful in case your
       derived class inherits from multiple workflows.

    .. py:attribute:: workflow_proxy
       type: BaseWorkflowProxy

       Reference to the underlying workflow proxy instance.

    .. py:attribute:: branch_map
       read-only
       type: dict

       Shorthand for :py:meth:`get_branch_map`.

    .. py:attribute:: branch_data
       read-only

       Shorthand for ``self.branch_map[self.branch]``.
    """

    workflow = luigi.Parameter(default=NO_STR, significant=False, description="the type of the "
        "workflow to use")
    acceptance = luigi.FloatParameter(default=1.0, significant=False, description="number of "
        "finished tasks to consider the task successful, relative fraction (<= 1) or absolute "
        "value (> 1), default: 1.0")
    tolerance = luigi.FloatParameter(default=0.0, significant=False, description="number of failed "
        "tasks to still consider the task successful, relative fraction (<= 1) or absolute value "
        "(> 1), default: 0.0")
    pilot = luigi.BoolParameter(significant=False, description="disable requirements of the "
        "workflow to let branch tasks resolve requirements on their own")
    branch = luigi.IntParameter(default=-1, description="the branch number/index to run this "
        "task for, -1 means this task is the workflow, default: -1")
    start_branch = luigi.IntParameter(default=NO_INT, description="the branch to start at, "
        "default: 0")
    end_branch = luigi.IntParameter(default=NO_INT, description="the branch to end at, NO_INT "
        "means end, default: NO_INT")
    branches = CSVParameter(default=[], significant=False, description="branches to use")

    workflow_proxy_cls = BaseWorkflowProxy

    workflow_complete = None

    output_collection_cls = None
    force_contiguous_branches = False

    workflow_property = None
    cached_workflow_property = None

    workflow_run_decorators = None

    exclude_index = True

    exclude_params_branch = {
        "workflow", "acceptance", "tolerance", "pilot", "start_branch", "end_branch", "branches",
    }
    exclude_params_workflow = {"branch"}

    def __init__(self, *args, **kwargs):
        super(BaseWorkflow, self).__init__(*args, **kwargs)

        # determine workflow proxy class to instantiate
        if self.is_workflow():
            classes = self.__class__.mro()
            for cls in classes:
                if not issubclass(cls, BaseWorkflow):
                    continue
                if not cls._defined_workflow_proxy:
                    continue
                if self.workflow in (NO_STR, cls.workflow_proxy_cls.workflow_type):
                    self.workflow = cls.workflow_proxy_cls.workflow_type
                    self.workflow_cls = cls
                    self.workflow_proxy = cls.workflow_proxy_cls(task=self)
                    logger.debug("created workflow proxy instance of type '{}'".format(
                        cls.workflow_proxy_cls.workflow_type))
                    break
            else:
                raise ValueError("unknown workflow type {}".format(self.workflow))

        # cached attributes for the workflow
        self._branch_map = None
        self._branch_tasks = None

        # cached attributes for branches
        self._workflow_task = None

    def __getattribute__(self, attr, proxy=True):
        return get_proxy_attribute(self, attr, proxy=proxy, super_cls=Task)

    def cli_args(self, exclude=None, replace=None):
        if exclude is None:
            exclude = set()

        if self.is_branch():
            exclude |= self.exclude_params_branch
        else:
            exclude |= self.exclude_params_workflow

        return super(BaseWorkflow, self).cli_args(exclude=exclude, replace=replace)

    def is_branch(self):
        """
        Returns whether or not this task refers to a *branch*.
        """
        return self.branch != -1

    def is_workflow(self):
        """
        Returns whether or not this task refers to the *workflow*.
        """
        return not self.is_branch()

    def as_branch(self, branch=0):
        """
        When this task refers to the workflow, a re-instantiated task with a certain *branch* and
        identical parameters is returned. Otherwise, the branch task itself is returned.
        """
        if self.is_branch():
            return self
        else:
            return self.req(self, branch=branch, _exclude=self.exclude_params_branch)

    def as_workflow(self):
        """
        When this task refers to a branch task, a re-instantiated task with ``branch=-1`` and
        identical parameters is returned. Otherwise, the workflow itself is returned.
        """
        if self.is_workflow():
            return self
        else:
            if self._workflow_task is None:
                self._workflow_task = self.req(self, branch=-1,
                    _exclude=self.exclude_params_workflow)
            return self._workflow_task

    @abstractmethod
    def create_branch_map(self):
        """
        Abstract method that must be overwritten by inheriting tasks to define the branch map.
        """
        return

    def _reset_branch_boundaries(self, branches=None):
        if self.is_branch():
            raise Exception("calls to _reset_branch_boundaries are forbidden for branch tasks")

        if branches is None:
            branches = list(self._branch_map.keys())

        min_branch = min(branches)
        max_branch = max(branches)

        # reset start_branch
        self.start_branch = max(min_branch, min(max_branch, self.start_branch))

        # reset end_branch
        if self.end_branch < 0:
            self.end_branch = sys.maxsize
        self.end_branch = max(self.start_branch, min(max_branch + 1, self.end_branch))

    def _reduce_branch_map(self):
        if self.is_branch():
            raise Exception("calls to _reduce_branch_map are forbidden for branch tasks")

        # reduce by start/end branch
        for b in list(self._branch_map.keys()):
            if not (self.start_branch <= b < self.end_branch):
                del self._branch_map[b]

        # reduce by branches
        if self.branches:
            # helper to expand slices, e.g. "1-3" -> 1,2,3 or "4-" -> 4,5,6,...
            def expand(b):
                if "-" in str(b):
                    parts = str(b).strip().split("-")
                    if len(parts) == 2:
                        start = int(parts[0]) if parts[0] else None
                        end = int(parts[1]) if parts[1] else None
                        return start, end
                return int(b)

            # determine branches to remove
            remove_branches = sorted(list(self._branch_map.keys()))
            for b in self.branches:
                b = expand(b)
                if isinstance(b, tuple):
                    start = b[0] if b[0] is not None else min(remove_branches)
                    end = b[1] if b[1] is not None else max(remove_branches)
                    for b in range(start, end + 1):
                        if b in remove_branches:
                            remove_branches.remove(b)
                else:
                    if b in remove_branches:
                        remove_branches.remove(b)

            # actual removal
            for b in remove_branches:
                del self._branch_map[b]

    def get_branch_map(self, reset_boundaries=True, reduce=True):
        """
        Creates and returns the branch map defined in :py:meth:`create_branch_map`. If
        *reset_boundaries* is *True*, the *start_branch* and *end_branch* attributes are rearranged
        to not exceed the actual branch map length. If *reduce* is *True* and an explicit list of
        branch numbers was set, the branch map is filtered accordingly. The branch map is cached.
        """
        if self.is_branch():
            return self.as_workflow().get_branch_map(reset_boundaries=reset_boundaries,
                reduce=reduce)
        else:
            if self._branch_map is None:
                self._branch_map = self.create_branch_map()

                # some type and sanity checks
                if isinstance(self._branch_map, (list, tuple)):
                    self._branch_map = dict(enumerate(self._branch_map))
                elif isinstance(self._branch_map, six.integer_types):
                    self._branch_map = dict(enumerate(range(self._branch_map)))
                elif self.force_contiguous_branches:
                    n = len(self._branch_map)
                    if set(self._branch_map.keys()) != set(range(n)):
                        raise ValueError("branch map keys must constitute contiguous range "
                            "[0, {})".format(n))
                else:
                    for branch in self._branch_map:
                        if not isinstance(branch, six.integer_types) or branch < 0:
                            raise ValueError("branch map keys must be non-negative integers, got "
                                "'{}' ({})".format(branch, type(branch).__name__))

                # post-process
                if reset_boundaries:
                    self._reset_branch_boundaries()
                if reduce:
                    self._reduce_branch_map()

            return self._branch_map

    @property
    def branch_map(self):
        return self.get_branch_map()

    @property
    def branch_data(self):
        if self.is_workflow():
            raise Exception("calls to branch_data are forbidden for workflow tasks")
        elif self.branch not in self.branch_map:
            raise ValueError("invalid branch '{}', not found in branch map".format(self.branch))

        return self.branch_map[self.branch]

    def get_branch_tasks(self):
        """
        Returns a dictionary that maps branch numbers to instantiated branch tasks. As this might be
        computationally intensive, the return value is cached.
        """
        if self.is_branch():
            return self.as_workflow().get_branch_tasks()
        else:
            if self._branch_tasks is None:
                branch_map = self.get_branch_map()
                if branch_map is None:
                    raise AttributeError("workflow task '{}' requires a branch_map".format(self))

                self._branch_tasks = OrderedDict()
                for b in branch_map:
                    self._branch_tasks[b] = self.req(self, branch=b,
                        _exclude=self.exclude_params_branch)

            return self._branch_tasks

    def workflow_requires(self):
        """
        Hook to add workflow requirements. This method is expected to return a dictionary. When
        this method is called from a branch task, an exception is raised.
        """
        if self.is_branch():
            raise Exception("calls to workflow_requires are forbidden for branch tasks")

        return OrderedDict()

    def workflow_input(self):
        """
        Returns the output targets of all workflow requirements, comparable to the normal
        ``input()`` method of plain tasks. When this method is called from a branch task, an
        exception is raised.
        """
        if self.is_branch():
            raise Exception("calls to workflow_input are forbidden for branch tasks")

        return luigi.task.getpaths(self.workflow_proxy.requires())

    def requires_from_branch(self):
        """
        Returns the requirements defined in the standard ``requires()`` method, but called in the
        context of the workflow. This method is only recommended in case all required tasks that
        would normally take a branch number, are intended to be instantiated with ``branch=-1``.
        When this method is called from a branch task, an exception is raised.
        """
        if self.is_branch():
            raise Exception("calls to requires_from_branch are forbidden for branch tasks")

        return self.__class__.requires(self)
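The docstring above is easiest to read alongside a concrete, purely illustrative subclass: create_branch_map() defines the branches, and every branch task reads its payload via branch_data. The class name, file pattern, and payload structure are assumptions made for this sketch.

class MyWorkflow(BaseWorkflow):

    def create_branch_map(self):
        # non-negative integer keys mapped to arbitrary per-branch payloads
        return {i: {"seed": 1000 + i} for i in range(5)}

    def output(self):
        # one output target per branch number
        return LocalFileTarget("data_{}.json".format(self.branch))

    def run(self):
        # branch_data is a shorthand for self.branch_map[self.branch]
        # dump() picks law's JSON formatter from the ".json" extension (assumption)
        self.output().dump({"seed": self.branch_data["seed"]})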
Example #6
class Task(BaseTask):

    log_file = luigi.Parameter(default=NO_STR,
                               significant=False,
                               description="a custom log file, "
                               "default: <task.default_log_file>")
    print_deps = CSVParameter(
        default=(),
        significant=False,
        description="print task dependencies "
        "but do not run any task; this CSV parameter accepts a single integer value which sets the "
        "task recursion depth (0 means non-recursive)")
    print_status = CSVParameter(
        default=(),
        significant=False,
        description="print the task status "
        "but do not run any task; this CSV parameter accepts up to three values: 1. the task "
        "recursion depth (0 means non-recursive), 2. the depth of the status text of target "
        "collections (default: 0), 3. a flag that is passed to the status text creation (default: "
        "'')")
    print_output = CSVParameter(
        default=(),
        significant=False,
        description="print a flat list of "
        "output targets but do not run any task; this CSV parameter accepts a single integer value "
        "which sets the task recursion depth (0 means non-recursive")
    remove_output = CSVParameter(
        default=(),
        significant=False,
        description="remove task outputs "
        "but do not run any task; this CSV parameter accepts up to three values: 1. the task "
        "recursion depth (0 means non-recursive), 2. one of the modes 'i' (interactive), 'a' "
        "(all), 'd' (dry run) (default: 'i'), 3. a flag that decides whether outputs of external "
        "tasks should be removed (default: False)")
    fetch_output = CSVParameter(
        default=(),
        significant=False,
        description="copy all task outputs "
        "into a local directory but do not run any task; this CSV parameter accepts up to four "
        "values: 1. the task recursion depth (0 means non-recursive), 2. one of the modes 'i' "
        "(interactive), 'a' (all), 'd' (dry run) (default: 'i'), 3. the target directory (default: "
        "'.'), 4. a flag that decides whether outputs of external tasks should be fetched "
        "(default: False)")

    interactive_params = [
        "print_deps",
        "print_status",
        "print_output",
        "remove_output",
        "fetch_output",
    ]

    message_cache_size = 10

    exclude_index = True
    exclude_params_req = set()
    exclude_params_repr = set()

    @classmethod
    def req_params(cls, inst, _exclude=None, _prefer_cli=None, **kwargs):
        _exclude = set() if _exclude is None else set(make_list(_exclude))

        # always exclude interactive parameters
        _exclude |= set(inst.interactive_params)

        return super(Task, cls).req_params(inst,
                                           _exclude=_exclude,
                                           _prefer_cli=_prefer_cli,
                                           **kwargs)

    def __init__(self, *args, **kwargs):
        super(Task, self).__init__(*args, **kwargs)

        # cache for messages published to the scheduler
        self._message_cache = []

        # cache for the last progress published to the scheduler
        self._last_progress_percentage = None

    @property
    def default_log_file(self):
        return "-"

    def is_root_task(self):
        return root_task() == self

    def publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)
        print(msg)
        sys.stdout.flush()

        self._publish_message(*args)

    def _publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)

        # add to message cache and handle overflow
        msg = uncolored(msg)
        self._message_cache.append(msg)
        if self.message_cache_size >= 0:
            end = max(len(self._message_cache) - self.message_cache_size, 0)
            del self._message_cache[:end]

        # set status message using the current message cache
        self.set_status_message("\n".join(self._message_cache))

    def create_message_stream(self, *args, **kwargs):
        return TaskMessageStream(self, *args, **kwargs)

    @contextmanager
    def publish_step(self,
                     msg,
                     success_message="done",
                     fail_message="failed",
                     runtime=False):
        self.publish_message(msg)
        success = False
        t0 = time.time()
        try:
            yield
            success = True
        finally:
            msg = success_message if success else fail_message
            if runtime:
                diff = time.time() - t0
                msg = "{} (took {})".format(msg, human_duration(seconds=diff))
            self.publish_message(msg)

    def publish_progress(self, percentage):
        percentage = int(math.floor(percentage))
        if percentage != self._last_progress_percentage:
            self._last_progress_percentage = percentage
            self.set_progress_percentage(percentage)

    def create_progress_callback(self, n_total, reach=(0, 100)):
        def make_callback(n, start, end):
            def callback(i):
                self.publish_progress(start + (i + 1) / float(n) *
                                      (end - start))

            return callback

        if isinstance(n_total, (list, tuple)):
            width = 100. / len(n_total)
            reaches = [(width * i, width * (i + 1))
                       for i in range(len(n_total))]
            return n_total.__class__(
                make_callback(n, *r) for n, r in zip(n_total, reaches))
        else:
            return make_callback(n_total, *reach)

    def cli_args(self, exclude=None, replace=None):
        exclude = set() if exclude is None else set(make_list(exclude))

        # always exclude interactive parameters
        exclude |= set(self.interactive_params)

        return super(Task, self).cli_args(exclude=exclude, replace=replace)

    def __repr__(self):
        return self.repr(color=False)

    def repr(self, all_params=False, color=None):
        if color is None:
            cfg = Config.instance()
            color = cfg.get_expanded_boolean("task", "colored_repr")

        family = self._repr_family(self.get_task_family(), color=color)

        parts = [
            self._repr_param(*pair, color=color)
            for pair in self._repr_params(all_params=all_params)
        ] + [
            self._repr_flag(flag, color=color) for flag in self._repr_flags()
        ]

        return "{}({})".format(family, ", ".join(parts))

    def colored_repr(self, all_params=False):
        # deprecation warning until v0.1
        logger.warning(
            "the use of {0}.colored_repr() is deprecated, please use "
            "{0}.repr(color=True) instead".format(self.__class__.__name__))

        return self.repr(all_params=all_params, color=True)

    def _repr_params(self, all_params=False):
        # build key value pairs of all significant parameters
        params = self.get_params()

        exclude = set()
        if not all_params:
            exclude |= self.exclude_params_repr
            exclude |= self.inst_exclude_params_repr()
            exclude |= set(self.interactive_params)

        pairs = []
        for name, param in params:
            if param.significant and not multi_match(name, exclude):
                value = getattr(self, name)
                pairs.append((name, param.serialize(value)))

        return pairs

    def _repr_flags(self):
        return []

    def inst_exclude_params_repr(self):
        return set()

    @classmethod
    def _repr_family(cls, family, color=False):
        return colored(family, "green") if color else family

    @classmethod
    def _repr_param(cls, name, value, color=False):
        return "{}={}".format(
            colored(name, color="blue", style="bright") if color else name,
            value)

    @classmethod
    def _repr_flag(cls, name, color=False):
        return colored(name, color="magenta") if color else name

    def _print_deps(self, args):
        return print_task_deps(self, *args)

    def _print_status(self, args):
        return print_task_status(self, *args)

    def _print_output(self, args):
        return print_task_output(self, *args)

    def _remove_output(self, args):
        return remove_task_output(self, *args)

    def _fetch_output(self, args):
        import law.target.remote as ltr
        with patch_object(ltr, "global_retries", 0, lock=True):
            return fetch_task_output(self, *args)

    def localize_input(self, *args, **kwargs):
        return localize_file_targets(self.input(), *args, **kwargs)

    def localize_output(self, *args, **kwargs):
        return localize_file_targets(self.output(), *args, **kwargs)
Example #7
class ARCWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = ARCWorkflowProxy

    arc_workflow_run_decorators = None
    arc_job_manager_defaults = None
    arc_job_file_factory_defaults = None

    arc_ce = CSVParameter(default=(),
                          significant=False,
                          description="target arc computing "
                          "element(s), default: ()")

    exclude_params_branch = {"arc_ce"}

    exclude_index = True

    @abstractmethod
    def arc_output_directory(self):
        return None

    @abstractmethod
    def arc_bootstrap_file(self):
        return None

    def arc_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def arc_stageout_file(self):
        return None

    def arc_workflow_requires(self):
        return OrderedDict()

    def arc_output_postfix(self):
        self.get_branch_map()
        if self.branches:
            return "_" + "_".join(self.branches)
        else:
            return "_{}To{}".format(self.start_branch, self.end_branch)

    def arc_output_uri(self):
        return self.arc_output_directory().url()

    def arc_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.arc_job_manager_defaults, kwargs)
        return ARCJobManager(**kwargs)

    def arc_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.arc_job_file_factory_defaults, kwargs)
        return ARCJobFileFactory(**kwargs)

    def arc_job_config(self, config, job_num, branches):
        return config

    def arc_dump_intermediate_submission_data(self):
        return True

    def arc_post_submit_delay(self):
        return self.poll_interval * 60

    def arc_use_local_scheduler(self):
        return True

    def arc_cmdline_args(self):
        return []
Example #8
class ARCWorkflow(BaseRemoteWorkflow):

    workflow_proxy_cls = ARCWorkflowProxy

    arc_workflow_run_decorators = None
    arc_job_manager_defaults = None
    arc_job_file_factory_defaults = None

    arc_ce = CSVParameter(default=(),
                          significant=False,
                          description="target arc computing "
                          "element(s); default: empty")

    arc_job_kwargs = []
    arc_job_kwargs_submit = ["arc_ce"]
    arc_job_kwargs_cancel = None
    arc_job_kwargs_cleanup = None
    arc_job_kwargs_query = None

    exclude_params_branch = {"arc_ce"}

    exclude_index = True

    @abstractmethod
    def arc_output_directory(self):
        return None

    @abstractmethod
    def arc_bootstrap_file(self):
        return None

    def arc_wrapper_file(self):
        return law_src_path("job", "bash_wrapper.sh")

    def arc_stageout_file(self):
        return None

    def arc_workflow_requires(self):
        return DotDict()

    def arc_output_postfix(self):
        return "_" + self.get_branches_repr()

    def arc_output_uri(self):
        return self.arc_output_directory().url()

    def arc_create_job_manager(self, **kwargs):
        kwargs = merge_dicts(self.arc_job_manager_defaults, kwargs)
        return ARCJobManager(**kwargs)

    def arc_create_job_file_factory(self, **kwargs):
        # job file factory config priority: kwargs > class defaults
        kwargs = merge_dicts({}, self.arc_job_file_factory_defaults, kwargs)
        return ARCJobFileFactory(**kwargs)

    def arc_job_config(self, config, job_num, branches):
        return config

    def arc_use_local_scheduler(self):
        return True

    def arc_cmdline_args(self):
        return {}
Example #9
class Task(BaseTask):

    log_file = luigi.Parameter(default=NO_STR,
                               significant=False,
                               description="a custom log file, "
                               "default: <task.default_log_file>")
    print_deps = CSVParameter(
        default=[],
        significant=False,
        description="print task dependencies, "
        "do not run any task, the passed numbers set the recursion depth (0 means non-recursive)"
    )
    print_status = CSVParameter(
        default=[],
        significant=False,
        description="print the task status, "
        "do not run any task, the passed numbers set the recursion depth (0 means non-recursive) "
        "and optionally the collection depth")
    remove_output = CSVParameter(
        default=[],
        significant=False,
        description="remove all outputs, "
        "do not run any task, the passed number sets the recursion depth (0 means non-recursive)"
    )

    interactive_params = ["print_deps", "print_status", "remove_output"]

    message_cache_size = 10

    exclude_index = True
    exclude_params_req = set(interactive_params)

    def __init__(self, *args, **kwargs):
        super(Task, self).__init__(*args, **kwargs)

        # cache for messages published to the scheduler
        self._message_cache = []

        # cache for the last progress published to the scheduler
        self._last_progress_percentage = None

    @property
    def default_log_file(self):
        return "-"

    def publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)
        print(msg)
        sys.stdout.flush()

        self._publish_message(*args)

    def _publish_message(self, *args):
        msg = " ".join(str(arg) for arg in args)

        # add to message cache and handle overflow
        msg = uncolored(msg)
        self._message_cache.append(msg)
        if self.message_cache_size >= 0:
            end = max(len(self._message_cache) - self.message_cache_size, 0)
            del self._message_cache[:end]

        # set status message using the current message cache
        self.set_status_message("\n".join(self._message_cache))

    def create_message_stream(self, *args, **kwargs):
        return TaskMessageStream(self, *args, **kwargs)

    @contextmanager
    def publish_step(self, msg, success_message="done", fail_message="failed"):
        self.publish_message(msg)
        success = False
        try:
            yield
            success = True
        finally:
            self.publish_message(success_message if success else fail_message)

    def publish_progress(self, percentage, precision=0):
        percentage = round(percentage, precision)
        if percentage != self._last_progress_percentage:
            self._last_progress_percentage = percentage
            self.set_progress_percentage(percentage)

    def create_progress_callback(self, n_total, reach=(0, 100)):
        def make_callback(n, start, end):
            def callback(i):
                self.publish_progress(start + (i + 1) / float(n) *
                                      (end - start))

            return callback

        if isinstance(n_total, (list, tuple)):
            width = 100. / len(n_total)
            reaches = [(width * i, width * (i + 1))
                       for i in range(len(n_total))]
            return n_total.__class__(
                make_callback(n, *r) for n, r in zip(n_total, reaches))
        else:
            return make_callback(n_total, *reach)

    def colored_repr(self, color=True):
        family = self._repr_family(self.task_family, color=color)

        parts = [
            self._repr_param(*pair, color=color)
            for pair in self._repr_params(color=color)
        ]
        parts += [
            self._repr_flag(flag, color=color)
            for flag in self._repr_flags(color=color)
        ]

        return "{}({})".format(family, ", ".join(parts))

    def _repr_params(self, color=True):
        # build key value pairs of all significant parameters
        params = self.get_params()
        param_values = self.get_param_values(params, [], self.param_kwargs)
        param_objs = dict(params)

        pairs = []
        for param_name, param_value in param_values:
            if param_objs[param_name].significant:
                pairs.append((param_name,
                              param_objs[param_name].serialize(param_value)))

        return pairs

    def _repr_flags(self, color=True):
        return []

    @classmethod
    def _repr_family(cls, family, color=True):
        return colored(family, "green") if color else family

    @classmethod
    def _repr_param(cls, name, value, color=True):
        return "{}={}".format(
            colored(name, color="blue", style="bright") if color else name,
            value)

    @classmethod
    def _repr_flag(cls, name, color=True):
        return colored(name, color="magenta") if color else name

    def _print_deps(self, *args, **kwargs):
        return print_task_deps(self, *args, **kwargs)

    def _print_status(self, *args, **kwargs):
        return print_task_status(self, *args, **kwargs)

    def _remove_output(self, *args, **kwargs):
        return remove_task_output(self, *args, **kwargs)
Example #10
class Task(six.with_metaclass(Register, BaseTask)):

    log_file = luigi.Parameter(default=NO_STR, significant=False, description="a custom log file; "
        "default: <task.default_log_file>")
    print_deps = CSVParameter(default=(), significant=False, description="print task dependencies "
        "but do not run any task; this CSV parameter accepts a single integer value which sets the "
        "task recursion depth (0 means non-recursive)")
    print_status = CSVParameter(default=(), significant=False, description="print the task status "
        "but do not run any task; this CSV parameter accepts up to three values: 1. the task "
        "recursion depth (0 means non-recursive), 2. the depth of the status text of target "
        "collections (default: 0), 3. a flag that is passed to the status text creation (default: "
        "'')")
    print_output = CSVParameter(default=(), significant=False, description="print a flat list of "
        "output targets but do not run any task; this CSV parameter accepts up to two values: 1. "
        "the task recursion depth (0 means non-recursive), 2. a boolean flag that decides whether "
        "paths of file targets should contain file system schemes (default: True)")
    remove_output = CSVParameter(default=(), significant=False, description="remove task outputs "
        "but do not run any task by default; this CSV parameter accepts up to three values: 1. the "
        "task recursion depth (0 means non-recursive), 2. one of the modes 'i' (interactive), 'a' "
        "(all), 'd' (dry run) (default: 'i'), 3. a boolean flag that decides whether the task is "
        "run after outputs were removed (default: False)")
    fetch_output = CSVParameter(default=(), significant=False, description="copy all task outputs "
        "into a local directory but do not run any task; this CSV parameter accepts up to four "
        "values: 1. the task recursion depth (0 means non-recursive), 2. one of the modes 'i' "
        "(interactive), 'a' (all), 'd' (dry run) (default: 'i'), 3. the target directory (default: "
        "'.'), 4. a boolean flag that decides whether external outputs and outputs of external "
        "tasks should be fetched (default: False)")

    interactive_params = [
        "print_deps", "print_status", "print_output", "remove_output", "fetch_output",
    ]

    # cache size for published messages
    message_cache_size = 10

    # force skipping this task when remove_output is set to "all" mode
    skip_output_removal = False

    exclude_index = True
    exclude_params_req = set()
    exclude_params_repr = set()

    @classmethod
    def req_params(cls, inst, _exclude=None, _prefer_cli=None, **kwargs):
        _exclude = set() if _exclude is None else set(make_list(_exclude))

        # always exclude interactive parameters
        _exclude |= set(inst.interactive_params)

        return super(Task, cls).req_params(inst, _exclude=_exclude, _prefer_cli=_prefer_cli,
            **kwargs)

    def __init__(self, *args, **kwargs):
        super(Task, self).__init__(*args, **kwargs)

        # cache for messages published to the scheduler
        self._message_cache = []

        # cache for the last progress published to the scheduler
        self._last_progress_percentage = None

    @property
    def default_log_file(self):
        return "-"

    def is_root_task(self):
        return root_task() == self

    def publish_message(self, msg, scheduler=True):
        msg = str(msg)

        sys.stdout.write(msg + "\n")
        sys.stdout.flush()

        if scheduler:
            self._publish_message(msg)

    def _publish_message(self, msg):
        msg = str(msg)

        # add to message cache and handle overflow
        msg = uncolored(msg)
        self._message_cache.append(msg)
        if self.message_cache_size >= 0:
            end = max(len(self._message_cache) - self.message_cache_size, 0)
            del self._message_cache[:end]

        # set status message using the current message cache
        if callable(getattr(self, "set_status_message", None)):
            self.set_status_message("\n".join(self._message_cache))
        else:
            logger.warning("set_status_message not set, cannot send task message to scheduler")

    def create_message_stream(self, *args, **kwargs):
        return TaskMessageStream(self, *args, **kwargs)

    @contextmanager
    def publish_step(self, msg, success_message="done", fail_message="failed", runtime=True,
            scheduler=True):
        self.publish_message(msg, scheduler=scheduler)
        success = False
        t0 = time.time()
        try:
            yield
            success = True
        finally:
            msg = success_message if success else fail_message
            if runtime:
                diff = time.time() - t0
                msg = "{} (took {})".format(msg, human_duration(seconds=diff))
            self.publish_message(msg, scheduler=scheduler)

    def publish_progress(self, percentage, precision=1):
        percentage = int(round_discrete(percentage, precision, "floor"))
        if percentage != self._last_progress_percentage:
            self._last_progress_percentage = percentage

            if callable(getattr(self, "set_progress_percentage", None)):
                self.set_progress_percentage(percentage)
            else:
                logger.warning("set_progress_percentage not set, cannot send task progress to "
                    "scheduler")

    def create_progress_callback(self, n_total, reach=(0, 100), precision=1):
        def make_callback(n, start, end):
            def callback(i):
                self.publish_progress(start + (i + 1) / float(n) * (end - start), precision)
            return callback

        if isinstance(n_total, (list, tuple)):
            width = 100. / len(n_total)
            reaches = [(width * i, width * (i + 1)) for i in range(len(n_total))]
            return n_total.__class__(make_callback(n, *r) for n, r in zip(n_total, reaches))
        else:
            return make_callback(n_total, *reach)

    def cli_args(self, exclude=None, replace=None):
        exclude = set() if exclude is None else set(make_list(exclude))

        # always exclude interactive parameters
        exclude |= set(self.interactive_params)

        return super(Task, self).cli_args(exclude=exclude, replace=replace)

    def __repr__(self):
        color = Config.instance().get_expanded_boolean("task", "colored_repr")
        return self.repr(color=color)

    def __str__(self):
        color = Config.instance().get_expanded_boolean("task", "colored_str")
        return self.repr(color=color)

    def repr(self, all_params=False, color=None, **kwargs):
        if color is None:
            color = Config.instance().get_expanded_boolean("task", "colored_repr")

        family = self._repr_family(self.get_task_family(), color=color, **kwargs)

        parts = [
            self._repr_param(name, value, color=color, **kwargs)
            for name, value in six.iteritems(self._repr_params(all_params=all_params))
        ] + [
            self._repr_flag(flag, color=color, **kwargs)
            for flag in self._repr_flags()
        ]

        return "{}({})".format(family, ", ".join(parts))

    def _repr_params(self, all_params=False):
        # determine parameters to exclude
        exclude = set()
        if not all_params:
            exclude |= self.exclude_params_repr
            exclude |= set(self.interactive_params)

        # build a map "name -> value" for all significant parameters
        params = OrderedDict()
        for name, param in self.get_params():
            if param.significant and not multi_match(name, exclude):
                params[name] = getattr(self, name)

        return params

    def _repr_flags(self):
        return []

    def _repr_family(self, family, color=False, **kwargs):
        return colored(family, "green") if color else family

    def _repr_param(self, name, value, color=False, serialize=True, **kwargs):
        # try to serialize first unless explicitly disabled
        if serialize:
            param = getattr(self.__class__, name, no_value)
            if param != no_value:
                value = param.serialize(value)

        return "{}={}".format(colored(name, color="blue", style="bright") if color else name, value)

    def _repr_flag(self, name, color=False, **kwargs):
        return colored(name, color="magenta") if color else name

    def _print_deps(self, args):
        return print_task_deps(self, *args)

    def _print_status(self, args):
        return print_task_status(self, *args)

    def _print_output(self, args):
        return print_task_output(self, *args)

    def _remove_output(self, args):
        return remove_task_output(self, *args)

    def _fetch_output(self, args):
        return fetch_task_output(self, *args)

    @classmethod
    def _law_run_inst(cls, inst, _exclude=None, _replace=None, _global=None, _run_kwargs=None):
        # get the cli arguments
        args = inst.cli_args(exclude=_exclude, replace=_replace)
        args = sum((make_list(tpl) for tpl in args.items()), [])

        # add global parameters when given
        if _global:
            args.extend([str(arg) for arg in make_list(_global)])

        # build the full command
        cmd = [cls.get_task_family()] + args

        # run it
        return law_run(cmd, **(_run_kwargs or {}))

    @classmethod
    def law_run_inst(cls, _exclude=None, _replace=None, _global=None, _run_kwargs=None, **kwargs):
        # create a new instance
        inst = cls(**kwargs)

        return cls._law_run_inst(inst, _exclude=_exclude, _replace=_replace, _global=_global,
            _run_kwargs=_run_kwargs)

    def law_run(self, _exclude=None, _replace=None, _global=None, _run_kwargs=None, **kwargs):
        # when kwargs are given, create a new instance
        inst = self.req(self, **kwargs) if kwargs else self

        return self._law_run_inst(inst, _exclude=_exclude, _replace=_replace, _global=_global,
            _run_kwargs=_run_kwargs)

    def localize_input(self, *args, **kwargs):
        return localize_file_targets(self.input(), *args, **kwargs)

    def localize_output(self, *args, **kwargs):
        return localize_file_targets(self.output(), *args, **kwargs)
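
A minimal usage sketch for the law_run_inst / law_run helpers above; MyTask and its "version" parameter are hypothetical:

    # run a fresh instance built from keyword arguments, forwarding a global option
    MyTask.law_run_inst(version="v1", _global=["--workers", "2"])

    # re-run an existing instance with a single parameter replaced
    task = MyTask(version="v1")
    task.law_run(version="v2")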
Beispiel #11
0
class PlotScaleFactor(PlotTask):
    hist_name = "sf"

    shifts = CSVParameter(default=["*"], description="Systematic shifts to plot."
        " Allows globbing.")
    fix_normalization = FitScaleFactors.fix_normalization
    norm_to_nominal = luigi.BoolParameter()
    is_c_flavour = luigi.BoolParameter()

    b_taggers = CSVParameter(default=["deepcsv"])
    iterations = CSVParameter(default=[0])
    versions = CSVParameter(default=[None], description="Scale factor versions to compare. "
        "The same version is used for all required tasks.")

    def __init__(self, *args, **kwargs):
        super(PlotScaleFactor, self).__init__(*args, **kwargs)
        # identifiers used in file names
        self.shifts_identifier = "_".join(self.shifts)
        self.file_identifiers = list(self.b_taggers) + [self.shifts_identifier]
        if self.is_c_flavour:
            self.file_identifiers.append("c")
        if self.norm_to_nominal:
            self.file_identifiers.append("normed")

        if self.is_c_flavour:
            all_shifts = MeasureCScaleFactors.shifts
        else:
            all_shifts = MeasureScaleFactors.shifts

        skip_shifts = ["nominal"] + list(jes_total_shifts)
        all_shifts = [shift for shift in all_shifts if shift not in skip_shifts]
        # get matching shifts
        self.shifts = [shift for shift in all_shifts if law.util.multi_match(shift, self.shifts)]

        # Check if multiple shifts are present and thus have to be combined (envelope)
        self.multiple_shifts = len(self.shifts) > 2

        if not self.is_c_flavour:
            # make sure the nominal histograms are processed first
            self.shifts.insert(0, "nominal")

        if len(self.shifts) != len(set(self.shifts)):
            raise Exception("Duplicate shift in {}".format(self.shifts))

    def requires(self):
        reqs = OrderedDict()
        measure_task = MeasureCScaleFactors if self.is_c_flavour else MeasureScaleFactors
        for config in itertools.product(self.b_taggers, self.iterations, self.versions):
            b_tagger, iteration, version = config

            reqs[config] = OrderedDict()
            for shift in self.shifts:
                reqs[config][shift] = {
                    "fit": FitScaleFactors.req(self, shift=shift, b_tagger=b_tagger, iteration=iteration,
                        version=version if version is not None else self.get_version(FitScaleFactors),
                        _prefer_cli=["version"]),
                    "hist": measure_task.req(self, shift=shift, b_tagger=b_tagger, iteration=iteration,
                        version=version if version is not None else self.get_version(measure_task),
                        _prefer_cli=["version"])
                    }
            if self.fix_normalization:
                reqs[config]["norm"] = MergeScaleFactorWeights.req(self, normalize_cerrs=self.is_c_flavour,
                    b_tagger=b_tagger, iteration=iteration,
                    version=version if version is not None else self.get_version(MergeScaleFactorWeights),
                    _prefer_cli=["version"])
        return reqs

    def output(self):
        filename = "plots_{}.tgz".format("_".join(self.file_identifiers))
        return self.local_target(filename)

    def run(self):
        import ROOT

        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        inp = self.input()
        outp = self.output()

        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        plots = {}

        if self.norm_to_nominal and self.shifts[0] != "nominal":
            raise KeyError("'norm_to_nominal' is set to true, but no nominal values found.")

        for color_idx, (config, config_input) in enumerate(inp.items()):
            b_tagger, iteration, version = config

            config_ids = [b_tagger]
            if len(self.iterations) > 1:
                config_ids.append("iteration {}".format(iteration))
            if len(self.versions) > 1:
                config_ids.append("version {}".format(version))
            config_id = ", ".join(config_ids)

            nominal_hists = {}
            nominal_fit_hists = {}
            # combined errors for multiple shifts
            up_shifted_hists = defaultdict(dict)
            up_shifted_fit_hists = defaultdict(dict)
            down_shifted_hists = defaultdict(dict)
            down_shifted_fit_hists = defaultdict(dict)

            if self.fix_normalization:
                normalization_input = config_input.pop("norm")
            for shift_idx, (shift, inp_target) in enumerate(config_input.items()):
                # get scaling factors for normalization
                if self.fix_normalization:
                    norm_factors = normalization_input.load()[shift]

                with inp_target["fit"]["sf"].load("r") as fit_file, \
                        inp_target["hist"]["scale_factors"].load("r") as hist_file:
                    for category_key in fit_file.GetListOfKeys():
                        category_name = category_key.GetName()
                        if not self.config_inst.has_category(category_name):
                            raise KeyError("Unknown category {}".format(category_name))

                        category = self.config_inst.get_category(category_name)
                        pt_range = category.get_aux("pt")
                        eta_range = category.get_aux("eta")
                        region = category.get_aux("region")

                        # same category name for different b-taggers
                        if len(self.b_taggers) > 1:
                            plot_category = category.name.replace("__" + b_tagger, "")
                        else:
                            plot_category = category.name

                        fit_category_dir = fit_file.Get(category_name)
                        fit_hist = fit_category_dir.Get(self.hist_name)

                        hist_category_dir = hist_file.Get(category_name)
                        hist = hist_category_dir.Get(self.hist_name)
                        # truncate first bin
                        hist = self.rebin_hist(hist, region, b_tagger=b_tagger, truncate=True)

                        # normalize histogram if required
                        # fit histograms are already normalized in FitScaleFactors
                        if self.fix_normalization and not self.is_c_flavour:
                            hist.Scale(norm_factors[category_name])

                        if shift == "nominal":
                            # make sure histograms are not cleaned up when the file is closed
                            nominal_fit_hists[plot_category] = fit_hist.Clone()
                            nominal_fit_hists[plot_category].SetDirectory(0)

                            nominal_hists[plot_category] = hist.Clone()
                            nominal_hists[plot_category].SetDirectory(0)

                        # for c-jets, there is no nominal histogram
                        # Instead, all nominal values are set to 1
                        if shift_idx == 0 and self.is_c_flavour:
                            nominal_fit_hist = fit_hist.Clone()
                            for bin_idx in range(1, nominal_fit_hist.GetNbinsX() + 1):
                                nominal_fit_hist.SetBinContent(bin_idx, 1.0)

                            nominal_fit_hist.SetDirectory(0)
                            nominal_fit_hists[plot_category] = nominal_fit_hist

                        if shift != "nominal" and self.multiple_shifts:
                            # collect all shifted fit histograms to build envelope later
                            sys, direction = shift.rsplit("_", 1)
                            if direction == "up":
                                up_shifted_fit_hists[plot_category][sys] = fit_hist.Clone()
                                up_shifted_fit_hists[plot_category][sys].SetDirectory(0)
                                up_shifted_hists[plot_category][sys] = hist.Clone()
                                up_shifted_hists[plot_category][sys].SetDirectory(0)
                            elif direction == "down":
                                down_shifted_fit_hists[plot_category][sys] = fit_hist.Clone()
                                down_shifted_fit_hists[plot_category][sys].SetDirectory(0)
                                down_shifted_hists[plot_category][sys] = hist.Clone()
                                down_shifted_hists[plot_category][sys].SetDirectory(0)
                            else:
                                raise ValueError("Unknown direction {}".format(direction))

                        if self.norm_to_nominal:
                            fit_hist.Divide(nominal_fit_hists[plot_category])

                        # get same category key for all b-taggers
                        if plot_category in plots:
                            plot = plots[plot_category]
                        else:
                            plot = ROOTPlot(category.name, category.name)
                            plot.create_pads()
                            plots[plot_category] = plot
                        plot.cd(0, 0)
                        fit_hist.GetXaxis().SetRangeUser(-.1, 1.0)
                        y_min = 0.6 if self.norm_to_nominal else 0.
                        y_max = 1.4 if self.norm_to_nominal else 2.
                        fit_hist.GetYaxis().SetRangeUser(y_min, y_max)

                        if len(self.b_taggers) == 1:
                            title = self.config_inst.get_aux("btaggers")[b_tagger]["label"] + " discriminator"
                        else:
                            title = "B-Tag Discriminant"

                        fit_hist.GetXaxis().SetTitle(title)
                        fit_hist.GetYaxis().SetTitle("SF")

                        if shift_idx == 0:
                            if not self.multiple_shifts or shift == "nominal":
                                # only draw this fit histogram if it is not part of a shifted envelope
                                plot.draw({"sf": fit_hist}, line_color=1, add_to_legend=False)
                            line = ROOT.TLine(0., 0., 0., 2.)
                            line.SetLineStyle(9)
                            plot.draw({"line": line}, add_same_option=False, line_color=1, add_to_legend=False)

                            # add category information to plot
                            if not np.isinf(pt_range[1]):
                                text = r"#splitline{%d < p_{T} < %d}{%.1f < |#eta| < %.1f}" % \
                                    (pt_range[0], pt_range[1], eta_range[0], eta_range[1])
                            else:
                                text = r"#splitline{p_{T} > %d}{%.1f < |#eta| < %.1f}" % \
                                    (pt_range[0], eta_range[0], eta_range[1])
                            plot.draw_text(text)
                        elif not self.multiple_shifts:
                            plot.draw({shift: fit_hist}, line_color=None)

                        if shift == "nominal" and not self.norm_to_nominal:
                            plot.draw({config_id + ", nominal": hist}, line_color=1,
                                add_to_legend=(len(self.shifts) != 1))

            if self.multiple_shifts:
                for plot_category in plots:
                    plot = plots[plot_category]
                    plot.cd(0, 0)

                    # build shifted histograms
                    fit_hist_down, fit_hist_up = build_hist_envelope(nominal_fit_hists[plot_category],
                        up_shifted_fit_hists[plot_category], down_shifted_fit_hists[plot_category],
                        envelope_as_errors=False)

                    hist_down, hist_up = build_hist_envelope(nominal_hists[plot_category],
                        up_shifted_hists[plot_category], down_shifted_hists[plot_category],
                        envelope_as_errors=False)

                    if self.norm_to_nominal:
                        fit_hist_up.Divide(nominal_fit_hists[plot_category])
                        fit_hist_down.Divide(nominal_fit_hists[plot_category])
                        hist_up.Divide(nominal_hists[plot_category])
                        hist_down.Divide(nominal_hists[plot_category])

                    plot.draw({config_id + ", up": fit_hist_up}, line_color=None)
                    plot.draw({config_id + ", down": fit_hist_down}, line_color=None)
                    plot.draw({config_id + ", up": hist_up}, line_color=2, options=["hist"])
                    plot.draw({config_id + ", down": hist_down}, line_color=4, options=["hist"])

        # save plots
        for plot_category in plots:
            plot = plots[plot_category]
            plot_name = self.get_plot_name(plot_category, self.shifts_identifier, self.b_taggers[0],
                self.iterations[0])
            plot.save(os.path.join(local_tmp.path, plot_name), draw_legend=True,
                lumi=self.config_inst.get_aux("lumi").values()[0] / 1000.)
            del plot

        with outp.localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file), arcname=plot_file)
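
A rough, purely illustrative sketch of what an envelope builder like the external build_hist_envelope helper might do (bin-wise quadrature sum of the up/down deviations around the nominal values); the real helper's conventions may differ, and all names below are assumptions:

    import math

    def build_envelope_sketch(nominal, up_shifted, down_shifted):
        # nominal: list of bin contents
        # up_shifted / down_shifted: dicts mapping systematic name -> list of bin contents
        down, up = [], []
        for i, nom in enumerate(nominal):
            up_var = sum((hist[i] - nom) ** 2 for hist in up_shifted.values())
            down_var = sum((hist[i] - nom) ** 2 for hist in down_shifted.values())
            up.append(nom + math.sqrt(up_var))
            down.append(nom - math.sqrt(down_var))
        return down, up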
Beispiel #12
0
class PlotVariable(PlotTask):
    b_tagger = MergeHistograms.b_tagger
    iteration = MergeHistograms.iteration

    final_it = MergeHistograms.final_it

    category_tag = luigi.Parameter(default="merged")
    variable = CSVParameter(default=["jet{i_probe_jet}_{b_tag_var}_{region}_{shift}"],
        description="Variable to plot, or multiple variables that are filled into one histogram. "
        "{} accesses auxiliary information.")
    mc_split = luigi.ChoiceParameter(choices=["process", "flavor"], default="process")
    normalize = luigi.BoolParameter(description="Normalize MC histogram to data histogram")
    truncate = luigi.BoolParameter(description="Truncate the bin below zero, to be used "
        "for b-tag variable plots.")
    rebin = luigi.BoolParameter(description="Rebin variable to 'measurement' binning, only "
        "for b-tag variable plots. Not usable with category-optimized binning.")
    x_title = luigi.Parameter(default="", description="Title for the plot x-axis.")

    logarithmic = luigi.BoolParameter(description="Plot y axis with logarithmic scale.")
    draw_stacked = luigi.BoolParameter(description="Plot MC processes separated by *mc_split*, "
        "combined in a stack.")
    draw_systematics = luigi.BoolParameter(description="Draw envelope of systematic uncertainties.")

    mc_key = "mc"
    data_key = "data"

    def __init__(self, *args, **kwargs):
        super(PlotVariable, self).__init__(*args, **kwargs)
        if self.draw_systematics:
            self.shifts = [shift for shift in MeasureScaleFactors.shifts if shift not in jes_total_shifts]
            if self.final_it:
                self.shifts += MeasureCScaleFactors.shifts
        else:
            self.shifts = ["nominal"]

    def requires(self):
        reqs = {}

        reqs["hists"] = MergeHistograms.req(self, branch=0, version=self.get_version(MergeHistograms),
            _prefer_cli=["version"])

        if self.normalize:
            reqs["scale"] = MeasureScaleFactors.req(self, iteration=0,
                version=self.get_version(MeasureScaleFactors), _prefer_cli=["version"])

        return reqs

    def associate_hist(self, process=None, flavor=None, region=None):
        # associate hist either to data or monte carlo
        # returns *add_to_data*, *sign* (1. or -1.)
        if process.is_data:
            return True, 1.
        else:
            return False, 1.

    def run(self):
        def add_hist(hist, new_hist, sign=1.):
            if hist is None:
                hist = new_hist.Clone()
                hist.Scale(sign)
            else:
                hist.Add(new_hist, sign)
            return hist

        import ROOT

        ROOT.PyConfig.IgnoreCommandLineOptions = True
        ROOT.gROOT.SetBatch()

        inp = self.input()
        outp = self.output()

        if self.normalize:
            scales = inp["scale"]["channel_scales"].load()

        local_tmp = LocalDirectoryTarget(is_tmp=True)
        local_tmp.touch()

        categories = []
        for category, _, _ in self.config_inst.walk_categories():
            if category.has_tag((self.category_tag, self.b_tagger), mode=all):
                categories.append(category)

        # create plot objects
        plot_dict = {}
        for category in categories:
            plot = ROOTPlot(category.name, category.name)
            plot.create_pads(n_pads_y=2, limits_y=[0., 0.3, 1.0], legend_loc="upper")
            plot_dict[category] = plot

        with inp["hists"].load("r") as input_file:
            for category in categories:
                data_hist = None
                mc_hists = defaultdict(lambda: defaultdict(lambda: None)) # shift -> key (process/flavor)

                for leaf_cat, _, children in category.walk_categories():
                    # we are only interested in leaves
                    if children:
                        continue

                    flavor = leaf_cat.get_aux("flavor", None)
                    channel = leaf_cat.get_aux("channel")
                    region = leaf_cat.get_aux("region", None)

                    category_dir = input_file.GetDirectory(leaf_cat.name)
                    for process_key in category_dir.GetListOfKeys():
                        process = self.config_inst.get_process(process_key.GetName())
                        process_dir = category_dir.GetDirectory(process.name)

                        # avoid double counting of inclusive and flavor-dependent histograms
                        if flavor is not None:  # Not needed in case region isn't flavor specific
                            if process.is_data and flavor != "inclusive":
                                continue
                            elif process.is_mc and flavor == "inclusive":
                                continue

                        for shift in self.shifts:
                            if process.is_data and shift != "nominal":
                                continue
                            for variable in self.variable:
                                # create variable name from template
                                aux = leaf_cat.aux.copy()
                                aux["b_tag_var"] = self.config_inst.get_aux("btaggers")[self.b_tagger]["variable"]
                                aux["b_tagger"] = self.b_tagger
                                aux["shift"] = shift
                                variable = variable.format(**aux)

                                hist = process_dir.Get(variable)

                                binning_type = "measurement" if self.rebin else None
                                hist = self.rebin_hist(hist, region, binning_type=binning_type, truncate=self.truncate)

                                add_to_data, sign = self.associate_hist(process=process, flavor=flavor, region=region)
                                if add_to_data:
                                    if shift != "nominal":
                                        raise Exception("Cannot add shifted samples to data.")
                                    data_hist = add_hist(data_hist, hist, sign=sign)
                                else:
                                    if self.normalize and region is not None:  # apply "trigger" sfs as part of the normalization
                                        hist.Scale(scales[channel.name][region])

                                    key = process if self.mc_split == "process" else flavor
                                    mc_hists[shift][key] = add_hist(mc_hists[shift][key], hist, sign=sign)

                if self.normalize:  # normalize mc yield to data in this category
                    mc_yield = sum(hist.Integral() for hist in mc_hists["nominal"].values())
                    data_yield = data_hist.Integral()
                    norm_factor = data_yield / mc_yield
                    for shift in self.shifts:
                        for mc_hist in mc_hists[shift].values():
                            mc_hist.Scale(norm_factor)

                # get maximum value of hists/ stacks drawn to set axis ranges
                mc_hist_sum = mc_hists["nominal"].values()[0].Clone()

                for mc_hist in mc_hists["nominal"].values()[1:]:
                    mc_hist_sum.Add(mc_hist)
                hist_maximum = max([mc_hist_sum.GetMaximum(), data_hist.GetMaximum()])

                # get plot names
                mc_key = self.mc_key.format(**{"region": category.get_aux("region", None)})
                data_key = self.data_key.format(**{"region": category.get_aux("region", None)})

                plot = plot_dict[category]
                # data and mc histograms
                plot.cd(0, 1)
                if self.draw_stacked:
                    plot.draw(mc_hists["nominal"], stacked=True, stack_maximum=1.5*hist_maximum, y_title="Entries")
                else:
                    # fix axis range
                    invis_hist = mc_hist_sum.Clone() if mc_hist_sum.GetMaximum() > data_hist.GetMaximum() else data_hist.Clone()
                    invis_hist.Scale(1.5)
                    plot.draw({"invis": invis_hist}, invis=True)
                    plot.draw({mc_key: mc_hist_sum}, line_color=None)
                plot.draw({data_key: data_hist})

                if self.draw_systematics:
                    up_shifted_mc_hists = {}
                    down_shifted_mc_hists = {}
                    for shift in self.shifts:
                        # combine processes/ flavors
                        shifted_mc_hist_sum = mc_hists[shift].values()[0].Clone()
                        for mc_hist in mc_hists[shift].values()[1:]:
                            shifted_mc_hist_sum.Add(mc_hist)

                        if shift.endswith("_down"):
                            down_shifted_mc_hists[shift[:-5]] = shifted_mc_hist_sum.Clone()
                        elif shift.endswith("_up"):
                            up_shifted_mc_hists[shift[:-3]] = shifted_mc_hist_sum.Clone()

                    envelope = build_hist_envelope(mc_hist_sum, up_shifted_mc_hists,
                        down_shifted_mc_hists, envelope_as_errors=True)

                    plot.draw_as_graph(envelope, options="2", hatched=True)

                # add category information to plot
                pt_range = category.get_aux("pt", None)
                eta_range = category.get_aux("eta", None)
                if pt_range is not None and eta_range is not None:
                    if not np.isinf(pt_range[1]):
                        text = r"#splitline{%d < p_{T} < %d}{%.1f < |#eta| < %.1f}" % \
                            (pt_range[0], pt_range[1], eta_range[0], eta_range[1])
                    else:
                        text = r"#splitline{p_{T} > %d}{%.1f < |#eta| < %.1f}" % \
                            (pt_range[0], eta_range[0], eta_range[1])
                    plot.draw_text(text, size=0.05, xpos=0.505, ypos=0.5, align=11)

                # ratio of data to mc below the main plot
                plot.cd(0, 0)

                # ratio histograms
                # mc error band
                ratio_mcerr_hist = mc_hist_sum.Clone()
                # divide without error propagation
                self.divide_hists(ratio_mcerr_hist, mc_hist_sum)

                # ratio
                ratio_hist = data_hist.Clone()
                self.divide_hists(ratio_hist, mc_hist_sum)

                y_axis = ratio_hist.GetYaxis()
                y_axis.SetRangeUser(0.5, 1.5)
                y_axis.SetTitle("data/MC")
                y_axis.SetTitleSize(y_axis.GetTitleSize() * plot.open_pad.scale_factor)
                y_axis.SetLabelSize(y_axis.GetLabelSize() * plot.open_pad.scale_factor)
                y_axis.SetNdivisions(505)
                y_axis.SetTitleOffset(0.65)

                x_axis = ratio_hist.GetXaxis()
                if self.x_title:
                    aux = category.aux.copy()
                    aux["b_tag_var"] = self.config_inst.get_aux("btaggers")[self.b_tagger]["label"]
                    x_axis.SetTitle(self.x_title.format(**aux))

                x_axis.SetTitleSize(x_axis.GetTitleSize() * plot.open_pad.scale_factor)
                x_axis.SetLabelSize(x_axis.GetLabelSize() * plot.open_pad.scale_factor)

                plot.draw({"invis": ratio_hist}, invis=True)
                plot.draw_as_graph(ratio_mcerr_hist, options="2")
                plot.draw({"data/mc": ratio_hist})

                if self.draw_systematics:
                    # build envelope of ratio to nominal hist
                    for hist in up_shifted_mc_hists.values():
                        hist.Divide(mc_hist_sum)
                    for hist in down_shifted_mc_hists.values():
                        hist.Divide(mc_hist_sum)
                    scaled_envelope = build_hist_envelope(ratio_mcerr_hist, up_shifted_mc_hists,
                        down_shifted_mc_hists, envelope_as_errors=True)
                    plot.draw_as_graph(scaled_envelope, options="2", hatched=True)

        for category, plot in plot_dict.items():
            plot_name = self.get_plot_name(category.name, self.variable, self.b_tagger, self.iteration)
            plot.save(os.path.join(local_tmp.path, plot_name),
                draw_legend=(False, True), log_y=self.logarithmic,
                lumi=self.config_inst.get_aux("lumi").values()[0]/1000.)
            del plot

        with outp.localize("w") as tmp:
            with tarfile.open(tmp.path, "w:gz") as tar:
                for plot_file in os.listdir(local_tmp.path):
                    tar.add(os.path.join(local_tmp.path, plot_file), arcname=plot_file)
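
The associate_hist hook above decides whether a histogram contributes to the data or to the MC sum, and with which sign. A hypothetical subclass could use it to subtract a minor background from the data histogram instead of stacking it; the process name is illustrative, and this only makes sense when plotting nominal shifts:

    class PlotVariableSubtracted(PlotVariable):
        def associate_hist(self, process=None, flavor=None, region=None):
            if process.is_data:
                return True, 1.
            if process.name == "some_minor_background":  # hypothetical process name
                # subtract this contribution from the data histogram
                return True, -1.
            return False, 1.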
Beispiel #13
0
class WriteHistograms(DatasetTask, AnalysisSandboxTask, GridWorkflow,
                      law.LocalWorkflow):

    iteration = luigi.IntParameter(default=0,
                                   description="iteration of the scale factor "
                                   "calculation, starting at zero, default: 0")
    final_it = luigi.BoolParameter(
        description="Flag for the final iteration of the scale factor "
        "calculation.")
    variable_tags = CSVParameter(
        default=[],
        description="Only consider variables with one or more of "
        "the given tags. Use all if empty.")
    category_tags = CSVParameter(
        default=[],
        description="Only consider categories whose top-level "
        "category has one or more of the given tags. Use all if empty.")
    used_shifts = CSVParameter(
        default=[]
    )  # needs to be named differently from the wrapper task parameter
    binning = CSVParameter(
        default=[],
        cls=luigi.FloatParameter,
        description="Overwrite default binning "
        "of variables. If exactly three values are provided, they are interpreted as a tuple of (n_bins, min, max)."
    )

    b_tagger = luigi.Parameter(default="deepcsv",
                               description="Name of the b-tagger to use.")
    optimize_binning = luigi.BoolParameter(
        description="Use optimized discriminant binning.")

    file_merging = "trees"

    workflow_run_decorators = [law.decorator.notify]

    sandbox = "singularity::/cvmfs/singularity.opensciencegrid.org/cmssw/cms:rhel7-m20200612"
    req_sandbox = "slc7"

    def __init__(self, *args, **kwargs):
        super(WriteHistograms, self).__init__(*args, **kwargs)
        # set shifts
        if self.dataset_inst.is_data:
            shifts = {"nominal"}
        else:
            jes_sources = self.config_inst.get_aux("jes_sources_{}".format(
                self.config_inst.get_aux("jes_scheme")))
            shifts = {"nominal"} | format_shifts(jes_sources, prefix="jes")
            if self.iteration > 0:
                shifts = shifts | format_shifts([
                    "lf", "hf", "lf_stats1", "lf_stats2", "hf_stats1",
                    "hf_stats2"
                ])
                if self.final_it:  # add c shifts
                    shifts = shifts | format_shifts(["c_stats1", "c_stats2"])

        if len(self.used_shifts) == 0:
            self.shifts = shifts
        elif any([shift not in shifts for shift in self.used_shifts]):
            raise ValueError("Unknown shift in {}".format(self.used_shifts))
        else:
            self.shifts = self.used_shifts

    def workflow_requires(self):
        from analysis.tasks.measurement import BundleScaleFactors

        reqs = super(WriteHistograms, self).workflow_requires()

        if not self.cancel_jobs and not self.cleanup_jobs:
            reqs["meta"] = MergeMetaData.req(
                self,
                version=self.get_version(MergeMetaData),
                _prefer_cli=["version"])
            if self.dataset_inst.is_mc:
                reqs["pu"] = CalculatePileupWeights.req(self)
            if not self.pilot:
                reqs["tree"] = MergeTrees.req(
                    self,
                    cascade_tree=-1,
                    version=self.get_version(MergeTrees),
                    _prefer_cli=["version"])
            if self.iteration > 0:
                reqs["sf"] = BundleScaleFactors.req(
                    self,
                    iteration=self.iteration - 1,
                    fix_normalization=self.final_it,
                    include_cshifts=self.final_it,
                    version=self.get_version(BundleScaleFactors),
                    _prefer_cli=["version"])
            if self.optimize_binning:
                from analysis.tasks.util import OptimizeBinning  # prevent circular import
                reqs["binning"] = OptimizeBinning.req(
                    self,
                    version=self.get_version(OptimizeBinning),
                    _prefer_cli=["version"])

        return reqs

    def requires(self):
        from analysis.tasks.measurement import BundleScaleFactors

        reqs = {
            "tree":
            MergeTrees.req(self,
                           cascade_tree=self.branch,
                           branch=0,
                           version=self.get_version(MergeTrees),
                           _prefer_cli=["version", "workflow"]),
            "meta":
            MergeMetaData.req(self,
                              version=self.get_version(MergeMetaData),
                              _prefer_cli=["version"]),
        }
        if self.dataset_inst.is_mc:
            reqs["pu"] = CalculatePileupWeights.req(self)
        if self.iteration > 0:
            reqs["sf"] = BundleScaleFactors.req(
                self,
                iteration=self.iteration - 1,
                fix_normalization=self.final_it,
                include_cshifts=self.final_it,
                version=self.get_version(BundleScaleFactors),
                _prefer_cli=["version"])
        if self.optimize_binning:
            from analysis.tasks.util import OptimizeBinning  # prevent circular import
            reqs["binning"] = OptimizeBinning.req(
                self,
                version=self.get_version(OptimizeBinning),
                _prefer_cli=["version"])
        return reqs

    def store_parts(self):
        binning_part = "optimized" if self.optimize_binning else "default"
        variable_part = "_".join(
            self.variable_tags) if self.variable_tags else "all"
        shift_part = "_".join(self.used_shifts) if self.used_shifts else "all"
        return super(WriteHistograms, self).store_parts() + (self.b_tagger,) + (self.iteration,) \
            + (variable_part,) + (shift_part,) + (binning_part,)

    def output(self):
        return self.wlcg_target("hists_{}.root".format(self.branch))

    def get_jec_identifier(self, shift):
        if shift.startswith("jes"):
            return "_" + shift
        else:
            return ""

    def get_pileup_weighter(self, inp):
        with inp.load() as pu_file:
            pu_hist = pu_file.Get("pileup_weights")
            pu_values = [
                pu_hist.GetBinContent(i) for i in range(1, pu_hist.GetNbinsX() + 1)
            ]
            # TODO: Temporary, due to high pu weights in 2018 data
            pu_values = [value if (value < 1000) else 1. for value in pu_values]

        def add_branch(extender):
            extender.add_branch("pu_weight", unpack="pu")

        def add_value(entry):
            # some events have inf pileup, skip them
            weight = 1.
            pu = entry.pu[0]
            if np.isfinite(pu):
                pu_idx = int(pu) - 1
                if 0 <= pu_idx < len(pu_values):
                    weight = pu_values[pu_idx]
            entry.pu_weight[0] = weight

        return add_branch, add_value

    def get_scale_factor_weighter(self, inp, shift, nominal_sfs=None):
        sf_hists = {}
        input_files = [inp]
        # c scale factor files have no histograms for hf/lf, so use nominal ones
        if nominal_sfs is not None:
            input_files.append(nominal_sfs)

        for input_file in input_files:
            with input_file.load() as sfs:
                shift_dir = sfs.Get(shift)
                for category in shift_dir.GetListOfKeys():
                    category_dir = shift_dir.Get(category.GetName())
                    hist = category_dir.Get("sf")
                    # decouple from open file
                    hist.SetDirectory(0)

                    if category.GetName() not in sf_hists:
                        sf_hists[category.GetName()] = hist
                    else:
                        raise KeyError("Duplicate category {} in scale factor "
                                       "weighter.".format(category.GetName()))

        btag_var = self.config_inst.get_aux("btaggers")[
            self.b_tagger]["variable"]
        identifier = self.get_jec_identifier(shift)

        def add_branch(extender):
            unpack_vars = sum([[
                "jet{}_pt{}".format(idx, identifier),
                "jet{}_flavor{}".format(idx, identifier),
                "jet{}_eta{}".format(idx, identifier),
                "jet{}_{}{}".format(idx, btag_var, identifier),
            ] for idx in range(1, 5)], [])
            extender.add_branch("scale_factor_lf_{}".format(shift),
                                unpack=unpack_vars)
            extender.add_branch("scale_factor_c_{}".format(shift),
                                unpack=unpack_vars)
            extender.add_branch("scale_factor_hf_{}".format(shift),
                                unpack=unpack_vars)

        def add_value(entry):
            scale_factor_lf = 1.
            scale_factor_c = 1.
            scale_factor_hf = 1.
            for jet_idx in range(1, 5):
                jet_pt = getattr(entry,
                                 "jet{}_pt{}".format(jet_idx, identifier))[0]
                jet_eta = getattr(entry,
                                  "jet{}_eta{}".format(jet_idx, identifier))[0]
                jet_flavor = getattr(
                    entry, "jet{}_flavor{}".format(jet_idx, identifier))[0]
                jet_btag = getattr(
                    entry, "jet{}_{}{}".format(jet_idx, btag_var,
                                               identifier))[0]

                # stop when number of jets is exceeded
                if jet_flavor < -999.:
                    break

                # find category in which the scale factor of the jet was computed to get correct histogram
                if abs(jet_flavor) == 5:
                    region = "hf"
                elif abs(jet_flavor) == 4:
                    region = "c"
                else:
                    region = "lf"

                # nominal c scale factors are 1
                if region == "c" and not shift.startswith("c_stat"):
                    continue

                category = self.category_getter.get_category(
                    jet_pt, abs(jet_eta), region)

                # get scale factor
                sf_hist = sf_hists[category.name]
                bin_idx = sf_hist.FindBin(jet_btag)
                scale_factor = sf_hist.GetBinContent(bin_idx)
                scale_factor = max([0., scale_factor])

                if abs(jet_flavor) == 5:
                    scale_factor_hf *= scale_factor
                elif abs(jet_flavor) == 4:
                    scale_factor_c *= scale_factor
                else:
                    scale_factor_lf *= scale_factor

            getattr(entry, "scale_factor_lf_{}".format(shift))[0] = scale_factor_lf
            getattr(entry, "scale_factor_c_{}".format(shift))[0] = scale_factor_c
            getattr(entry, "scale_factor_hf_{}".format(shift))[0] = scale_factor_hf

        return add_branch, add_value

    @law.decorator.notify
    def run(self):
        import ROOT

        inp = self.input()
        outp = self.output()
        outp.parent.touch(0o0770)

        self.category_getter = CategoryGetter(self.config_inst, self.b_tagger)

        # get child categories
        categories = []

        for category in self.config_inst.categories:
            # only consider top-level categories with at least one given tag if specified
            if len(self.category_tags) > 0 and not category.has_tag(
                    self.category_tags, mode=any):
                continue
            # for intermediate iterations, skip merged categories not used for measurement
            # (to improve performance)
            if not self.final_it:
                if category.has_tag("merged") and not category.get_aux(
                        "phase_space") == "measure":
                    continue
            # recurse through all children of category, add leaf categories
            for cat, children in walk_categories(category):
                if not children:
                    # only use categories matching the task config
                    if cat.get_aux("config", None) != self.config_inst.name:
                        continue
                    # only use categories for the chosen b-tag algorithm
                    if cat.has_tag(self.b_tagger):
                        channel = cat.get_aux("channel")
                        categories.append((channel, cat))

        categories = list(set(categories))

        # get processes
        if len(self.dataset_inst.processes) != 1:
            raise NotImplementedError(
                "only datasets with exactly one linked process can be"
                " handled, got {}".format(len(self.dataset_inst.processes)))
        processes = list(self.dataset_inst.processes.values())

        # build a progress callback
        progress = self.create_progress_callback(len(categories))

        # open the output file
        with outp.localize("w") as tmp:
            with tmp.dump("RECREATE") as output_file:
                with self.publish_step(
                        "creating root output file directories ..."):
                    process_dirs = {}
                    for _, category in categories:
                        output_file.cd()
                        category_dir = output_file.mkdir(category.name)
                        for process in processes:
                            category_dir.cd()
                            process_dir = category_dir.mkdir(process.name)
                            process_dir.Write()
                            process_dirs[(category.name,
                                          process.name)] = process_dir

                # open the input file and get the tree
                # as we need to extend the tree with custom weights, we do not cache the file
                with inp["tree"].load("UPDATE", cache=False) as input_file:
                    tree = input_file.Get("tree")
                    self.publish_message("{} events in tree".format(
                        tree.GetEntries()))

                    # identifier for jec shifted variables
                    for shift in self.shifts:
                        jec_identifier = self.get_jec_identifier(shift)

                        # pt aliases for jets
                        for obj in ["jet1", "jet2", "jet3", "jet4"]:
                            tree.SetAlias(
                                "{0}_pt{1}".format(obj, jec_identifier),
                                "({0}_px{1}**2 + {0}_py{1}**2)**0.5".format(
                                    obj, jec_identifier))
                        # b-tagging alias
                        btag_var = self.config_inst.get_aux("btaggers")[
                            self.b_tagger]["variable"]
                        for obj in ["jet1", "jet2", "jet3", "jet4"]:
                            variable = self.config_inst.get_variable(
                                "{0}_{1}".format(obj, btag_var))
                            tree.SetAlias(
                                variable.name + jec_identifier,
                                variable.expression.format(
                                    **{"jec_identifier": jec_identifier}))
                    # pt aliases for leptons
                    for obj in ["lep1", "lep2"]:
                        tree.SetAlias(
                            "{0}_pt".format(obj),
                            "({0}_px**2 + {0}_py**2)**0.5".format(obj))

                    # extend the tree
                    if self.dataset_inst.is_mc:
                        with self.publish_step(
                                "extending the input tree with weights ..."):
                            weighters = []

                            # pileup weight
                            weighters.append(
                                self.get_pileup_weighter(inp["pu"]))

                            # weights from previous iterations
                            if self.iteration > 0:
                                # b-tagging scale factors
                                for shift in self.shifts:
                                    nominal_sfs = inp["sf"]["nominal"]["sf"] if shift.startswith("c_stat") \
                                        else None
                                    weighters.append(
                                        self.get_scale_factor_weighter(
                                            inp["sf"],
                                            shift,
                                            nominal_sfs=nominal_sfs))

                            input_file.cd()
                            with TreeExtender(tree) as te:
                                for add_branch, _ in weighters:
                                    add_branch(te)
                                for i, entry in enumerate(te):
                                    if (i % 1000) == 0:
                                        print "event {}".format(i)
                                    for _, add_value in weighters:
                                        add_value(entry)

                        # read in total number of events
                        sum_weights = inp["meta"].load()["event_weights"]["sum"]

                    # get category-dependent binning if optimized binning is used
                    # only for b-tagging discriminants
                    if self.optimize_binning:
                        category_binnings = inp["binning"].load()

                    for i, (channel, category) in enumerate(categories):
                        self.publish_message(
                            "writing histograms in category {} ({}/{})".format(
                                category.name, i + 1, len(categories)))

                        # get the region (HF / LF)
                        # not all child categories have regions associated, e.g. the phase space
                        # inclusive regions ("measure", "closure")
                        region = category.get_aux("region", None)

                        # set weights that are common for all shifts
                        base_weights = []
                        if self.dataset_inst.is_mc:
                            base_weights.append("gen_weight")
                            # lumi weight
                            lumi = self.config_inst.get_aux("lumi")[channel]
                            # the dataset is guaranteed to have exactly one linked process (see above)
                            x_sec = processes[0].get_xsec(self.config_inst.campaign.ecm).nominal
                            lumi_weight = lumi * x_sec / sum_weights
                            base_weights.append(str(lumi_weight))

                            # pu weight
                            base_weights.append("pu_weight")

                        for process in processes:
                            # change into the correct directory
                            process_dirs[(category.name, process.name)].cd()
                            for shift in self.shifts:
                                jec_identifier = self.get_jec_identifier(shift)

                                # weights
                                weights = base_weights[:]
                                if self.dataset_inst.is_mc:
                                    # channel scale weight
                                    if self.iteration > 0:
                                        # b-tag scale factor weights
                                        phase_space = category.get_aux("phase_space", None)
                                        # in measurement categories,
                                        # apply scale factors only for contamination
                                        if phase_space == "measure" and not self.final_it:
                                            weights.append("scale_factor_c_{}".format(shift))
                                            if region == "hf":
                                                weights.append("scale_factor_lf_{}".format(shift))
                                            elif region == "lf":
                                                weights.append("scale_factor_hf_{}".format(shift))
                                            elif region == "cont":
                                                weights.append("scale_factor_lf_{}".format(shift))
                                                weights.append("scale_factor_hf_{}".format(shift))
                                            else:
                                                raise ValueError(
                                                    "Unexpected region {}".format(region))
                                        else:
                                            weights.append("scale_factor_lf_{}".format(shift))
                                            weights.append("scale_factor_c_{}".format(shift))
                                            weights.append("scale_factor_hf_{}".format(shift))

                                # totalWeight alias
                                while len(weights) < 2:
                                    weights.insert(0, "1")
                                tree.SetAlias(
                                    "totalWeight",
                                    join_root_selection(weights, op="*"))
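                                # schematically, the resulting alias is a plain product expression,
                                # e.g. "gen_weight * 0.0123 * pu_weight * scale_factor_lf_nominal * ..."
                                # (weight names and the numeric lumi weight are purely illustrative)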

                                # actual projecting
                                for variable in self.config_inst.variables:
                                    # save variable binning to reset at end of loop
                                    base_variable_binning = variable.binning

                                    if variable.has_tag("skip_all"):
                                        continue
                                    if region and variable.has_tag(
                                            "skip_{}".format(region)):
                                        continue
                                    # if variable tags are given, require at least one
                                    if self.variable_tags and not variable.has_tag(
                                            self.variable_tags, mode=any):
                                        continue
                                    # do not write one b-tag discriminant in the category of another
                                    if variable.get_aux("b_tagger", self.b_tagger) != self.b_tagger:
                                        continue

                                    # if number of bins is specified, overwrite variable binning
                                    if self.binning:
                                        self.binning = list(self.binning)
                                        # if a tuple of (n_bins, x_min, x_max) is given, ensure that n_bins is an integer
                                        if len(self.binning) == 3:
                                            self.binning[0] = int(
                                                self.binning[0])
                                            self.binning = tuple(self.binning)

                                        variable.binning = self.binning

                                    # use optimized binning for b-tag discriminants if provided
                                    if self.optimize_binning and variable.get_aux(
                                            "can_optimize_bins", False):
                                        binning_category = category.get_aux(
                                            "binning_category", category)
                                        # overwrite binning if specialized binning is defined for this category
                                        variable.binning = category_binnings.get(
                                            binning_category.name,
                                            variable.binning)

                                    hist = ROOT.TH1F(
                                        "{}_{}".format(variable.name, shift),
                                        variable.full_title(root=True),
                                        variable.n_bins,
                                        array.array("d", variable.bin_edges))
                                    hist.Sumw2()

                                    # build the full selection string, including the total event weight
                                    selection = [
                                        category.selection,
                                        "jetmet_pass{jec_identifier} == 1",
                                        "{} != -10000".format(
                                            variable.expression),
                                    ]
                                    if variable.selection:
                                        selection.append(variable.selection)
                                    selection = join_root_selection(selection).format(
                                        **{"jec_identifier": jec_identifier})
                                    selection = join_root_selection(
                                        selection, "totalWeight", op="*")

                                    # project and write the histogram
                                    tree.Project(
                                        "{}_{}".format(variable.name, shift),
                                        variable.expression.format(
                                            **{"jec_identifier": jec_identifier}),
                                        selection)
                                    hist.Write()
                                    variable.binning = base_variable_binning

                        progress(i)