Beispiel #1
0
    def write(self, filename, name='dax'):
        """Generate Pegasus abstract workflow (DAX).

        Parameters
        ----------
        filename : `str`
            File to write the DAX to.
        name : `str`, optional
            Name of the DAX.

        Returns
        -------
        `Pegasus.ADAG`
            Abstract workflow used by Pegasus' planner.
        """
        dax = ADAG(name)

        # Add files to DAX-level replica catalog.
        catalog = {}
        for file_id in self.files:
            attrs = self.graph.node[file_id]
            f = File(attrs['lfn'])

            # Add physical file names, if any.
            urls = attrs.get('urls')
            if urls is not None:
                sites = attrs.get('sites')
                if sites is None:
                    sites = ','.join(len(urls) * ['local'])
                for url, site in zip(urls.split(','), sites.split(',')):
                    f.addPFN(PFN(url, site))

            catalog[attrs['lfn']] = f
            dax.addFile(f)

        # Add jobs to the DAX.
        for task_id in self.tasks:
            attrs = self.graph.node[task_id]
            job = Job(name=attrs['name'], id=task_id)

            # Add job command line arguments replacing any file name with
            # respective Pegasus file object.
            args = attrs.get('args')
            if args is not None and args:
                args = args.split()
                lfns = list(set(catalog) & set(args))
                if lfns:
                    indices = [args.index(lfn) for lfn in lfns]
                    for idx, lfn in zip(indices, lfns):
                        args[idx] = catalog[lfn]
                job.addArguments(*args)

            # Specify job's inputs.
            inputs = [file_id for file_id in self.graph.predecessors(task_id)]
            for file_id in inputs:
                attrs = self.graph.node[file_id]
                f = catalog[attrs['lfn']]
                job.uses(f, link=Link.INPUT)

            # Specify job's outputs
            outputs = [file_id for file_id in self.graph.successors(task_id)]
            for file_id in outputs:
                attrs = self.graph.node[file_id]
                f = catalog[attrs['lfn']]
                job.uses(f, link=Link.OUTPUT)

                streams = attrs.get('streams')
                if streams is not None:
                    if streams & 1 != 0:
                        job.setStdout(f)
                    if streams & 2 != 0:
                        job.setStderr(f)

            dax.addJob(job)

        # Add job dependencies to the DAX.
        for task_id in self.tasks:
            parents = set()
            for file_id in self.graph.predecessors(task_id):
                parents.update(self.graph.predecessors(file_id))
            for parent_id in parents:
                dax.depends(parent=dax.getJob(parent_id),
                            child=dax.getJob(task_id))

        # Finally, write down the workflow in DAX format.
        with open(filename, 'w') as f:
            dax.writeXML(f)
Beispiel #2
0
    def write_dax(self, filename='workflow.dax', name='workflow'):
        """Generate Pegasus abstract workflow (DAX).

        Parameters
        ----------
        filename : `str`
            File to write the DAX to.
        name : `str`, optional
            Name of the DAX.

        Returns
        -------
        `Pegasus.ADAG`
            Abstract workflow used by Pegasus' planner.

        Raises
        ------
        `ValueError`
            If either task or file node is missing mandatory attribute.
        """
        dax = ADAG(name)

        # Process file nodes.
        for file_id in self.files:
            attrs = self.graph.node[file_id]
            try:
                name = attrs['lfn']
            except KeyError:
                msg = 'Mandatory attribute "%s" is missing.'
                raise AttributeError(msg.format('lfn'))
            file_ = File(name)

            # Add physical file names, if any.
            urls = attrs.get('pfn')
            if urls is not None:
                urls = urls.split(',')
                sites = attrs.get('sites')
                if sites is None:
                    sites = len(urls) * ['condorpool']
                for url, site in zip(urls, sites):
                    file_.addPFN(PFN(url, site))

            self.catalog[attrs['lfn']] = file_

        # Add jobs to the DAX.
        for task_id in self.tasks:
            attrs = self.graph.node[task_id]
            try:
                name = attrs['exec_name']
            except KeyError:
                msg = 'Mandatory attribute "%s" is missing.'
                raise AttributeError(msg.format('exec_name'))
            label = '{name}_{id}'.format(name=name, id=task_id)
            job = Job(name, id=task_id, node_label=label)

            # Add job command line arguments replacing any file name with
            # respective Pegasus file object.
            args = attrs.get('exec_args', [])
            if args:
                args = args.split()
                lfns = list(set(self.catalog) & set(args))
                if lfns:
                    indices = [args.index(lfn) for lfn in lfns]
                    for idx, lfn in zip(indices, lfns):
                        args[idx] = self.catalog[lfn]
                job.addArguments(*args)

            # Specify job's inputs.
            inputs = [file_id for file_id in self.graph.predecessors(task_id)]
            for file_id in inputs:
                attrs = self.graph.node[file_id]
                is_ignored = attrs.get('ignore', False)
                if not is_ignored:
                    file_ = self.catalog[attrs['lfn']]
                    job.uses(file_, link=Link.INPUT)

            # Specify job's outputs
            outputs = [file_id for file_id in self.graph.successors(task_id)]
            for file_id in outputs:
                attrs = self.graph.node[file_id]
                is_ignored = attrs.get('ignore', False)
                if not is_ignored:
                    file_ = self.catalog[attrs['lfn']]
                    job.uses(file_, link=Link.OUTPUT)

                    streams = attrs.get('streams')
                    if streams is not None:
                        if streams & 1 != 0:
                            job.setStdout(file_)
                        if streams & 2 != 0:
                            job.setStderr(file_)

            # Provide default files to store stderr and stdout, if not
            # specified explicitly.
            if job.stderr is None:
                file_ = File('{name}.out'.format(name=label))
                job.uses(file_, link=Link.OUTPUT)
                job.setStderr(file_)
            if job.stdout is None:
                file_ = File('{name}.err'.format(name=label))
                job.uses(file_, link=Link.OUTPUT)
                job.setStdout(file_)

            dax.addJob(job)

        # Add job dependencies to the DAX.
        for task_id in self.tasks:
            parents = set()
            for file_id in self.graph.predecessors(task_id):
                parents.update(self.graph.predecessors(file_id))
            for parent_id in parents:
                dax.depends(parent=dax.getJob(parent_id),
                            child=dax.getJob(task_id))

        # Finally, write down the workflow in DAX format.
        with open(filename, 'w') as f:
            dax.writeXML(f)