Example #1
0
def path_to_pegasus_file(path: Path,
                         *,
                         site: str = "local",
                         name: Optional[str] = None,
                         is_raw_input: bool = False) -> File:
    """
    Build a Pegasus `File` for *path*, usable as a Job input or output.

    If the resource does not live on the local machine, pass the *site*
    string (e.g. "saga" when running on a cluster).

    Args:
        path: path to the file on disk.
        site: Pegasus site handle for the physical file name; defaults
            to "local".
        name: explicit logical file name; when omitted, one is derived
            from the absolute path with "/" replaced by "-".
        is_raw_input: True when the file is NOT produced by another job
            in the workflow, so its physical location can be registered
            in the Pegasus DAX up front.

    Returns:
        Pegasus File for the given path.
    """
    if name:
        logical_name = name
    else:
        logical_name = str(path.absolute()).replace("/", "-")
    pegasus_file = File(logical_name)
    # Only raw inputs get a PFN here; intermediate files are located by
    # Pegasus from the producing job's output.
    if is_raw_input:
        pegasus_file.addPFN(path_to_pfn(path, site=site))
    return pegasus_file
Example #2
0
def registerFile(workflow, filename):
    """
    2011.12.13
        Register an arbitrary file with the workflow, attaching a
        file:// PFN hosted on workflow.input_site_handler.
    """
    pegasus_file = File(os.path.basename(filename))
    url = "file://" + os.path.abspath(filename)
    pegasus_file.addPFN(PFN(url, workflow.input_site_handler))
    workflow.addFile(pegasus_file)
    return pegasus_file
Example #3
0
 def createFile (self, fileName, fileURL=None, site=None):
     """
     Return the Pegasus File registered under *fileName*, creating it
     (with a PFN) and caching it in self.files on first use.

     Args:
         fileName: logical file name to look up or create.
         fileURL: physical URL for the file. Defaults to a file:// URL
             under the WMS output directory. May also be a non-empty
             sequence of URLs, in which case only the first is used.
         site: site handle for the PFN; defaults to "local".
     """
     #traceback.print_stack ()
     file = self.getFile (fileName)
     if not file:
         file = File (fileName)
         if not fileURL:
             # Default to a file:// URL inside the WMS output directory.
             fileURL = "file://%s/%s" % (self.wms.getOutputDir (), fileName)
             logger.debug ("fileurl: %s", fileURL)
         if not site:
             site = "local"
         # A non-string fileURL (e.g. a list of URLs) collapses to its
         # first entry. NOTE(review): `basestring` implies Python 2.
         if not isinstance(fileURL, basestring) and len (fileURL) > 0:
             fileURL = fileURL [0]
         logger.debug ("--add-pfn: (%s)(%s)(%s)", fileName, fileURL, site)
         pfn = PFN (fileURL, site)
         file.addPFN (pfn)
         # Cache so later createFile calls return the same object.
         self.files [fileName] = file
     return file
Example #4
0
    def write(self, filename, name='dax'):
        """Generate Pegasus abstract workflow (DAX) and write it to disk.

        Parameters
        ----------
        filename : `str`
            File to write the DAX to.
        name : `str`, optional
            Name of the DAX.

        Notes
        -----
        Despite the original docstring, this method returns `None`; the
        workflow is written in DAX (XML) format to *filename*.
        """
        dax = ADAG(name)

        # Add files to DAX-level replica catalog.
        catalog = {}
        for file_id in self.files:
            attrs = self.graph.node[file_id]
            f = File(attrs['lfn'])

            # Add physical file names, if any.
            urls = attrs.get('urls')
            if urls is not None:
                sites = attrs.get('sites')
                if sites is None:
                    # NOTE(review): `urls` is a comma-separated string here,
                    # so len(urls) counts characters, not URLs. The list is
                    # merely too long and zip() below truncates it, so every
                    # URL still gets site 'local' — works, but by accident.
                    sites = ','.join(len(urls) * ['local'])
                for url, site in zip(urls.split(','), sites.split(',')):
                    f.addPFN(PFN(url, site))

            catalog[attrs['lfn']] = f
            dax.addFile(f)

        # Add jobs to the DAX.
        for task_id in self.tasks:
            attrs = self.graph.node[task_id]
            job = Job(name=attrs['name'], id=task_id)

            # Add job command line arguments replacing any file name with
            # respective Pegasus file object.
            args = attrs.get('args')
            if args is not None and args:
                args = args.split()
                # Only arguments that exactly match a catalogued LFN are
                # replaced. NOTE(review): args.index() finds the first
                # occurrence only; a repeated LFN keeps later copies as
                # plain strings.
                lfns = list(set(catalog) & set(args))
                if lfns:
                    indices = [args.index(lfn) for lfn in lfns]
                    for idx, lfn in zip(indices, lfns):
                        args[idx] = catalog[lfn]
                job.addArguments(*args)

            # Specify job's inputs.
            inputs = [file_id for file_id in self.graph.predecessors(task_id)]
            for file_id in inputs:
                attrs = self.graph.node[file_id]
                f = catalog[attrs['lfn']]
                job.uses(f, link=Link.INPUT)

            # Specify job's outputs
            outputs = [file_id for file_id in self.graph.successors(task_id)]
            for file_id in outputs:
                attrs = self.graph.node[file_id]
                f = catalog[attrs['lfn']]
                job.uses(f, link=Link.OUTPUT)

                # 'streams' is a bitmask: bit 0 (value 1) routes the file
                # to the job's stdout, bit 1 (value 2) to its stderr.
                streams = attrs.get('streams')
                if streams is not None:
                    if streams & 1 != 0:
                        job.setStdout(f)
                    if streams & 2 != 0:
                        job.setStderr(f)

            dax.addJob(job)

        # Add job dependencies to the DAX.
        # A task's parents are the producers of its input files
        # (task -> file -> task edges in the graph).
        for task_id in self.tasks:
            parents = set()
            for file_id in self.graph.predecessors(task_id):
                parents.update(self.graph.predecessors(file_id))
            for parent_id in parents:
                dax.depends(parent=dax.getJob(parent_id),
                            child=dax.getJob(task_id))

        # Finally, write down the workflow in DAX format.
        with open(filename, 'w') as f:
            dax.writeXML(f)
Example #5
0
# NOTE(review): this chunk is truncated at both ends — the `})` below
# closes a ConfigParser(...) defaults dict begun above the visible
# region, and the for-loop at the bottom is cut off mid-body (compare
# the fuller variant of this script elsewhere in the file).
})
config.read(sys.argv[2] + '/test.config')

# Create an abstract dag
cluster = ADAG(config.get('all', 'workflow_name'))

# Resolve the input directory: fall back to the CWD when the config
# leaves 'input_file' empty, otherwise append '<USER>/inputs'.
input_file = config.get('all', 'input_file')
if (input_file == ''):
    input_file = os.getcwd()
else:
    input_file += '/' + os.getenv('USER') + '/inputs'

# Add input file to the DAX-level replica catalog
a = File("f.a")
a.addPFN(
    PFN(
        config.get('all', 'file_url') + input_file + "/f.a",
        config.get('all', 'file_site')))
cluster.addFile(a)

# Register one executable per clustering level (levels 1 and 2).
for i in range(1, 3):
    sleep = Executable(namespace="cluster",
                       name="level" + str(i),
                       version="1.0",
                       os="linux",
                       arch="x86_64",
                       installed=config.getboolean('all',
                                                   'executable_installed'))
    # The PFN points at the pegasus-keg test binary shipped with Pegasus.
    sleep.addPFN(
        PFN(
            config.get('all', 'executable_url') + sys.argv[1] +
            "/bin/pegasus-keg", config.get('all', 'executable_site')))
Example #6
0
# Configure the horizontal-clustering test workflow; the dict supplies
# defaults for keys missing from test.config.
# NOTE(review): `ConfigParser.ConfigParser` is the Python 2 spelling.
config = ConfigParser.ConfigParser({'input_file':'', 'workflow_name':'horizontal-clustering-test', 'executable_installed':"False", 'clusters_size':"3", 'clusters_maxruntime':"7"})
config.read(sys.argv[2] + '/test.config')

# Create an abstract dag
cluster = ADAG (config.get('all', 'workflow_name'))

# Resolve the input directory: CWD when unset, else '<USER>/inputs'.
input_file = config.get('all', 'input_file')
if (input_file == ''):
        input_file = os.getcwd ()
else:
        input_file += '/' + os.getenv ('USER') + '/inputs'

# Add input file to the DAX-level replica catalog
a = File("f.a")
a.addPFN(PFN(config.get('all', 'file_url') + input_file + "/f.a", config.get('all', 'file_site')))
cluster.addFile(a)

# Register one executable per clustering level, with Pegasus profiles
# controlling horizontal clustering (clusters.size / clusters.maxruntime).
for i in range (1, 3):
    sleep = Executable (namespace = "cluster", name = "level" + str (i), version = "1.0", os = "linux", arch = "x86", installed=config.getboolean('all', 'executable_installed'))
    sleep.addPFN (PFN (config.get('all', 'executable_url') + sys.argv[1] + "/bin/pegasus-keg", config.get('all', 'executable_site')))
    sleep.addProfile (Profile (namespace = "pegasus", key = "clusters.size", value = config.get('all', 'clusters_size')))
    sleep.addProfile (Profile (namespace = "pegasus", key = "clusters.maxruntime", value = config.get('all', 'clusters_maxruntime')))
    cluster.addExecutable(sleep)

# Create four level-1 jobs, each reading input file 'a'.
# NOTE(review): this loop body appears truncated here (no addJob call
# is visible before the chunk ends).
for i in range (4):
    job = Job (namespace = "cluster", name = "level1", version = "1.0")
    job.addArguments('-a level1 -T ' + str (i + 1))
    job.addArguments('-i', a)
    job.addProfile (Profile (namespace = "pegasus", key = "job.runtime", value = str (i + 1)))
    job.uses(a, link=Link.INPUT)
Example #7
0
def registerRefFastaFile(workflow=None, refFastaFname=None, registerAffiliateFiles=True, input_site_handler='local',\
      checkAffiliateFileExistence=True, addPicardDictFile=True,\
      affiliateFilenameSuffixLs=['fai', 'amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa', \
      'stidx', 'sthash'], folderName="reference"):
    """
    Register a reference FASTA file — and optionally its affiliate
    index/dict files — with the workflow's replica catalog, and report
    which index-building jobs are still needed.

    Suffixes in affiliateFilenameSuffixLs do not include the ".".

    NOTE(review): affiliateFilenameSuffixLs is a mutable default
    argument (shared across calls if ever mutated) and the code uses
    Python 2 idioms (dict.iteritems).

    History:
        2013.08.23 bugfix: check if workflow has a file registered before adding it
        2013.3.26 added refSAMtoolsFastaIndexF, refPicardFastaDictF into returnData
        2013.3.20 deduce needBWARefIndexJob, needSAMtoolsFastaIndexJob,
            needPicardFastaDictJob, needStampyRefIndexJob from missing suffixes
        2010.10.10 added argument folderName
        2012.5.23 added "addPicardDictFile" so callers can exclude that file
            (i.e. in registerBlastNucleotideDatabaseFile)
        2012.2.24
            dict is via picard, also required for GATK.
            fai is via "samtools faidx" (index reference), also required for GATK.
            amb, ann, bwt, pac, sa, rbwt, rpac, rsa are all bwa index files.
            stidx is stampy index; sthash is stampy hash.
        2012.2.23 added two suffixes, stidx (stampy index) and sthash (stampy hash)
        2011-11-11 if needAffiliatedFiles, all other files with a suffix in
            affiliateFilenameSuffixLs are registered (symlinked or copied) too.
    """
    returnData = PassingData(refFastaFList = [], needBWARefIndexJob=False, needSAMtoolsFastaIndexJob=False, \
          needPicardFastaDictJob=False, needStampyRefIndexJob=False, needBlastMakeDBJob=False,\
          refPicardFastaDictF=None, refSAMtoolsFastaIndexF=None)
    # Suffixes whose files were absent; drives the need*Job flags below.
    missingSuffixSet = set()  #2013.3.20

    if registerAffiliateFiles:
        refFastaF = File(
            os.path.join(folderName, os.path.basename(refFastaFname))
        )  #use relative path, otherwise, it'll go to absolute path
        # Add it into replica only when needed.
        refFastaF.addPFN(PFN("file://" + refFastaFname, input_site_handler))
        if not workflow.hasFile(refFastaF):  #2013.08.12
            workflow.addFile(refFastaF)
        returnData.refFastaFList.append(refFastaF)
        # If it's not needed, assume the index is done and all relevant files are in absolute path.
        # and no replica transfer

        #add extra affiliated files
        suffix2PathToFileLs = {}
        if addPicardDictFile:  #2012.5.23
            # Picard's .dict sits next to the FASTA but replaces its
            # extension rather than appending to it.
            picardDictSuffix = 'dict'
            pathToFile = '%s.%s' % (
                os.path.splitext(refFastaFname)[0], picardDictSuffix
            )  #remove ".fasta" from refFastaFname
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s don't exist or not a file on file system. skip registration.\n"
                    % (pathToFile))
                missingSuffixSet.add(picardDictSuffix)
                #suffix2PathToFileLs.append(pathToFile)
            else:
                suffix2PathToFileLs[picardDictSuffix] = pathToFile
        # All other affiliate files append their suffix to the full
        # FASTA filename (e.g. ref.fasta.fai).
        for suffix in affiliateFilenameSuffixLs:
            pathToFile = '%s.%s' % (refFastaFname, suffix)
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s don't exist or not a file on file system. skip registration.\n"
                    % (pathToFile))
                missingSuffixSet.add(suffix)
                continue
            suffix2PathToFileLs[suffix] = pathToFile
        # Register every affiliate file that survived the existence check.
        # NOTE(review): the isfile re-check here is redundant — entries
        # were already filtered above.
        for suffix, pathToFile in suffix2PathToFileLs.iteritems():
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s don't exist or not a file on file system. skip registration.\n"
                    % (pathToFile))
                continue
            affiliateF = File(
                os.path.join(folderName, os.path.basename(pathToFile)))
            #use relative path, otherwise, it'll go to absolute path
            affiliateF.addPFN(PFN("file://" + pathToFile, input_site_handler))
            if not workflow.hasFile(affiliateF):  #2013.08.12
                workflow.addFile(affiliateF)
            returnData.refFastaFList.append(affiliateF)

            if suffix == 'dict':  #2013.3.26
                returnData.refPicardFastaDictF = affiliateF
            elif suffix == 'fai':
                returnData.refSAMtoolsFastaIndexF = affiliateF
    else:
        # No registration requested: just hand back a File object;
        # indexing is assumed done and paths absolute.
        refFastaF = File(
            os.path.join(folderName, os.path.basename(refFastaFname)))
        returnData.refFastaFList.append(refFastaF)
    # Translate missing affiliate files into the index jobs required.
    if 'bwt' in missingSuffixSet or 'pac' in missingSuffixSet:
        returnData.needBWARefIndexJob = True
    if 'fai' in missingSuffixSet:
        returnData.needSAMtoolsFastaIndexJob = True
        returnData.needPicardFastaDictJob = True
    if 'stidx' in missingSuffixSet or 'sthash' in missingSuffixSet:
        returnData.needStampyRefIndexJob = True
    if 'dict' in missingSuffixSet:
        returnData.needPicardFastaDictJob = True
    if 'nin' in missingSuffixSet or 'nhr' in missingSuffixSet or 'nsq' in missingSuffixSet:
        returnData.needBlastMakeDBJob = True
    return returnData
Example #8
0
    def write_dax(self, filename='workflow.dax', name='workflow'):
        """Generate Pegasus abstract workflow (DAX) and write it to disk.

        Parameters
        ----------
        filename : `str`, optional
            File to write the DAX to.
        name : `str`, optional
            Name of the DAX.

        Raises
        ------
        `AttributeError`
            If either a task or a file node is missing a mandatory
            attribute ('lfn' for files, 'exec_name' for tasks).

        Notes
        -----
        The method returns `None`; the workflow is written in DAX (XML)
        format to *filename*.
        """
        dax = ADAG(name)

        # Process file nodes.
        for file_id in self.files:
            attrs = self.graph.node[file_id]
            try:
                name = attrs['lfn']
            except KeyError:
                # Bugfix: the message used a %-style placeholder with
                # str.format(), so the attribute name was never inserted.
                msg = 'Mandatory attribute "{}" is missing.'
                raise AttributeError(msg.format('lfn'))
            file_ = File(name)

            # Add physical file names, if any.
            urls = attrs.get('pfn')
            if urls is not None:
                urls = urls.split(',')
                sites = attrs.get('sites')
                if sites is None:
                    sites = len(urls) * ['condorpool']
                elif isinstance(sites, str):
                    # Bugfix: a comma-separated 'sites' string was zipped
                    # per character; split it into one site per URL.
                    sites = sites.split(',')
                for url, site in zip(urls, sites):
                    file_.addPFN(PFN(url, site))

            self.catalog[attrs['lfn']] = file_

        # Add jobs to the DAX.
        for task_id in self.tasks:
            attrs = self.graph.node[task_id]
            try:
                name = attrs['exec_name']
            except KeyError:
                msg = 'Mandatory attribute "{}" is missing.'
                raise AttributeError(msg.format('exec_name'))
            label = '{name}_{id}'.format(name=name, id=task_id)
            job = Job(name, id=task_id, node_label=label)

            # Add job command line arguments replacing any file name with
            # respective Pegasus file object. Only exact LFN matches are
            # replaced, and only their first occurrence.
            args = attrs.get('exec_args', [])
            if args:
                args = args.split()
                lfns = list(set(self.catalog) & set(args))
                if lfns:
                    indices = [args.index(lfn) for lfn in lfns]
                    for idx, lfn in zip(indices, lfns):
                        args[idx] = self.catalog[lfn]
                job.addArguments(*args)

            # Specify job's inputs.
            inputs = [file_id for file_id in self.graph.predecessors(task_id)]
            for file_id in inputs:
                attrs = self.graph.node[file_id]
                is_ignored = attrs.get('ignore', False)
                if not is_ignored:
                    file_ = self.catalog[attrs['lfn']]
                    job.uses(file_, link=Link.INPUT)

            # Specify job's outputs
            outputs = [file_id for file_id in self.graph.successors(task_id)]
            for file_id in outputs:
                attrs = self.graph.node[file_id]
                is_ignored = attrs.get('ignore', False)
                if not is_ignored:
                    file_ = self.catalog[attrs['lfn']]
                    job.uses(file_, link=Link.OUTPUT)

                    # 'streams' is a bitmask: bit 0 (value 1) routes the
                    # file to stdout, bit 1 (value 2) to stderr.
                    streams = attrs.get('streams')
                    if streams is not None:
                        if streams & 1 != 0:
                            job.setStdout(file_)
                        if streams & 2 != 0:
                            job.setStderr(file_)

            # Provide default files to store stderr and stdout, if not
            # specified explicitly.
            # Bugfix: the suffixes were swapped — stderr was sent to
            # '<label>.out' and stdout to '<label>.err'.
            if job.stderr is None:
                file_ = File('{name}.err'.format(name=label))
                job.uses(file_, link=Link.OUTPUT)
                job.setStderr(file_)
            if job.stdout is None:
                file_ = File('{name}.out'.format(name=label))
                job.uses(file_, link=Link.OUTPUT)
                job.setStdout(file_)

            dax.addJob(job)

        # Add job dependencies to the DAX: a task's parents are the
        # producers of its input files (task -> file -> task edges).
        for task_id in self.tasks:
            parents = set()
            for file_id in self.graph.predecessors(task_id):
                parents.update(self.graph.predecessors(file_id))
            for parent_id in parents:
                dax.depends(parent=dax.getJob(parent_id),
                            child=dax.getJob(task_id))

        # Finally, write down the workflow in DAX format.
        with open(filename, 'w') as f:
            dax.writeXML(f)