def path_to_pegasus_file(path: Path, *, site: str = "local", name: Optional[str] = None, is_raw_input: bool = False) -> File:
    """Build a Pegasus ``File`` for *path*, usable as a job input or output.

    If the resource is not on the local machine, provide the *site* string
    (e.g. "saga" when running on a cluster).

    Args:
        path: Path to the file on disk.
        site: Pegasus site handle; defaults to "local".
        name: Logical name for the file; derived from the absolute path
            (with "/" replaced by "-") when omitted.
        is_raw_input: True when the file is not produced by another job in
            the workflow, so its physical file name can be safely added to
            the Pegasus DAX.

    Returns:
        Pegasus ``File`` for the given path.
    """
    logical_name = name or str(path.absolute()).replace("/", "-")
    pegasus_file = File(logical_name)
    if is_raw_input:
        # Only raw inputs get a PFN; produced files are located by Pegasus.
        pegasus_file.addPFN(path_to_pfn(path, site=site))
    return pegasus_file
def registerFile(workflow, filename):
    """Register *filename* with the workflow under its input site handler.

    2011.12.13 function to register any file to the
    workflow.input_site_handler.
    """
    pegasus_file = File(os.path.basename(filename))
    pfn_url = "file://" + os.path.abspath(filename)
    pegasus_file.addPFN(PFN(pfn_url, workflow.input_site_handler))
    workflow.addFile(pegasus_file)
    return pegasus_file
def createFile(self, fileName, fileURL=None, site=None):
    """Return the Pegasus File registered for *fileName*, creating it on
    first use.

    A default file:// URL under the WMS output directory and the "local"
    site are used when *fileURL* or *site* are not supplied.  The created
    file is cached in ``self.files``.
    """
    existing = self.getFile(fileName)
    if existing:
        return existing

    new_file = File(fileName)
    if not fileURL:
        fileURL = "file://%s/%s" % (self.wms.getOutputDir(), fileName)
    logger.debug("fileurl: %s", fileURL)
    if not site:
        site = "local"
    # A non-string, non-empty sequence of URLs collapses to its first entry.
    if not isinstance(fileURL, basestring) and len(fileURL) > 0:
        fileURL = fileURL[0]
    logger.debug("--add-pfn: (%s)(%s)(%s)", fileName, fileURL, site)
    new_file.addPFN(PFN(fileURL, site))
    self.files[fileName] = new_file
    return new_file
def write(self, filename, name='dax'):
    """Generate Pegasus abstract workflow (DAX) and write it to a file.

    Parameters
    ----------
    filename : `str`
        File to write the DAX to.
    name : `str`, optional
        Name of the DAX.

    Notes
    -----
    The workflow is serialized in DAX XML format to *filename*; nothing is
    returned.
    """
    dax = ADAG(name)

    # Add files to DAX-level replica catalog.
    # Maps logical file name (lfn) -> Pegasus File, so jobs below can
    # reference the same File objects.
    catalog = {}
    for file_id in self.files:
        attrs = self.graph.node[file_id]
        f = File(attrs['lfn'])

        # Add physical file names, if any.  'urls' and 'sites' are
        # comma-separated strings.
        urls = attrs.get('urls')
        if urls is not None:
            sites = attrs.get('sites')
            if sites is None:
                # NOTE(review): len(urls) counts characters of the
                # comma-separated string, not URLs, so this builds far more
                # 'local' entries than needed; zip() below truncates the
                # surplus, so the result is still one 'local' per URL.
                sites = ','.join(len(urls) * ['local'])
            for url, site in zip(urls.split(','), sites.split(',')):
                f.addPFN(PFN(url, site))
        catalog[attrs['lfn']] = f
        dax.addFile(f)

    # Add jobs to the DAX.
    for task_id in self.tasks:
        attrs = self.graph.node[task_id]
        job = Job(name=attrs['name'], id=task_id)

        # Add job command line arguments replacing any file name with
        # respective Pegasus file object.
        args = attrs.get('args')
        if args is not None and args:
            args = args.split()
            # Arguments that match a known lfn are swapped for File objects.
            lfns = list(set(catalog) & set(args))
            if lfns:
                indices = [args.index(lfn) for lfn in lfns]
                for idx, lfn in zip(indices, lfns):
                    args[idx] = catalog[lfn]
            job.addArguments(*args)

        # Specify job's inputs: every predecessor file node in the graph.
        inputs = [file_id for file_id in self.graph.predecessors(task_id)]
        for file_id in inputs:
            attrs = self.graph.node[file_id]
            f = catalog[attrs['lfn']]
            job.uses(f, link=Link.INPUT)

        # Specify job's outputs: every successor file node in the graph.
        outputs = [file_id for file_id in self.graph.successors(task_id)]
        for file_id in outputs:
            attrs = self.graph.node[file_id]
            f = catalog[attrs['lfn']]
            job.uses(f, link=Link.OUTPUT)

            # 'streams' is a bitmask: bit 0 -> this file is the job's
            # stdout, bit 1 -> this file is the job's stderr.
            streams = attrs.get('streams')
            if streams is not None:
                if streams & 1 != 0:
                    job.setStdout(f)
                if streams & 2 != 0:
                    job.setStderr(f)

        dax.addJob(job)

    # Add job dependencies to the DAX: a task depends on every task that
    # produced one of its input files (file nodes sit between tasks).
    for task_id in self.tasks:
        parents = set()
        for file_id in self.graph.predecessors(task_id):
            parents.update(self.graph.predecessors(file_id))
        for parent_id in parents:
            dax.depends(parent=dax.getJob(parent_id), child=dax.getJob(task_id))

    # Finally, write down the workflow in DAX format.
    with open(filename, 'w') as f:
        dax.writeXML(f)
})
# Read the test configuration (path is supplied as the second CLI argument).
config.read(sys.argv[2] + '/test.config')

# Create an abstract dag
cluster = ADAG(config.get('all', 'workflow_name'))

# Resolve the input directory: fall back to the CWD when not configured,
# otherwise use a per-user 'inputs' subdirectory.
input_file = config.get('all', 'input_file')
if (input_file == ''):
    input_file = os.getcwd()
else:
    input_file += '/' + os.getenv('USER') + '/inputs'

# Add input file to the DAX-level replica catalog
a = File("f.a")
a.addPFN(
    PFN(
        config.get('all', 'file_url') + input_file + "/f.a",
        config.get('all', 'file_site')))
cluster.addFile(a)

# Register one executable per clustering level (level1, level2); both point
# at the pegasus-keg binary built under the first CLI argument.
for i in range(1, 3):
    sleep = Executable(namespace="cluster", name="level" + str(i), version="1.0",
                       os="linux", arch="x86_64",
                       installed=config.getboolean('all', 'executable_installed'))
    sleep.addPFN(
        PFN(
            config.get('all', 'executable_url') + sys.argv[1] + "/bin/pegasus-keg",
            config.get('all', 'executable_site')))
# Test configuration with defaults for every key the script reads.
config = ConfigParser.ConfigParser({'input_file': '',
                                    'workflow_name': 'horizontal-clustering-test',
                                    'executable_installed': "False",
                                    'clusters_size': "3",
                                    'clusters_maxruntime': "7"})
# Config file path is supplied as the second CLI argument.
config.read(sys.argv[2] + '/test.config')

# Create an abstract dag
cluster = ADAG(config.get('all', 'workflow_name'))

# Resolve the input directory: fall back to the CWD when not configured,
# otherwise use a per-user 'inputs' subdirectory.
input_file = config.get('all', 'input_file')
if (input_file == ''):
    input_file = os.getcwd()
else:
    input_file += '/' + os.getenv('USER') + '/inputs'

# Add input file to the DAX-level replica catalog
a = File("f.a")
a.addPFN(PFN(config.get('all', 'file_url') + input_file + "/f.a",
             config.get('all', 'file_site')))
cluster.addFile(a)

# Register one executable per clustering level (level1, level2) with
# horizontal-clustering profiles (clusters.size / clusters.maxruntime).
for i in range(1, 3):
    sleep = Executable(namespace="cluster", name="level" + str(i), version="1.0",
                       os="linux", arch="x86",
                       installed=config.getboolean('all', 'executable_installed'))
    sleep.addPFN(PFN(config.get('all', 'executable_url') + sys.argv[1] + "/bin/pegasus-keg",
                     config.get('all', 'executable_site')))
    sleep.addProfile(Profile(namespace="pegasus", key="clusters.size",
                             value=config.get('all', 'clusters_size')))
    sleep.addProfile(Profile(namespace="pegasus", key="clusters.maxruntime",
                             value=config.get('all', 'clusters_maxruntime')))
    cluster.addExecutable(sleep)

# Create four level-1 jobs with increasing simulated runtimes (1..4 s).
for i in range(4):
    job = Job(namespace="cluster", name="level1", version="1.0")
    job.addArguments('-a level1 -T ' + str(i + 1))
    job.addArguments('-i', a)
    job.addProfile(Profile(namespace="pegasus", key="job.runtime",
                           value=str(i + 1)))
    job.uses(a, link=Link.INPUT)
def registerRefFastaFile(workflow=None, refFastaFname=None, registerAffiliateFiles=True, input_site_handler='local',
                         checkAffiliateFileExistence=True, addPicardDictFile=True,
                         affiliateFilenameSuffixLs=['fai', 'amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa',
                                                    'stidx', 'sthash'],
                         folderName="reference"):
    """Register a reference FASTA file (and its index/affiliate files) with
    the workflow and report which index-building jobs are still needed.

    suffix here doesn't include ".".

    2013.08.23 bugfix, check if workflow has a file registered before adding it
    2013.3.26 added refSAMtoolsFastaIndexF, refPicardFastaDictF into returnData
    2013.3.20 deduce needBWARefIndexJob, needSAMtoolsFastaIndexJob,
        needPicardFastaDictJob, needStampyRefIndexJob from missing suffixes
    2010.10.10 added argument folderName
    2012.5.23 add an argument "addPicardDictFile" to offer user option to
        exclude this file (i.e. in registerBlastNucleotideDatabaseFile)
    2012.2.24
        dict is via picard, also required for GATK
        fai is via "samtools faidx" (index reference). also required for GATK
        amb', 'ann', 'bwt', 'pac', 'sa', 'rbwt', 'rpac', 'rsa' are all bwa index.
        stidx is stampy index. sthash is stampy hash.
    2012.2.23 add two suffixes, stidx (stampy index) and sthash (stampy hash)
    2011-11-11 if needAffiliatedFiles, all other files, with suffix in
        affiliateFilenameSuffixLs, will be registered (symlinked or copied) as well.
    """
    # Aggregate result: the registered File objects plus flags telling the
    # caller which indexing jobs must be scheduled.
    returnData = PassingData(refFastaFList=[], needBWARefIndexJob=False, needSAMtoolsFastaIndexJob=False,
                             needPicardFastaDictJob=False, needStampyRefIndexJob=False, needBlastMakeDBJob=False,
                             refPicardFastaDictF=None, refSAMtoolsFastaIndexF=None)
    missingSuffixSet = set()  # 2013.3.20: suffixes whose files are absent on disk
    if registerAffiliateFiles:
        # use relative path, otherwise, it'll go to absolute path
        refFastaF = File(os.path.join(folderName, os.path.basename(refFastaFname)))
        # Add it into replica only when needed.
        refFastaF.addPFN(PFN("file://" + refFastaFname, input_site_handler))
        if not workflow.hasFile(refFastaF):  # 2013.08.12
            workflow.addFile(refFastaF)
        returnData.refFastaFList.append(refFastaF)
        # If it's not needed, assume the index is done and all relevant files
        # are in absolute path, and no replica transfer.

        # add extra affiliated files
        suffix2PathToFileLs = {}
        if addPicardDictFile:  # 2012.5.23
            picardDictSuffix = 'dict'
            # remove ".fasta" from refFastaFname: the picard dict replaces
            # the extension rather than appending to it.
            pathToFile = '%s.%s' % (os.path.splitext(refFastaFname)[0], picardDictSuffix)
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s don't exist or not a file on file system. skip registration.\n" % (pathToFile))
                missingSuffixSet.add(picardDictSuffix)
                # suffix2PathToFileLs.append(pathToFile)
            else:
                suffix2PathToFileLs[picardDictSuffix] = pathToFile
        # Affiliate files (bwa/samtools/stampy indexes) append their suffix
        # to the full FASTA filename.
        for suffix in affiliateFilenameSuffixLs:
            pathToFile = '%s.%s' % (refFastaFname, suffix)
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s don't exist or not a file on file system. skip registration.\n" % (pathToFile))
                missingSuffixSet.add(suffix)
                continue
            suffix2PathToFileLs[suffix] = pathToFile
        # Register every affiliate file that exists.
        for suffix, pathToFile in suffix2PathToFileLs.iteritems():
            if checkAffiliateFileExistence and not os.path.isfile(pathToFile):
                sys.stderr.write(
                    "Warning: %s don't exist or not a file on file system. skip registration.\n" % (pathToFile))
                continue
            # use relative path, otherwise, it'll go to absolute path
            affiliateF = File(os.path.join(folderName, os.path.basename(pathToFile)))
            affiliateF.addPFN(PFN("file://" + pathToFile, input_site_handler))
            if not workflow.hasFile(affiliateF):  # 2013.08.12
                workflow.addFile(affiliateF)
            returnData.refFastaFList.append(affiliateF)
            if suffix == 'dict':  # 2013.3.26
                returnData.refPicardFastaDictF = affiliateF
            elif suffix == 'fai':
                returnData.refSAMtoolsFastaIndexF = affiliateF
    else:
        refFastaF = File(os.path.join(folderName, os.path.basename(refFastaFname)))
        returnData.refFastaFList.append(refFastaF)
    # Derive which index jobs are needed from the suffixes found missing.
    if 'bwt' in missingSuffixSet or 'pac' in missingSuffixSet:
        returnData.needBWARefIndexJob = True
    if 'fai' in missingSuffixSet:
        returnData.needSAMtoolsFastaIndexJob = True
        returnData.needPicardFastaDictJob = True
    if 'stidx' in missingSuffixSet or 'sthash' in missingSuffixSet:
        returnData.needStampyRefIndexJob = True
    if 'dict' in missingSuffixSet:
        returnData.needPicardFastaDictJob = True
    if 'nin' in missingSuffixSet or 'nhr' in missingSuffixSet or 'nsq' in missingSuffixSet:
        returnData.needBlastMakeDBJob = True
    return returnData
def write_dax(self, filename='workflow.dax', name='workflow'):
    """Generate Pegasus abstract workflow (DAX) and write it to a file.

    Parameters
    ----------
    filename : `str`, optional
        File to write the DAX to.
    name : `str`, optional
        Name of the DAX.

    Raises
    ------
    `AttributeError`
        If a file node is missing its mandatory 'lfn' attribute or a task
        node is missing its mandatory 'exec_name' attribute.
    """
    dax = ADAG(name)

    # Process file nodes.
    for file_id in self.files:
        attrs = self.graph.node[file_id]
        try:
            name = attrs['lfn']
        except KeyError:
            # Bugfix: message used a '%s' placeholder with str.format(),
            # so the attribute name was never interpolated.
            msg = 'Mandatory attribute "{}" is missing.'
            raise AttributeError(msg.format('lfn'))
        file_ = File(name)

        # Add physical file names, if any ('pfn' is comma-separated).
        urls = attrs.get('pfn')
        if urls is not None:
            urls = urls.split(',')
            sites = attrs.get('sites')
            if sites is None:
                # Default every PFN to the 'condorpool' site.
                sites = len(urls) * ['condorpool']
            for url, site in zip(urls, sites):
                file_.addPFN(PFN(url, site))
        self.catalog[attrs['lfn']] = file_

    # Add jobs to the DAX.
    for task_id in self.tasks:
        attrs = self.graph.node[task_id]
        try:
            name = attrs['exec_name']
        except KeyError:
            msg = 'Mandatory attribute "{}" is missing.'
            raise AttributeError(msg.format('exec_name'))
        label = '{name}_{id}'.format(name=name, id=task_id)
        job = Job(name, id=task_id, node_label=label)

        # Add job command line arguments replacing any file name with
        # respective Pegasus file object.
        args = attrs.get('exec_args', [])
        if args:
            args = args.split()
            lfns = list(set(self.catalog) & set(args))
            if lfns:
                indices = [args.index(lfn) for lfn in lfns]
                for idx, lfn in zip(indices, lfns):
                    args[idx] = self.catalog[lfn]
            job.addArguments(*args)

        # Specify job's inputs: predecessor file nodes not marked 'ignore'.
        for file_id in self.graph.predecessors(task_id):
            attrs = self.graph.node[file_id]
            if not attrs.get('ignore', False):
                job.uses(self.catalog[attrs['lfn']], link=Link.INPUT)

        # Specify job's outputs: successor file nodes not marked 'ignore'.
        for file_id in self.graph.successors(task_id):
            attrs = self.graph.node[file_id]
            if attrs.get('ignore', False):
                continue
            file_ = self.catalog[attrs['lfn']]
            job.uses(file_, link=Link.OUTPUT)

            # 'streams' is a bitmask: bit 0 -> stdout, bit 1 -> stderr.
            streams = attrs.get('streams')
            if streams is not None:
                if streams & 1 != 0:
                    job.setStdout(file_)
                if streams & 2 != 0:
                    job.setStderr(file_)

        # Provide default files to store stdout and stderr, if not
        # specified explicitly.  Bugfix: the suffixes were swapped —
        # stderr was given '<label>.out' and stdout '<label>.err'.
        if job.stderr is None:
            file_ = File('{name}.err'.format(name=label))
            job.uses(file_, link=Link.OUTPUT)
            job.setStderr(file_)
        if job.stdout is None:
            file_ = File('{name}.out'.format(name=label))
            job.uses(file_, link=Link.OUTPUT)
            job.setStdout(file_)

        dax.addJob(job)

    # Add job dependencies to the DAX: a task depends on every task that
    # produced one of its input files.
    for task_id in self.tasks:
        parents = set()
        for file_id in self.graph.predecessors(task_id):
            parents.update(self.graph.predecessors(file_id))
        for parent_id in parents:
            dax.depends(parent=dax.getJob(parent_id), child=dax.getJob(task_id))

    # Finally, write down the workflow in DAX format.
    with open(filename, 'w') as f:
        dax.writeXML(f)