def getLogicalRecordCount(self, arg_filename):
    """
    Get an approximate logical record count for a fasta file.

    If self.isListFile(arg_filename) is true, arg_filename is read as a list
    of file names (one per line) and the counts for all of the listed files
    are summed; otherwise arg_filename itself is counted. Each file is counted
    by running the external command "kseq_count -a <filename>" and parsing
    its standard output as an integer.

    Returns the total record count (0 if a list file names no files).

    Raises tutils.tardisException if the count command cannot be started,
    exits with a non-zero status, or prints something that is not an integer.
    """
    if self.isListFile(arg_filename):
        with open(arg_filename, "r") as file_list:
            # skip blank lines (e.g. a trailing newline) - an empty file name
            # would otherwise be handed to kseq_count and fail the whole count
            filenames = [record.strip() for record in file_list if record.strip()]
    else:
        filenames = [arg_filename]

    record_count = 0
    for filename in filenames:
        count_command = ["kseq_count", "-a", filename]
        try:
            self.logWriter.info("getLogicalRecordCount executing: %s" % " ".join(count_command))
            proc = subprocess.Popen(count_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = proc.communicate()
            if proc.returncode != 0:
                # surface the tool's own diagnostics rather than a bare int() parse error
                raise RuntimeError("%s returned status %s : %s" % (
                    " ".join(count_command), proc.returncode, stderr))
            record_count += int(stdout)
        except Exception as e:
            message = "getLogicalRecordCount -failed getting logical record count : %s" % str(e)
            self.logWriter.info(message)
            raise tutils.tardisException(message)

    self.logWriter.info(
        "getLogicalRecordCount estimates there are %d records in %s" % (record_count, arg_filename))
    return record_count
def runCommand(self, argCommand=None):
    """
    Set up and (unless this is a dry run) submit a condor job for one chunk.

    argCommand : the command (a list of words) to run. If None, self.command
                 is used. An empty command is logged as "nothing to do".

    On the first call (self.submitCount == 0) this writes two files to
    self.workingRoot :
      * run<jobNumber>.sh  - shell wrapper rendered from self.shell_script_template
      * run<jobNumber>.job - condor job file rendered from self.job_template
    and records self.scriptfilename, self.jobfilename, self.logname,
    self.stderrnamepattern and self.stdoutnamepattern. On a rerun these files
    are reused.

    Unless self.controller.options["dry_run"] is set, "condor_submit <jobfile>"
    is then executed, self.submitCount is incremented, and the submit
    process output / status are stored in self.stdout, self.stderr and
    self.submitreturncode.

    Raises tutils.tardisException if the shell script already exists on the
    first submission.
    """
    command = self.command if argCommand is None else argCommand

    if len(command) == 0:
        self.logWriter.info("condorhpcJob : nothing to do")
        return

    self.logWriter.info("condorhpcJob : running %s"%str(command))

    # set up the shell scriptfile and condor jobfile (one of each per chunk),
    # unless this is a rerun in which case its already been done
    if self.submitCount == 0:
        runtime_environmentcode = self.runtime_config_template.safe_substitute() # currently no templating actually done here
        # render the command that was actually requested (previously self.command
        # was always used here, so an explicit argCommand was logged but never run)
        shellcode = self.shell_script_template.safe_substitute(
            configure_runtime_environment=runtime_environmentcode,
            hpcdir=self.workingRoot,
            command=" ".join(command),
            startdir=self.controller.options["startdir"],
            input_conditioning=str(self.controller.options["input_conditioning"]))

        self.scriptfilename = os.path.join(self.workingRoot, "run%d.sh"%self.jobNumber)
        if os.path.isfile(self.scriptfilename):
            raise tutils.tardisException("error %s already exists"%self.scriptfilename)

        with open(self.scriptfilename, "w") as script_file:
            self.logWriter.info("condorhpcJob : condor shell script wrapper is %s"%self.scriptfilename)
            script_file.write(shellcode)
        # rwx for the owner, r-x for group and others
        os.chmod(self.scriptfilename,
                 stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

        # names derived from the script name
        script_basename = re.escape(os.path.basename(self.scriptfilename))
        self.logname = re.sub(r"\.sh$", ".log", self.scriptfilename)
        self.stderrnamepattern = r"%s\.err\.\S+$"%script_basename
        self.stdoutnamepattern = r"%s\.out\.\S+$"%script_basename
        self.jobfilename = re.sub(r"\.sh$", ".job", self.scriptfilename)

        jobcode = self.job_template.safe_substitute(
            script=self.scriptfilename, log=self.logname, rundir=self.workingRoot)
        self.logWriter.info("condorhpcJob : condor job file is %s"%self.jobfilename)
        with open(self.jobfilename, "w") as job_file:
            job_file.write(jobcode)

    # submit the condor job
    condor_submit = ["condor_submit", self.jobfilename]
    if self.controller.options["dry_run"]:
        self.logWriter.info("condorhpcJob : this is a dry run - not launching the job")
        return

    self.logWriter.info("condorhpcJob : launching using %s"%str(condor_submit))
    self.proc = subprocess.Popen(condor_submit, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    self.submitCount += 1
    (self.stdout, self.stderr) = self.proc.communicate()
    self.submitreturncode = self.proc.returncode
    self.logWriter.info("condorhpcJob : %s has returned (status %s) - here is its output (but now we wait for the real output !)"%(str(condor_submit), self.submitreturncode))
    self.logWriter.info("condorhpcJob : stdout : \n%s"%self.stdout)
    self.logWriter.info("condorhpcJob : stderr : \n%s"%self.stderr)
def tardis_main():
    """
    Command line entry point : parse the tardis options and the command to condition.

    The options are read from sys.argv via argparse. Options that were not
    supplied (value None) and the 'command' entry are dropped, and the rest
    are merged with the configuration via tutils.mergeOptionsWithConfig.

    Returns 2 (after printing the message to stderr) if merging the options
    with the configuration raises tutils.tardisException.

    Raises tardisException if any word of the command contains '!' or '&',
    or the stand-alone words rm, mv or cp, or if the command is empty.
    """
    parser = argparse.ArgumentParser(description='Condition a command for execution on a cluster.')

    parser.add_argument(
        '-w', '--in-workflow', dest='in_workflow', action='store_const', const=True,
        help='Run the command as part of a workflow. After launching all of the jobs, tardis waits for all outputs, which are then collated and merged into a single output file, as specified by the output file path in the original command; all of the temporary input files (for example chunks of uncompressed fastq) are deleted provided all prior steps completed without error (if there was an error they are left there to assist with debugging). Without this option, the program exits immediately after launching all of the jobs, and output is left un-collated in the scratch folder created by this script, and no cleanup is done.')
    parser.add_argument(
        '-c', '--chunksize', dest='chunksize', type=int, metavar='N',
        help='When conditioning the input file(s), split into files each containing N logical records. (A logical record for a sequence file is a complete sequence. For a text file it is a line of text). (If the -s option is used to sample the inputs, the chunksize relates to the full un-sampled file . so the same chunk-size can be used whether random sampling or not. For example a chunksize of 1,000,000 is specified in combination with a sampling rate of .0001, then each chunk would contain 100 sequences . i.e. you should not adjust the chunk-size, for the sampling rate. Note that to avoid a race-condition that could be caused by a very small chunk-size resulting in launching a very large number of jobs, tardis will throw an exception if the chunk-size used would result in launching more than MAX_DIMENSION jobs (currently 5000) )')
    parser.add_argument(
        '--from', '--from-record', dest='from_record', type=int, metavar='N',
        help='When conditioning the input file(s), only use records from the input file after or including N (where that is logical record number . e.g. in a fastq file, start from record number N means start from sequence N). By combining this option with -to, you can process slices of a file. Note that this option has no affect when processing a list-file.')
    parser.add_argument(
        '--to', '--to-record', dest='to_record', type=int, metavar='N',
        help='When conditioning the input file(s), only use records up to and including the record N (where that is logical record number . e.g. in a fastq file, process up to record number N means process up to and including sequence N). By combining this option with -from, you can process slices of a file. Note that this option has no affect when processing a list-file.')
    parser.add_argument(
        '-s', dest='samplerate', type=float, metavar='RATE',
        help='Rather than process the entire input file(s), a random sample of the records is processed. RATE is the probability that a given record will be sampled. For example -s .001 will result in roughly 1 in every 1000 logical records being sampled. When the -s option is specified, tardis does not clean up the conditioned input and output . e.g. all of the uncompressed fastq sample fragments would be retained. These are retained to assist with the Q/C work that is normally associated with a sampled run. Paired fastq input files are sampled in lock-step, provided the paired fastq conditioning directive is used for both files.')
    parser.add_argument(
        '-d', '--rootdir', dest='rootdir', type=str, metavar='DIR',
        help='create the tardis working folder under DIR. If no working root is specified, a default location is used.')
    parser.add_argument(
        '--dry-run', dest='dry_run', action='store_const', const=True,
        help='validate the run by doing a dry run. This means that the chunks, job scripts and job files etc. are all generated but the jobs are not launched. The user can start then kill (CTRL-C) the run, inspect the script and job files that were generated to check that their command has been conditioned as envisaged.')
    parser.add_argument(
        '-k', '--keep-conditioned-data', dest='keep_conditioned_data', action='store_const', const=True,
        help='keep the conditioned input and output - i.e. the input and output fragments. Normally in workflow mode these are deleted after the output is successfully "unconditioned" - i.e. joined back together')
    parser.add_argument(
        '--job-file', dest='jobtemplatefile', type=str, metavar='FILE',
        help='optionally supply a job template - tardis will read the contents of FILE and use this as the job template.')
    parser.add_argument(
        '--templatedir', dest='templatedir', type=str, metavar='DIR',
        help='template directory')
    parser.add_argument(
        '--job-template-name', dest='job_template_name', type=str, metavar='NAME',
        help='job template name, resolved in template directory')
    parser.add_argument(
        '--hpctype', dest='hpctype', type=str,
        help='indicate the hpc environment. Currently the only supported values are: condor which results in tardis attempting to set up and launch condor jobs; local which results in each job being launched by tardis itself on the local machine, using the native python sub-process API. The maximum number of processes it will run at a time is controlled by a global variable in the script MAX_PROCESSES, which is initially 10; slurm which results in tardis attempting to set up and launch slurm jobs.')
    parser.add_argument(
        '--batonfile', dest='batonfile', type=str, metavar='FILE',
        help='if you supply a "baton file" FILE, tardis will write the process exit code to this file after all processing has completed. This can be useful to preserve synchronous execution of a workflow, even if tardis is started in the background - the workflow can test the existence of the batonfile - if it exists then the corresponding tardis processing step has completed (i.e. another way of each step in a workflow "passing the baton" to the next step)')
    parser.add_argument(
        '--shell-include-file', '--runtimeconfigsourcefile', dest='runtimeconfigsourcefile', type=str, metavar='FILE',
        help='shell script fragment included in jobs')
    parser.add_argument(
        '-q', '--quiet', dest='quiet', action='store_const', const=True,
        help='run quietly')
    parser.add_argument(
        '--userconfig', dest='userconfig', metavar='FILE',
        help='user configuration file')
    parser.add_argument(
        '--no-sysconfig', dest='no_sysconfig', action='store_true', default=False,
        help='ignore the system configuration file')
    parser.add_argument('command', help='command to run')
    parser.add_argument('arg', nargs=argparse.REMAINDER, help='command arguments')

    args = parser.parse_args()

    # only options that were actually supplied are passed on, so that unset
    # ones can be filled in from the configuration
    options = dict((k, v) for (k, v) in vars(args).items() if v is not None and k != 'command')

    # filter command args - do not allow irrelevant/dangerous shell chars
    # ('!' or '&') or the stand-alone words rm, mv, cp
    dangerous = re.compile(r"[\!\&]|(?<!\w)rm(?!\w)|(?<!\w)mv(?!\w)|(?<!\w)cp(?!\w)")
    command_args = [args.command] + args.arg
    for arg in command_args:
        if dangerous.search(arg) is not None:
            raise tardisException("error : dangerous argument to shell ( %s ) - will not run this"%arg)

    if len(args.command) < 1:
        raise tardisException("please supply a valid command to condition and run (type tardis -h for usage)")

    try:
        options = tutils.mergeOptionsWithConfig(options)
    except tutils.tardisException as msg:
        sys.stderr.write("%s\n" % (msg,))
        return 2
def run(toolargs, options, stdout = sys.stdout, stderr=sys.stderr, checkCommandIsValid = True):
    """
    Condition and run a command : split the inputs into chunks, set up and
    launch one job per chunk, and (in a workflow, or when output needs to be
    unconditioned) wait for and collate the results.

    toolargs            : the command to condition, as a list of words
    options             : dictionary of merged options. Entries read here include
                          quiet, in_workflow, dry_run, samplerate,
                          keep_conditioned_data, valid_command_patterns and
                          optionally job_template_name / jobtemplatefile.
                          job_template_name is deleted from the dictionary if
                          jobtemplatefile is also given (the file wins).
    stdout, stderr      : streams that the job output streams are unconditioned
                          to; result descriptions are also written to stderr
    checkCommandIsValid : if True, the command is checked against
                          options["valid_command_patterns"] first

    Returns the bare integer 2 if the command is not supported (note : not a
    tuple), otherwise a tuple (exit_code, hpcConditioner) where exit_code is 0
    if no data or job errors were recorded, and 2 otherwise.

    Raises tardisException if polling for job output exceeds
    hpc.hpcJob.POLL_DURATION.
    """
    def say(message):
        # console chatter, suppressed by the "quiet" option
        if not options["quiet"]:
            print(message)

    # some merging / prioritisation of options is needed in some cases.
    msg_for_log = None  # we don't have a logger yet - will log this later when we do
    if options.get("job_template_name", None) is not None and options.get("jobtemplatefile", None) is not None:
        msg_for_log = "warning - a job template filename was specified (%s) - this overrides the job template name specified (%s)"%(options.get("jobtemplatefile", None), options.get("job_template_name", None))
        del options["job_template_name"]

    # if we check the command is supported on this tardis
    if checkCommandIsValid:
        if not isCommandValid(toolargs, options["valid_command_patterns"]):
            stderr.write("%s is not supported by this tardis engine\n"%toolargs[0])
            return 2

    say("tool args = %s"%str(toolargs))

    # set up logging and working folder for this run
    (l, workingRoot) = factory.hpcConditioner.getLogger(options)
    logger = tutils.tardisLogger(l)

    # log msg_for_log if we have one
    if msg_for_log is not None:
        logger.info(msg_for_log)

    say("tardis.py : logging this session to %s"%workingRoot)
    logger.info("tardis options : " + str(options))

    c = factory.hpcConditioner(logger, workingRoot, options, toolargs)
    c.options = options
    c.logWriter.info("tardis.py : logging this session to %s"%workingRoot)
    c.logWriter.info("tool args = %s"%str(toolargs))

    def no_errors():
        # re-evaluated at every use, as the states can change while we clean up
        return dcPrototype.getDataResultState() == data.dataConditioner.OK and \
               c.getJobResultState() == hpc.hpcJob.OK

    # create a prototype data conditioner. This won't actually do any data conditioning,
    # but will be used to induct subsequent conditioners, by passing on
    # shared objects
    dcPrototype = data.dataConditioner()
    dcPrototype.logWriter = logger
    dcPrototype.workingRoot = workingRoot
    dcPrototype.jobcontroller = c
    dcPrototype.logWriter.info("prototype dataConditioner created")
    dcPrototype.options = options

    conditionedCommandIter = c.getConditionedCommandGenerator(dcPrototype)
    conditionedInputGenerators = dcPrototype.getConditionedInputGenerators()

    for conditionedInputs in conditionedInputGenerators:
        dcPrototype.distributeAvailableInputs(conditionedInputs)
        cmd = next(conditionedCommandIter)
        c.logWriter.info("setting up job for conditioned command : %s"%str(cmd))
        job = c.gethpcJob(cmd)
        job.runCommand()

        # check for partially submitted jobs here in case we are rate limited - otherwise we will
        # have to wait until all chunks have been written. This will also do a wait on the jobs
        # that are running
        if c.hpcClass == local.localhpcJob:
            c.logWriter.info("(running jobs locally and there are %d partially submitted jobs)"%len(c.getUnsubmittedJobs()))
            if len(c.getUnsubmittedJobs()) > 0:
                c.retryJobSubmission(maxRetries = 1, retryPause = 1)

    # for some hpc types (e.g. slurm array jobs) , runCommand does not actually run the command, it
    # just sets up the command. These are then all batch submitted here :
    c.launchArrayJobs()

    # if in a workflow, or conditioning output, and not a dry run , poll for results
    if (options["in_workflow"] or len(dcPrototype.outputUnconditioners) > 0) and not options["dry_run"]:
        c.logWriter.info("tardis.py : done setting up jobs - polling for results (and submitting any queued jobs)")

        # (if in a workflow and no unconditioners were specified, then a default one
        # will have been created) - results are sent to each output unconditioner
        for dc in dcPrototype.outputUnconditioners:
            # clear sent flag
            for job in c.jobList:
                job.sent = False

            poll_count = 0
            while True:
                poll_count += 1
                if poll_count * hpc.hpcJob.POLL_INTERVAL > hpc.hpcJob.POLL_DURATION:
                    raise tardisException("error in tardis.py session - bailing out as we have been hanging around waiting for output for far too long ! ")

                unsentJobs = [ job for job in c.jobList if not job.sent ]
                if len(unsentJobs) == 0:
                    break

                # retry jobs here in case we are rate limited
                if len(c.getUnsubmittedJobs()) > 0:
                    c.logWriter.info("(there are %d partially submitted jobs)"%len(c.getUnsubmittedJobs()))
                    c.retryJobSubmission(maxRetries = hpc.hpcJob.SUBMIT_RETRIES, retryPause = hpc.hpcJob.SUBMIT_RETRY_PAUSE)

                sent_count = 0 # count how many jobs just finished
                for unsentJob in unsentJobs:
                    unsentJob.sendAvailableOutput(dc.outputCollector, dc.productCollector)
                    if unsentJob.sent:
                        sent_count += 1

                # if no jobs just finished , wait for awhile , otherwise go back for more output immediately
                if sent_count == 0:
                    time.sleep(hpc.hpcJob.POLL_INTERVAL)

        c.logWriter.info("%s output unconditioners are unconditioning"%len(dcPrototype.outputUnconditioners))

        # uncondition all output
        for dc in dcPrototype.outputUnconditioners:
            dc.unconditionOutput()

        # only remove the conditioned output if we are in a workflow and we are not sampling and
        # no error state was set , and keep_conditioned_data is not set
        if options["in_workflow"] and options["samplerate"] is None and no_errors() \
                and not options["keep_conditioned_data"]:
            for dc in dcPrototype.outputUnconditioners:
                dc.logWriter.info("removing conditioned output")
                dc.removeConditionedOutput()
        else:
            c.logWriter.info("either not in workflow or sampling , or error state set , not removing conditioned output")

        # stream the output from all jobs to stdout of this job
        c.unconditionJobStreams(stdout, stderr)

        # do not uncondition input if sampling , or if options["keep_conditioned_data"] is set,
        # or if an error state has been set
        if options["samplerate"] is None and no_errors() and not options["keep_conditioned_data"]:
            c.logWriter.info("%s input conditioners are unconditioning the following files : %s"%
                             (len(dcPrototype.getDistinctInputConditioners()),
                              " , ".join([dc.inputFileName for dc in dcPrototype.getDistinctInputConditioners()])))
            c.logWriter.info("unconditioning input")
            for dc in dcPrototype.getDistinctInputConditioners():
                dc.removeConditionedInput()
        else:
            c.logWriter.info("not unconditioning input as either sampling was specified, or keep conditioned input was set, or error state is set due to a previous error")
    else:
        c.logWriter.info("tardis.py : not in a workflow and no output unconditioners (or this is a dry run) - exiting")

    # report the outcome. The same descriptions are reported whether or not
    # there were errors - only the summary line and exit code differ
    if no_errors():
        exit_code = 0
        summary = "tardis.py : done logging this session to %s , no errors detected"%workingRoot
    else:
        exit_code = 2
        summary = "tardis.py : done logging this session to %s. NOTE : some errors were logged"%workingRoot
    c.logWriter.info(summary)
    say(summary)

    for describe in (c.getJobResultStateDescription, dcPrototype.getDataResultStateDescription):
        if len(describe()) > 0:
            c.logWriter.info(describe())
            say(describe())
            # NOTE(review): the stderr copy is written regardless of "quiet" here -
            # confirm this matches the intended nesting of the original print statements
            stderr.write("%s\n" % (describe(),))

    return (exit_code, c)
def launchArrayJobs(self):
    """
    This is only applicable to slurm jobs (it returns immediately for any
    other hpc class). It is called after the job scripts have all been created.
    One or more array jobs are launched (more than one if the number of jobs
    is greater than SLURM_MAXARRAYSIZE). The array job looks roughly like this :

        #!/bin/bash -e
        #SBATCH -J $tardis_job_moniker
        #SBATCH -A $tardis_account_moniker   # Project Account
        #SBATCH --time=20:00:00              # Walltime
        #SBATCH --ntasks=1                   # number of parallel processes
        #SBATCH --ntasks-per-socket=1        # number of processes allowed on a socket
        #SBATCH --cpus-per-task=4            # number of threads per process
        #SBATCH --hint=multithread           # enable hyperthreading
        #SBATCH --mem-per-cpu=8G
        #SBATCH --partition=inv-iranui       # Use nodes in the IRANUI partition
        #SBATCH --array=1-$array_size%50     # Iterate 1 to N, but only run up to 50 concurrent runs at once
        #SBATCH --error=$script-%A_%a.err
        #SBATCH --output=$script-%A_%a.out
        srun --cpu_bind=v,threads ${SLURM_ARRAY_TASK_ID}

    "slurm_array_job" is launched by sbatch , and internally launches a shim
    script, passing to it the index of the job to run. The shim then just
    executes run1.sh, run2.sh - which are instances of "slurm_shell".

    Files written to self.workingRoot : slurm_array_shim.sh, and one
    array_<start>-<stop>.slurm job file per batch of jobs. Unless
    self.options["dry_run"] is set, each job file is submitted with
    "sbatch -v".

    Raises tutils.tardisException if both a job template name and a job
    template file are configured, if the template file does not exist, if no
    template content could be obtained, or if sbatch returns a non-zero status.
    Raises KeyError if the LOGNAME environment variable is not set.
    """
    if self.hpcClass != slurm.slurmhpcJob:
        return

    # rwx for the owner, r-x for group and others
    executable_mode = stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH

    # write the slurm array shim to the working folder
    slurm_array_shim = string.Template(
        tutils.getTemplateContent(self.options, "slurm_array_shim", logWriter=self.logWriter))
    shimcode = slurm_array_shim.safe_substitute(hpcdir=self.workingRoot)
    shim_file_name = os.path.join(self.workingRoot, "slurm_array_shim.sh")
    with open(shim_file_name, "w") as shim_file:
        self.logWriter.info("hpcConditioner : writing array shim")
        shim_file.write(shimcode)
    os.chmod(shim_file_name, executable_mode)

    # write one or more array job files
    # figure out a job template from the options. (You can specify one of the inbuilt templates
    # by name, or supply a file containing a custom template). If nothing supplied at all we
    # use the default slurm array job template
    job_template_name = self.options.get("job_template_name", None)
    job_template_filename = self.options.get("jobtemplatefile", None)

    if job_template_name is None and job_template_filename is None:
        job_template_name = "default_slurm_array_job"

    if job_template_name is not None and job_template_filename is not None:
        raise tutils.tardisException(
            "error both job_template_name (%s) and job_template_filename (%s) defined - only define one of these"
            % (job_template_name, job_template_filename))

    if job_template_name is not None:
        job_template = tutils.getTemplateContent(self.options, job_template_name, logWriter=self.logWriter)
    else:
        if not os.path.isfile(job_template_filename):
            raise tutils.tardisException(
                "error job template file %s not found" % job_template_filename)
        with open(job_template_filename, "r") as template_file:
            job_template = template_file.read()

    if job_template is None:
        raise tutils.tardisException(
            "hpcConditioner: Error job template is null after templating")
    job_template = string.Template(job_template)

    # launch the jobs in batches of at most SLURM_MAXARRAYSIZE
    n_launched = 0
    while n_launched < len(self.jobList):
        n_launch = min(SLURM_MAXARRAYSIZE, len(self.jobList) - n_launched)
        array_start = n_launched + 1
        array_stop = n_launched + n_launch

        arraycode = job_template.safe_substitute(
            tardis_job_moniker=self.toolargv[0],
            tardis_account_moniker=os.environ['LOGNAME'],
            array_start=str(array_start),
            array_stop=str(array_stop),
            hpcdir=self.workingRoot)
        array_jobfile_name = os.path.join(
            self.workingRoot, "array_%d-%d.slurm" % (array_start, array_stop))
        with open(array_jobfile_name, "w") as array_jobfile:
            self.logWriter.info("hpcConditioner : writing array job %s" % array_jobfile_name)
            array_jobfile.write(arraycode)
        os.chmod(array_jobfile_name, executable_mode)

        # launch if we need to
        if self.options["dry_run"]:
            self.logWriter.info("slurmhpcJob : this is a dry run - not launching the job")
        else:
            slurm_submit = ["sbatch", "-v", array_jobfile_name]
            self.logWriter.info("slurmhpcJob : launching using %s" % str(slurm_submit))
            proc = subprocess.Popen(slurm_submit, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            (stdout, stderr) = proc.communicate()
            submitreturncode = proc.returncode

            if submitreturncode == 0:
                self.logWriter.info(
                    "slurmhpcJob : %s has returned (status %s) - here is its output (but now we wait for the real output !)"
                    % (str(slurm_submit), submitreturncode))
            else:
                self.logWriter.info(
                    "slurmhpcJob : error %s has returned status %s !)"
                    % (str(slurm_submit), submitreturncode))
            self.logWriter.info("slurmhpcJob : stdout : \n%s" % stdout)
            self.logWriter.info("slurmhpcJob : stderr : \n%s" % stderr)
            if submitreturncode != 0:
                self.logWriter.info(
                    "slurmhpcJob : giving up, the array job spec may have bugs ?")
                raise tutils.tardisException("hpcConditioner : %s" % stderr)

        n_launched += n_launch
def _slow_get_conditioned_filenames(caller, filename1, argchunksize, outdir, informat = "text", outformat = "text",
                                    samplerate = None, filename2 = None, pairBond = None,
                                    listfilename1 = None, listfilename2 = None, length_bounds = (None, None),
                                    record_filter_func = None, from_record = None, to_record = None):
    """
    A generator. Split up a generic or structured text file, and return
    fragments as they become available, via yield. Structures supported are
    fasta, fastq (and text).

    Based on an original stand-alone script "slice_fastq.py" - updated to
    support paired fastq files and brought "in-house" to tardis, and
    implemented as a generator so that we can get chunknames and launch jobs,
    as the chunks become available.

    Yields : ((inputfilename1, inputfilename2), [fragmentname1, fragmentname2])
    (The input names are returned as well, so that consumers of this generator
    know which original name each fragment relates to. If listfilename1 /
    listfilename2 are given they are reported in place of filename1 /
    filename2.) The first element of each pair relates to the first file. The
    second element is None if there was only one file to process, otherwise it
    relates to the second file, which is split in synch with the first.
    If argchunksize is 0, a single ((filename1, filename2), (None, None)) is
    yielded and nothing is written.

    Arguments :
      caller             object providing logWriter.info(...) and error(...)
      argchunksize       number of logical records per chunk, before any
                         adjustment for sampling
      outdir             folder the chunk files are written to
      informat/outformat "fasta", "fastq" or "text"
      samplerate         if not None, passed to tutils.getSampleBool to decide
                         whether each record is kept; the chunksize is scaled by it
      pairBond           a function (usually a lambda) applied to each pair of
                         records from filename1 and filename2, when processing
                         two files. It tests whether they are in synch - e.g.
                         for paired fastq files "lambda x,y: x.name == y.name".
                         It should return True if a pair of records are in
                         synch, or False if not (which is unrecoverable - e.g.
                         it may indicate an upstream error in trimming of
                         paired read files)
      length_bounds      (lower, upper) record lengths to keep; None = unbounded
      record_filter_func if given, applied to every input record; its return
                         value is used in place of the record
      from_record, to_record  inclusive slice of input record numbers to keep

    Data and argument problems (unsupported format, chunk file already exists,
    pair bonding failure, ValueError while reading records) are reported via
    caller.error and end the generator. Raises tutils.tardisException if more
    than MAX_DIMENSION chunks would be produced.
    """
    # named indexes
    LOWER = 0
    UPPER = 1

    # if chunksize zero yield empty chunknames and stop
    # (generators are ended with return : raising StopIteration inside a
    # generator is turned into a RuntimeError by PEP 479)
    if argchunksize == 0:
        yield ((filename1, filename2), (None, None))
        return

    # some arg checks
    if filename2 is None and pairBond is not None:
        caller.logWriter.info(
            "getConditionedFilenames : warning pairBond function ignored, no second file")

    caller.logWriter.info("getConditionedFilenames : conditioning %s to %s chunksize %d informat %s outformat %s samplerate %s from %s to %s file2 %s"%(
        filename1, outdir, argchunksize, informat, outformat, samplerate, from_record, to_record, filename2))

    chunknames1 = []
    chunknames2 = []

    # adjust chunksize if we are sampling
    chunksize = argchunksize
    if samplerate is not None:
        chunksize = int(.5 + samplerate * argchunksize)
        if argchunksize > 0 and chunksize == 0:
            caller.error(
                "error - chunksize was rounded to zero after adjusting for sampling - please specify a chunksize which ignores your sampling rate (it will be adjusted later)")
            return

    # open infiles
    (infile1, uncompressedName1) = textDataConditioner.getUncompressedFilestream(filename1)
    infile2 = None
    if filename2 is not None:
        (infile2, uncompressedName2) = textDataConditioner.getUncompressedFilestream(filename2)

    chunk = 1
    chunksYieldedCount = 0
    chunkname1 = os.path.basename(uncompressedName1)
    if filename2 is not None:
        chunkname2 = os.path.basename(uncompressedName2)

    # set up iterators over structured input records
    iter1 = None
    iter2 = None
    if informat in ("fastq", "fasta"):
        from Bio import SeqIO
        iter1 = SeqIO.parse(infile1, informat)
        if infile2 is not None:
            iter2 = SeqIO.parse(infile2, informat)
    elif informat == "text":
        iter1 = infile1
        if infile2 is not None:
            iter2 = infile2
    else:
        caller.error("unsupported input file format : %s" % informat)
        caller.logWriter.info("unsupported input file format : %s" % informat)
        return

    # if we have a record filter, make iterators to apply this
    # ( currently only support a single filter - i.e. can't specify a different one for each pair)
    if record_filter_func is not None:
        caller.logWriter.info("inserting filter function")
        iter1 = (record_filter_func(unfiltered) for unfiltered in iter1)
        if iter2 is not None:
            iter2 = (record_filter_func(unfiltered) for unfiltered in iter2)

    # if there are two iterators zip them up to make an iterator over paired input. Else
    # make a paired iterator with the second iterator being a dummy repeat returning None
    lazy_zip = getattr(itertools, "izip", zip)  # izip on python 2, the (lazy) builtin on python 3
    if iter1 is not None and iter2 is not None:
        piter = lazy_zip(iter1, iter2)
    else:
        piter = lazy_zip(iter1, itertools.repeat(None))

    # the names reported to the consumer for each chunk
    source_names = (filename1 if listfilename1 is None else listfilename1,
                    filename2 if listfilename2 is None else listfilename2)

    def chunk_path(basename, chunk_number):
        # e.g. reads.fastq , 3 --> <outdir>/reads.00003.fastq
        name_parts = os.path.splitext(basename)
        return os.path.join(outdir, "%s.%05d%s" % (name_parts[0], chunk_number, name_parts[1]))

    def log_last_records():
        caller.logWriter.info(
            "the last sequences encountered before the error were : %s, %s" % (record1, record2))

    # output !
    output_count = 0
    input_count = 0
    outfile1 = None
    outfile2 = None
    record1 = None
    record2 = None
    try:
        try:
            for (record1, record2) in piter:
                input_count += 1

                # will sample the output if needed
                sampleBool = tutils.getSampleBool(samplerate)  # 1 or 0 (always 1 if not sampling)

                # will length-filter the output if needed.
                if length_bounds != (None, None):
                    for check_record in (record1, record2):
                        if check_record is None:
                            continue
                        if length_bounds[LOWER] is not None and len(check_record) < length_bounds[LOWER]:
                            sampleBool = 0
                        if length_bounds[UPPER] is not None and len(check_record) > length_bounds[UPPER]:
                            sampleBool = 0

                # will slice the file(s) if required
                if from_record is not None and input_count < from_record:
                    sampleBool = 0
                if to_record is not None and input_count > to_record:
                    sampleBool = 0

                if sampleBool != 1:
                    continue

                output_count += sampleBool
                if chunksize > 0:
                    mychunk = 1 + int(output_count / (1.0 * chunksize))
                else:
                    mychunk = chunk

                # open a chunkfile if we need one
                if outfile1 is None:
                    outfilename1 = chunk_path(chunkname1, chunk)
                    if os.path.exists(outfilename1):
                        caller.error(
                            "getConditionedFilenames : error - %s already exists" % outfilename1)
                        log_last_records()
                        return
                    outfile1 = open(outfilename1, "w")
                    chunknames1.append(outfilename1)

                    if filename2 is not None:
                        outfilename2 = chunk_path(chunkname2, chunk)
                        if os.path.exists(outfilename2):
                            caller.error(
                                "getConditionedFilenames : error - %s already exists" % outfilename2)
                            log_last_records()
                            return
                        outfile2 = open(outfilename2, "w")
                        chunknames2.append(outfilename2)

                # if two files, check pair-bonding and if OK output both records
                # NOTE(review): with two files but no pairBond function only the first
                # file's records are written - confirm this is intended
                if outfile1 is not None and outfile2 is not None and pairBond is not None:
                    if not pairBond(record1, record2):
                        caller.error(
                            "pair bonding error - %s does not bond with %s" % (str(record1), str(record2)))
                        log_last_records()
                        return
                    if outformat in ("fasta", "fastq"):
                        outfile1.write(record1.format(outformat))
                        outfile2.write(record2.format(outformat))
                    else:
                        outfile1.write(record1)
                        outfile2.write(record2)
                elif outfile1 is not None:
                    if outformat in ("fasta", "fastq"):
                        outfile1.write(record1.format(outformat))
                    else:
                        outfile1.write(record1)

                # if need a new chunk, close and yield the old one (if there is one)
                if mychunk > chunk:
                    chunkInfo = [None, None]
                    if outfile1 is not None:
                        outfile1.close()
                        outfile1 = None
                        chunkInfo[0] = outfilename1
                    if outfile2 is not None:
                        outfile2.close()
                        outfile2 = None
                        chunkInfo[1] = outfilename2

                    yield (source_names, chunkInfo)
                    chunksYieldedCount += 1

                    chunk = mychunk
                    if chunk > MAX_DIMENSION:
                        caller.error(
                            "error - too many chunks - please adjust chunksize to yield no more than %d chunks" % MAX_DIMENSION)
                        log_last_records()
                        raise tutils.tardisException(
                            "error - too many chunks - please adjust chunksize to yield no more than %d chunks" % MAX_DIMENSION)

        # handle exceptions that relate to problems with the data so we can report
        # where we are, then bail out.
        except ValueError as e:
            caller.error(e)
            log_last_records()
            return

        # the input is exhausted : a chunk that was opened but not yet filled has
        # not been yielded above, so close and yield it now
        if len(chunknames1) > chunksYieldedCount and outfile1 is not None:
            chunkInfo = [outfilename1, None]
            outfile1.close()
            outfile1 = None
            if outfile2 is not None:
                outfile2.close()
                outfile2 = None
                chunkInfo[1] = outfilename2
            yield (source_names, chunkInfo)
            chunksYieldedCount += 1
    finally:
        # do not leak chunk file handles, whichever way we leave
        for unclosed in (outfile1, outfile2):
            if unclosed is not None:
                unclosed.close()
def advance_chunk(caller, chunk, total_chunks_in, batonfile1, batonfile2,
                  chunkbase1, chunkbase2, outdir, chunk_info):
    """
    Poll the baton file(s) written by the chunk-writer to find out how many
    chunks were written in total.

    caller          : object whose logWriter.info() is used for progress and warnings
    chunk           : the chunk number being asked for
    total_chunks_in : the total number of chunks if already known, otherwise None
    batonfile1      : path of the first baton file
    batonfile2      : path of the second baton file, or None if there is only one
    chunkbase1, chunkbase2, outdir, chunk_info : not referenced by the code shown here

    The first record of a baton file is split on "=" and the second field is
    read as the (integer) total number of chunks.

    Raises StopIteration if total_chunks_in is known and chunk is greater than it.
    Raises tutils.tardisException after 50 failed attempts to parse a baton file.

    NOTE(review): the polling loop below has no exit other than the exception
    path - the rest of the loop body (waiting for / returning the chunk) is not
    part of the code available here and needs to be confirmed against the
    complete source.
    """
    # give up after this many failed attempts to parse a baton file
    max_baton_read_failures = 50

    # if total_chunks has been set, and we are being asked
    # for a chunk number greater than this, raise StopIteration
    total_chunks = total_chunks_in
    if total_chunks is not None:
        if chunk > total_chunks:
            raise StopIteration

    # in a wait loop, poll
    wait_duration = 0  # not used by the code shown here (see NOTE above)
    exception_count = 0  # shared across both baton files
    while True:
        # if we find either baton file, try reading the total number of chunks
        # from it if we haven't already obtained this
        for batonfile in (batonfile1, batonfile2):
            if total_chunks is not None:
                break
            if batonfile is None or not os.path.isfile(batonfile):
                continue
            try:
                with open(batonfile, "r") as bf:
                    for record in bf:
                        total_chunks = int(re.split("=", record.strip())[1])
                        caller.logWriter.info(
                            "%d chunks in total were written (according to %s)" %
                            (total_chunks, batonfile))
                        break  # only the first record is of interest
            except Exception as e:
                # this could happen if we try to read the baton file at the same time as
                # the chunk-writer writes it. No action needed - will get it
                # next pass. But if we fail too many times give up.
                caller.logWriter.info(
                    "warning - exception (%s) reading batonfile %s" %
                    (str(e), batonfile))
                exception_count += 1
                if exception_count >= max_baton_read_failures:
                    caller.logWriter.info(
                        "error - too many failed attempts to parse batonfile %s - giving up"
                        % batonfile)
                    raise tutils.tardisException(
                        "error - too many failed attempts to parse batonfile %s - giving up"
                        % batonfile)
def _fast_get_conditioned_filenames(caller, filename1, argchunksize, outdir, informat, outformat, samplerate,filename2,\ listfilename1 = None, listfilename2 = None, length_bounds = (None, None) , from_record = None, to_record = None): """ A generator. See also below, _slow_get_conditioned_filenames. This was the original version. Cloned and hacked to make this version. This method forks and calls one or subprocesses to do the actual split - the parent polls for the split files. This means that e.g. record_filter_func is not supported as this method does not have access to sequence objects. Other record oriented filters (e.g. by length , from - to, samplerate) are planned to be supported but are not yet """ #named indexes LOWER = 0 UPPER = 1 # if chunksize zero yield empty chunknames and stop if argchunksize == 0: yield ((filename1, filename2), (None, None)) raise StopIteration caller.logWriter.info("_fast_get_conditioned_filenames : conditioning %s to %s chunksize %d informat %s outformat %s samplerate %s from %s to %s file2 %s"%(filename1, outdir, \ argchunksize , informat, outformat, samplerate, from_record, to_record , filename2)) chunknames1 = [] chunknames2 = [] # (we don't adjust chunksize if we are sampling as kseq does it) chunksize = argchunksize # set various filenames that will be needed uncompressedName1 = textDataConditioner.getUncompressedBaseName(filename1) batonfile1 = os.path.join( outdir, "%s.chunk_stats$" % os.path.basename(uncompressedName1)) chunkbase1 = os.path.basename(uncompressedName1) name_parts = os.path.splitext(chunkbase1) chunktemplate1 = os.path.join(outdir, name_parts[0] + ".%05d" + name_parts[1]) uncompressedName2 = None batonfile2 = None chunkbase2 = None chunktemplate2 = None if filename2 is not None: uncompressedName2 = textDataConditioner.getUncompressedBaseName( filename2) batonfile2 = os.path.join( outdir, "%s.chunk_stats$" % os.path.basename(uncompressedName2)) chunkbase2 = os.path.basename(uncompressedName2) name_parts = 
os.path.splitext(chunkbase2) chunktemplate2 = os.path.join(outdir, name_parts[0] + ".%05d" + name_parts[1]) split_logfile = os.path.join(outdir, "split_processing.log") # fork a process to kick off split of file 1 try: split_command = [ "kseq_split", "-f", batonfile1, "-o", outformat, filename1, str(chunksize), chunktemplate1 ] if samplerate is not None: split_command = [ "kseq_split", "-f", batonfile1, "-o", outformat, "-s", str(samplerate), filename1, str(chunksize), chunktemplate1 ] caller.logWriter.info( "fast input conditioner forking split process : %s" % " ".join(split_command)) me = os.fork() if me == 0: # child mypid = os.getpid() # kick off the splitting process and wait for it to finish proc = subprocess.Popen(split_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) (stdout, stderr) = proc.communicate() with open(split_logfile, "a") as l: print >> l, "child process %d started split subprocess %d" % ( mypid, proc.pid) print >> l, "split subprocess stdout : \n%s" % stdout print >> l, "split subprocess stderr : \n%s" % stderr print >> l, "split subprocess %d terminated with return value %d" % ( proc.pid, proc.returncode) print >> l, "child process %d exiting with status %s" % ( mypid, proc.returncode) sys.exit(proc.returncode) except OSError, e: caller.logWriter.info( "fast input conditioner : error - fork of %s failed with OSError : %s" % (" ".join(split_command), e)) raise tutils.tardisException( "fast input conditioner : error - fork of %s failed with OSError : %s" % (" ".join(split_command), e))
"fast input conditioner : error - fork of %s failed with OSError : %s" % (" ".join(split_command), e)) raise tutils.tardisException( "fast input conditioner : error - fork of %s failed with OSError : %s" % (" ".join(split_command), e)) # parent if filename2 != None: try: split_command = [ "kseq_split", "-f", batonfile2, "-o", outformat, filename2, str(chunksize), chunktemplate2 ] if samplerate is not None: raise tutils.tardisException( "this input conditioner does not support paired random sampling ! - should not be executing this code block !?" ) caller.logWriter.info( "fast input conditioner forking split process : %s" % " ".join(split_command)) me = os.fork() if me == 0: # child mypid = os.getpid() # kick off the splitting process and wait for it to finish proc = subprocess.Popen(split_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) (stdout, stderr) = proc.communicate()
def runCommand(self, argCommand=None):
    """
    Run this job on the local machine : write a wrapper shell script for the
    command (first submission only), then - if fewer than max_processes jobs
    are running - fork a child which executes the script and exits.

    argCommand : the command to run (a list). If None, self.command is used.
                 NOTE(review): only used for the emptiness check and for logging -
                 the wrapper script is always built from self.command.

    Nothing is done if the command is empty. Returns None in all cases ; the
    outcome is recorded on the instance :
        self.jobHeld          True if the job was not launched (too many jobs
                              running, or the fork failed), False otherwise
        self.submitCount      incremented when the script is set up and again
                              (in the parent) when a child is forked
        self.submitreturncode set to 0 in the parent after a successful fork
        self.workerList       gets an entry (0, 0) keyed by the child pid

    Raises tutils.tardisException if the wrapper script already exists.
    """
    command = argCommand
    if argCommand is None:
        command = self.command

    if len(command) > 0:
        self.logWriter.info("localhpcJob : running %s" % str(command))

        # set up the shell scriptfile(s) (one per chunk) (unless this is a rerun in which case its already been done)
        if self.submitCount == 0:
            runtime_environmentcode = self.runtime_config_template.safe_substitute()  # currently no templating actually done here
            shellcode = self.shell_script_template.safe_substitute(
                configure_runtime_environment=runtime_environmentcode,
                hpcdir=self.workingRoot,
                command=" ".join(self.command),
                startdir=self.controller.options["startdir"],
                input_conditioning=str(self.controller.options["input_conditioning"]))

            self.scriptfilename = os.path.join(self.workingRoot, "run%d.sh" % self.jobNumber)
            if os.path.isfile(self.scriptfilename):
                raise tutils.tardisException("error %s already exists" % self.scriptfilename)
            with open(self.scriptfilename, "w") as f:
                self.logWriter.info(
                    "localhpcJob : local shell script wrapper is %s" % self.scriptfilename)
                f.write(shellcode)
            os.chmod(
                self.scriptfilename,
                stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

            self.stdoutfilename = "%s.stdout" % self.scriptfilename
            self.stderrfilename = "%s.stderr" % self.scriptfilename
            self.stdoutnamepattern = os.path.basename(self.stdoutfilename)
            self.stderrnamepattern = os.path.basename(self.stderrfilename)
            self.logname = "%s.log" % self.scriptfilename

            # marks the set-up as done, so that a held job which is re-run does
            # not try to re-create its script.
            # NOTE(review): assumed to belong to this first-submission block - confirm.
            self.submitCount += 1

        # launch the job if we can.
        # we can launch the job if jobs running < max_processes
        # first update process statuses
        self.waitOnChildren()
        running_processes = self.getRunningProcesses()
        if len(running_processes) < self.controller.options["max_processes"]:
            self.logWriter.info("localhpcJob : launching %s" % self.scriptfilename)
            self.jobHeld = False
        else:
            self.logWriter.info(
                "localhpcJob : not launching %s (jobs_running = %s)" %
                (self.scriptfilename, str(running_processes)))
            self.jobHeld = True
            return

        local_submit = [self.scriptfilename]
        if self.controller.options["dry_run"]:
            self.logWriter.info(
                "localhpcJob : this is a dry run - not launching the job")
        else:
            self.logWriter.info("localhpcJob : forking to execute %s" % str(local_submit))
            self.jobHeld = False

            # now fork - if we are the parent, return, if we are the child execute the job and then exit.
            # Only the fork itself is guarded by the OSError handler, so that an OSError
            # raised while the child is doing its work is not mistaken for a failed fork.
            try:
                me = os.fork()
            except OSError as e:
                self.logWriter.info(
                    "localhpcJob : warning - fork of %s failed with OSError : %s" %
                    (self.scriptfilename, e))
                self.logWriter.info("localhpcJob : job %s held " % self.scriptfilename)
                self.jobHeld = True
                return

            if me == 0:
                # child : run the script, record what happened, and always exit - the
                # child must never carry on into the code path of the parent
                try:
                    mypid = os.getpid()
                    with open(self.logname, "w") as l:
                        l.write("job starting pid %d\n" % mypid)
                    with open(self.stdoutfilename, "w") as fstdout:
                        with open(self.stderrfilename, "w") as fstderr:
                            self.proc = subprocess.Popen(local_submit, stdout=fstdout,
                                                         stderr=fstderr)
                            self.proc.communicate()
                    self.submitreturncode = self.proc.returncode
                    self.logWriter.info(
                        "localhpcJob : %s (pid %d) has returned (status %s)" %
                        (str(local_submit), os.getpid(), self.submitreturncode))
                    self.logWriter.info(
                        "localhpcJob : stdout was written to %s" % self.stdoutfilename)
                    self.logWriter.info(
                        "localhpcJob : stderr was written to %s" % self.stderrfilename)
                    self.logWriter.info("localhpcJob : child %d exiting" % os.getpid())
                    with open(self.logname, "a") as l:
                        l.write("pid %d job terminated return value %d\n" %
                                (os.getpid(), self.submitreturncode))
                except Exception as e:
                    self.logWriter.info(
                        "localhpcJob : error - child %d failed executing %s : %s" %
                        (os.getpid(), str(local_submit), e))
                    sys.exit(1)
                sys.exit(0)
            else:
                # parent : note the child (so that it can be waited on later) and return
                self.workerList[me] = (0, 0)
                self.submitCount += 1
                self.submitreturncode = 0
                self.logWriter.info("localhpcJob : parent returning")
                return
def get_templates(self, default_job_template_name, default_shell_template_name, default_runtime_config_template_name):
    """
    this method examines the run-time arguments supplied to tardis, and from these
    figures out a job template, shell template and runtime config template.

    The job template is used to create a job file for the scheduler (e.g. slurm) , for each job to be launched
    The shell template is used to create a wrapper shell (i.e. run1.sh, run2.sh etc) , for each task
    The runtime config template is used to generate source to be included in the wrapper shell - i.e.
    in run1.sh, run2.sh etc.

    The run time arguments examined by this method specify whether the user wants to

    a) use one of the named, hard-coded (in tutils.py) templates
    b) supply the name of a file containing templating
    c) don't supply either a or b , in which case a default is used.

    (if they specify both a and b, this is an error)

    The three arguments are the names of the default (named) job, shell and
    runtime config templates ; any of them may be None, meaning no default.

    Returns a tuple (job_template, shell_script_template, runtime_config_template)
    of string.Template objects, or (None, None, None) if self.controller.options is None.

    Raises tutils.tardisException if both a name and a file are given for a template,
    if neither is available, if a template file does not exist, or if the template
    content obtained is None.
    """

    def read_template_file(filename, description):
        # return the whole content of a template file (description is used in the error message)
        if not os.path.isfile(filename):
            raise tutils.tardisException("error %s file %s not found" % (description, filename))
        with open(filename, "r") as template_file:
            return template_file.read()

    (job_template, shell_script_template, runtime_config_template) = (None, None, None)

    options = self.controller.options
    if options is None:
        return (job_template, shell_script_template, runtime_config_template)

    # figure out a job template from the options. (You can specify one of the inbuilt templates by name, or
    # supply a file containing a custom template)
    job_template_name = options.get("job_template_name", None)
    job_template_filename = options.get("jobtemplatefile", None)

    # use default if there is one and its needed
    if default_job_template_name is not None:
        if job_template_name is None and job_template_filename is None:
            job_template_name = default_job_template_name

    # check we have at least named template or template file but not both
    # NOTE(review): this check is assumed to apply whether or not a default was supplied - confirm.
    if job_template_name is not None and job_template_filename is not None:
        raise tutils.tardisException("error both job_template_name (%s) and job_template_filename (%s) defined - only define one of these"%(job_template_name,job_template_filename) )
    elif job_template_name is None and job_template_filename is None:
        raise tutils.tardisException("error neither job_template_name nor job_template_filename are defined (and no default available")

    if job_template_name is not None:
        job_template = tutils.getTemplateContent(options, job_template_name, logWriter=self.logWriter)
    else:
        job_template = read_template_file(job_template_filename, "job template")

    if job_template is None:
        raise tutils.tardisException("hpcJob: Error job template is null after templating")
    job_template = string.Template(job_template)

    # figure out a shell template from the options. (You can specify one of the inbuilt templates by name, or
    # supply a file containing a custom template)
    shell_template_name = options.get("shell_template_name", None)
    shell_template_filename = options.get("shelltemplatefile", None)

    if shell_template_name is None and shell_template_filename is None:
        # use the default shell template
        shell_template_name = default_shell_template_name

    if shell_template_name is not None and shell_template_filename is not None:
        raise tutils.tardisException("error both shell_template_name (%s) and shell_template_filename (%s) defined - only define one of these"%(shell_template_name,shell_template_filename) )
    elif shell_template_name is None and shell_template_filename is None:
        # (the default was None as well - report this rather than failing when opening the file)
        raise tutils.tardisException("error neither shell_template_name nor shell_template_filename are defined (and no default available)")

    if shell_template_name is not None:
        shell_script_template = tutils.getTemplateContent(options, shell_template_name, logWriter=self.logWriter)
    else:
        shell_script_template = read_template_file(shell_template_filename, "shell template")

    if shell_script_template is None:
        raise tutils.tardisException("hpcJob : Error shell template is null after templating")
    shell_script_template = string.Template(shell_script_template)

    # figure out run-time configuration code (You can specify one of the inbuilt configs by name, or
    # supply a file containing a custom config)
    runtime_config_template_name = options.get("runtime_config_name", None)
    runtime_config_template_filename = options.get("runtimeconfigsourcefile", None)

    # use default if available and needed. Note this logic means that if you supply a run-time config, then the
    # default will not be used - so if for example the default loads a base env, then if you supply your own,
    # you will need to explicitly load the base before doing your own
    # this is based on the assumption that its easier to do than to undo
    if default_runtime_config_template_name is not None:
        if runtime_config_template_name is None and runtime_config_template_filename is None:
            # use the default - for example this might load a default conda env or load a default module (site dependent)
            runtime_config_template_name = default_runtime_config_template_name

    # don't want both named, and a file
    if runtime_config_template_name is not None and runtime_config_template_filename is not None:
        raise tutils.tardisException("error both runtime_config_template_name (%s) and runtime_config_template_filename (%s) defined - only define one of these"%(runtime_config_template_name,runtime_config_template_filename) )
    elif runtime_config_template_name is None and runtime_config_template_filename is None:
        # (no default either - report this rather than failing when opening the file)
        raise tutils.tardisException("error neither runtime_config_template_name nor runtime_config_template_filename are defined (and no default available)")

    if runtime_config_template_name is not None:
        runtime_config_template = tutils.getTemplateContent(options, runtime_config_template_name, logWriter=self.logWriter)
    else:
        runtime_config_template = read_template_file(runtime_config_template_filename, "runtime config")

    if runtime_config_template is None:
        raise tutils.tardisException("hpcJob : Error config template is null after templating")
    runtime_config_template = string.Template(runtime_config_template)

    return (job_template, shell_script_template, runtime_config_template)