def main():
    """
    KBase Convert task manager for converting between KBase objects.

    Step 1 - Run a converter to pull the source object and save the destination object.

    Args:
        workspace_service_url: URL for a KBase Workspace service where KBase objects
            are stored.
        ujs_service_url: URL for a User and Job State service to report task progress
            back to the user.
        shock_service_url: URL for a KBase SHOCK data store service for storing files
            and large reference data.
        handle_service_url: URL for a KBase Handle service that maps permissions from
            the Workspace to SHOCK for KBase types that specify a Handle reference
            instead of a SHOCK reference.
        source_workspace_name: The name of the source workspace.
        destination_workspace_name: The name of the destination workspace.
        source_object_name: The source object name.
        destination_object_name: The destination object name.
        source_kbase_type: The KBase Workspace type string that indicates the module
            and type of the object being created.
        destination_kbase_type: The KBase Workspace type string that indicates the
            module and type of the object being created.
        optional_arguments: This is a JSON string containing optional parameters that
            can be passed in for custom behavior per conversion.
        ujs_job_id: The job id from the User and Job State service that can be used
            to report status on task progress back to the user.
        job_details: This is a JSON string that passes in the script specific command
            line options for a given conversion type. The service pulls these config
            settings from a script config created by the developer of the conversion
            script and passes that into the AWE job that calls this script.
        working_directory: The working directory on disk where files can be created
            and will be cleaned when the job ends with success or failure.
        keep_working_directory: A flag to tell the script not to delete the working
            directory, which is mainly for debugging purposes.

    Returns:
        Literal return value is 0 for success and 1 for failure.

        Actual data output is one or more Workspace objects saved to a user's
        workspace.

    Authors:
        Matt Henderson, Gavin Price
    """
    logger = script_utils.stderrlogger(__file__, level=logging.DEBUG)
    logger.info("Executing KBase Convert tasks")

    # The docstring above doubles as the CLI help: parse_docs() extracts the
    # per-argument descriptions from the Args section, so the docstring text
    # and the add_argument names below must stay in sync.
    script_details = script_utils.parse_docs(main.__doc__)
    logger.debug(script_details["Args"])

    parser = script_utils.ArgumentParser(description=script_details["Description"],
                                         epilog=script_details["Authors"])
    # provided by service config
    parser.add_argument('--workspace_service_url',
                        help=script_details["Args"]["workspace_service_url"],
                        action='store', required=True)
    parser.add_argument('--ujs_service_url',
                        help=script_details["Args"]["ujs_service_url"],
                        action='store', required=True)

    # optional because not all KBase Workspace types contain a SHOCK or Handle reference
    parser.add_argument('--shock_service_url',
                        help=script_details["Args"]["shock_service_url"],
                        action='store', default=None)
    parser.add_argument('--handle_service_url',
                        help=script_details["Args"]["handle_service_url"],
                        action='store', default=None)

    # workspace info for pulling the data
    parser.add_argument('--source_workspace_name',
                        help=script_details["Args"]["source_workspace_name"],
                        action='store', required=True)
    parser.add_argument('--source_object_name',
                        help=script_details["Args"]["source_object_name"],
                        action='store', required=True)

    # workspace info for saving the data
    parser.add_argument('--destination_workspace_name',
                        help=script_details["Args"]["destination_workspace_name"],
                        action='store', required=True)
    parser.add_argument('--destination_object_name',
                        help=script_details["Args"]["destination_object_name"],
                        action='store', required=True)

    # the types that we are transforming between, currently assumed one to one
    parser.add_argument('--source_kbase_type',
                        help=script_details["Args"]["source_kbase_type"],
                        action='store', required=True)
    parser.add_argument('--destination_kbase_type',
                        help=script_details["Args"]["destination_kbase_type"],
                        action='store', required=True)

    # any user options provided, encoded as a json string
    parser.add_argument('--optional_arguments',
                        help=script_details["Args"]["optional_arguments"],
                        action='store', default='{}')

    # Used if you are restarting a previously executed job?
    parser.add_argument('--ujs_job_id',
                        help=script_details["Args"]["ujs_job_id"],
                        action='store', default=None, required=False)

    # config information for running the validate and transform scripts
    parser.add_argument('--job_details',
                        help=script_details["Args"]["job_details"],
                        action='store', default=None)

    # the working directory is where all the files for this job will be written,
    # and normal operation cleans it after the job ends (success or fail)
    parser.add_argument('--working_directory',
                        help=script_details["Args"]["working_directory"],
                        action='store', default=None, required=True)
    parser.add_argument('--keep_working_directory',
                        help=script_details["Args"]["keep_working_directory"],
                        action='store_true')

    # ignore any extra arguments
    args, unknown = parser.parse_known_args()

    # Auth token comes from the environment, not the command line.
    kb_token = os.environ.get('KB_AUTH_TOKEN')
    ujs = UserAndJobState(url=args.ujs_service_url, token=kb_token)

    # One estimated-completion timestamp, reused verbatim for every progress
    # update below.
    est = datetime.datetime.utcnow() + datetime.timedelta(minutes=3)
    if args.ujs_job_id is not None:
        ujs.update_job_progress(args.ujs_job_id, kb_token,
                                "KBase Data Convert started", 1,
                                est.strftime('%Y-%m-%dT%H:%M:%S+0000'))

    # parse all the json strings from the argument list into dicts
    # TODO had issues with json.loads and unicode strings, workaround was using simplejson and base64
    args.optional_arguments = simplejson.loads(base64.urlsafe_b64decode(args.optional_arguments))
    args.job_details = simplejson.loads(base64.urlsafe_b64decode(args.job_details))

    if not os.path.exists(args.working_directory):
        os.mkdir(args.working_directory)

    if args.ujs_job_id is not None:
        ujs.update_job_progress(args.ujs_job_id, kb_token,
                                "Converting from {0} to {1}".format(args.source_kbase_type, args.destination_kbase_type),
                                1, est.strftime('%Y-%m-%dT%H:%M:%S+0000'))

    # Step 1 : Convert the objects
    try:
        logger.info(args)

        # job_details["transform"] carries the script-config settings for this
        # conversion (see the docstring); merge the per-job parameters into it
        # before handing it to the task runner.
        convert_args = args.job_details["transform"]
        convert_args["optional_arguments"] = args.optional_arguments
        convert_args["working_directory"] = args.working_directory
        convert_args["workspace_service_url"] = args.workspace_service_url
        convert_args["source_workspace_name"] = args.source_workspace_name
        convert_args["source_object_name"] = args.source_object_name
        convert_args["destination_workspace_name"] = args.destination_workspace_name
        convert_args["destination_object_name"] = args.destination_object_name
        logger.info(convert_args)

        task_output = handler_utils.run_task(logger, convert_args)

        if task_output["stdout"] is not None:
            logger.debug("STDOUT : " + str(task_output["stdout"]))
        if task_output["stderr"] is not None:
            logger.debug("STDERR : " + str(task_output["stderr"]))
    except Exception, e:
        handler_utils.report_exception(logger,
                                       {"message": 'ERROR : Conversion from {0} to {1}'.format(args.source_kbase_type, args.destination_kbase_type),
                                        "exc": e,
                                        "ujs": ujs,
                                        "ujs_job_id": args.ujs_job_id,
                                        "token": kb_token,
                                        },
                                       {"keep_working_directory": args.keep_working_directory,
                                        "working_directory": args.working_directory})
        # NOTE(review): this runs even when args.ujs_job_id is None, in which
        # case complete_job is called with a None job id — confirm the UJS
        # client tolerates that. Also, the docstring promises exit code 1 on
        # failure but there is no sys.exit(1) here — confirm callers do not
        # rely on the exit code.
        ujs.complete_job(args.ujs_job_id, kb_token,
                         "Convert to {0} failed.".format(args.destination_workspace_name),
                         str(e), None)
def runJob(self, job):
    """ Run one distance-matrix job.

    Pipeline: (1) extract sequences from every input (Shock nodes and local
    paths) in a worker pool, (2) drop files that failed extraction and bail
    out if fewer than two remain, (3) sort each sequence file, (4) merge
    every pair of sorted files, (5) xz-compress all sorted and merged files,
    (6) compute the distance matrix, and (7) store the result CSV in Shock
    and mark the UJS job complete.

    @param job Job dictionary with 'id', 'config', 'context', and 'input' entries
    @return Nothing (completion and failure are reported through UJS / raised errors)
    """
    self.config = job['config']
    self.context = job['context']
    self.input = job['input']

    # Create a shock client and authenticate as the user.
    self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])

    # Create a user and job state client and authenticate as the user.
    ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

    # Create a process pool.
    self.pool = Pool(processes=int(self.config['num_pool_processes']))

    # Create a work directory for storing intermediate files.
    self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
    self._log(log.INFO, 'Job ' + job['id'] + ' running with work folder ' + self.jobDirectory)

    # Download input fasta files from Shock and extract sequences to work directory.
    # Every UJS progress update below is best-effort: a UJS hiccup must never
    # abort the job, hence the bare except/pass.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    sequenceList = []
    for nodeId in self.input['node_ids']:
        node = self.shockClient.get_node(nodeId)
        sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
        destFile = '%s.sequence' % (os.path.splitext(sourceFile)[0])
        if PairSeparator in destFile:
            # Check for pair separator string in file name and replace as needed.
            destFile = destFile.replace(PairSeparator, '-')
        sequenceList.append(destFile)
        args = dict()  # Needs to be scoped here so each process gets its own copy
        args['format'] = self.input['format']
        args['shockUrl'] = self.config['shock_url']
        args['auth'] = self.context['token']
        args['sequenceLen'] = self.input['sequence_length']
        args['minReads'] = self.input['min_reads']
        args['maxReads'] = self.input['max_reads']
        args['nodeId'] = nodeId
        args['sourceFile'] = sourceFile
        args['destFile'] = destFile
        result = self.pool.apply_async(extract_seq, (args,))
        resultList.append(result)
    # Shock-sourced extractions signal failure via a nonzero return value.
    for result in resultList:
        if result.get() != 0:
            self._cleanup()
            raise ExtractError("Error extracting sequences from input sequence file, result: %d" % (result.get()))
    # Extract sequences from inputs given as local file paths (no Shock node).
    for path in self.input['file_paths']:
        sourceFile = os.path.basename(path)
        destFile = '%s/%s.sequence' % (self.jobDirectory, os.path.splitext(sourceFile)[0])
        if PairSeparator in destFile:
            # Check for pair separator string in file name and replace as needed.
            destFile = destFile.replace(PairSeparator, '-')
        sequenceList.append(destFile)
        args = dict()  # Needs to be scoped here so each process gets its own copy
        args['format'] = self.input['format']
        args['shockUrl'] = self.config['shock_url']
        args['auth'] = self.context['token']
        args['sequenceLen'] = self.input['sequence_length']
        args['minReads'] = self.input['min_reads']
        args['maxReads'] = self.input['max_reads']
        args['nodeId'] = None
        args['sourceFile'] = path
        args['destFile'] = destFile
        result = self.pool.apply_async(extract_seq, (args,))
        resultList.append(result)
    # NOTE(review): resultList was not reset after the node loop, so the
    # already-checked Shock results are re-fetched here; get() on a finished
    # task just returns the cached result.
    for result in resultList:
        try:
            result.get()
        except Exception as e:
            self._cleanup()
            # NOTE(review): e.message is Python 2 only.
            raise ExtractError("Error extracting sequences from input sequence file: %s" % (e.message))

    # Confirm that each file met the criteria for sequence length and number of sequences.
    filesToRemove = list()
    for index in range(len(sequenceList)):
        # See if the file did not have the minimum number of sequences.
        if not os.path.exists(sequenceList[index]):
            filesToRemove.append(index)
            continue
        # See if the file has no data.
        if os.path.getsize(sequenceList[index]) == 0:
            self._cleanup()
            raise SeqLenError("Sequence file '%s' has no sequences" % (sequenceList[index]))
    filteredList = list()
    for index in range(len(sequenceList)):
        if index not in filesToRemove:
            filteredList.append(sequenceList[index])
    # Need at least two surviving files to form a pair.
    if len(filteredList) < 2:
        self._cleanup()
        raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

    # Sort the sequences.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    sortedList = []
    for sourceFile in filteredList:
        destFile = '%s.sorted' % (os.path.splitext(sourceFile)[0])
        sortedList.append(destFile)
        args = ['/usr/bin/sort', '--output=%s' % (destFile), sourceFile]
        result = self.pool.apply_async(run_command, (args,))
        resultList.append(result)
    for result in resultList:
        try:
            result.get()
        except CommandError as e:
            self._cleanup()
            raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

    # Create combined and sorted files.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    # combinations() snapshots sortedList when called, so the appends inside
    # the loop do not extend this iteration; they make the merged files part
    # of the compression step below.
    for p, q in combinations(sortedList, 2):
        pbase = os.path.basename(p)
        qbase = os.path.basename(q)
        dbase = '%s%s%s.sorted' % (os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
        destFile = os.path.join(self.jobDirectory, dbase)
        sortedList.append(destFile)
        args = ['/usr/bin/sort', '-m', '--output=%s' % (destFile), p, q]
        result = self.pool.apply_async(run_command, (args,))
        resultList.append(result)
    for result in resultList:
        try:
            result.get()
        except CommandError as e:
            self._cleanup()
            raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

    # Compress all sorted files.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    compressedList = []
    for sourceFile in sortedList:
        compressedList.append(sourceFile + '.xz')
        # '-9e' is xz's extreme preset (slower, better compression).
        if self.input['extreme']:
            level = '-9e'
        else:
            level = '-9'
        args = ['/usr/bin/xz', '--keep', level, '--no-warn', sourceFile]
        result = self.pool.apply_async(run_command, (args,))
        resultList.append(result)
    for result in resultList:
        try:
            result.get()
        except CommandError as e:
            self._cleanup()
            raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

    # Calculate the distance matrix.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
    except:
        pass
    csvFile = os.path.join(self.jobDirectory, '%s.csv' % (job['id']))
    self._cbdCalculator(compressedList, self.input['scale'], csvFile)

    # Store the output file in shock.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
    except:
        pass
    node = self.shockClient.create_node(csvFile, '')
    if not node['id']:
        # Shock let us down. Save the distance matrix in the work directory for possible recovery.
        os.rename(csvFile, '%s/%s.csv' % (self.config['work_folder_path'], job['id']))
        self._cleanup()
        raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")

    # Mark the job as complete.
    results = {'shocknodes': [node['id']], 'shockurl': self.config['shock_url']}
    ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
    self._log(log.INFO, 'Job ' + job['id'] + ' completed successfully')

    # Cleanup after ourselves.
    self._cleanup()
    return
## main loop args.opt_args = json.loads(args.opt_args) if 'uploader' not in args.opt_args: args.opt_args['uploader'] = {} args.opt_args['uploader']['file'] = args.otmp args.opt_args['uploader']['input'] = args.inobj_id args.opt_args['uploader']['jid'] = args.jid args.opt_args['uploader']['etype'] = args.etype uploader = Uploader(args) try: uploader.download_shock_data() except: if args.jid is not None: e, v, t = sys.exc_info()[:3] ujs.complete_job(args.jid, kb_token, 'Failed : data download from Shock\n', str(v), {}) else: traceback.print_exc(file=sys.stderr) exit(3) if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Data downloaded', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000')) try: uploader.validation_handler() except: if args.jid is not None: e, v = sys.exc_info()[:2] ujs.complete_job(args.jid, kb_token, 'Failed : data validation\n', str(v), {})
os.chdir(current_directory) except Exception, e: logger.debug("Caught exception during transformation step!") os.chdir(current_directory) if ujs_job_id is not None: error_object["status"] = "ERROR : Transformation from KBase type to External type failed - {0}".format(e.message)[:handler_utils.UJS_STATUS_MAX] error_object["error_message"] = traceback.format_exc() handler_utils.report_exception(logger, error_object, cleanup_details) ujs.complete_job(ujs_job_id, kb_token, "Transform from {0} failed.".format(workspace_name), traceback.format_exc(), None) sys.exit(1) else: logger.error("Conversion of data to workspace object") logger.error("Download from {0} failed.".format(workspace_name)) raise # Report progress on success of the download step if ujs_job_id is not None: ujs.update_job_progress(ujs_job_id, kb_token, "Workspace objects transformed to {0}".format(external_type)[:handler_utils.UJS_STATUS_MAX], 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000')) else: logger.info("Workspace objects transformed to {0}".format(external_type))
est = datetime.datetime.utcnow() + datetime.timedelta(minutes=3) if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Dispatched', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') ) # main loop args.opt_args = json.loads(args.opt_args) validator = Validator(args) try: validator.download_shock_data() except: e,v = sys.exc_info()[:2] if args.jid is not None: ujs.complete_job(args.jid, kb_token, 'Failed : data download from Shock\n{}:{}'.format(str(e),str(v)), str(e), {}) else: traceback.print_exc(file=sys.stderr) print sys.stderr, 'Failed : data download from Shock\n{}:{}'.format(str(e),str(v)) exit(3); if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Data downloaded', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') ) try: validator.validation_handler() except: e,v = sys.exc_info()[:2] if args.jid is not None: ujs.complete_job(args.jid, kb_token, 'Failed : data validation\n{}:{}'.format(str(e),str(v)), str(e), {}) else:
# if args.jid is not None: # e,v,t = sys.exc_info()[:3] # ujs.complete_job(args.jid, kb_token, 'Failed : data download from Workspace\n', str(v), {}) # else: # traceback.print_exc(file=sys.stderr) # exit(3); #if args.jid is not None: # ujs.update_job_progress(args.jid, kb_token, 'Data downloaded', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') ) try: downloader.download_handler() except: if args.jid is not None: e,v = sys.exc_info()[:2] ujs.complete_job(args.jid, kb_token, 'Failed : data conversion\n', str(v), {}) else: traceback.print_exc(file=sys.stderr) exit(4); if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Data converted', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') ) result = {} try: result = downloader.upload_to_shock() except: e,v = sys.exc_info()[:2] if args.jid is not None: ujs.complete_job(args.jid, kb_token, 'Failed : data upload to shock\n', str(v), {}) else:
if task_output["stdout"] is not None: logger.debug("STDOUT : " + str(task_output["stdout"])) if task_output["stderr"] is not None: logger.debug("STDERR : " + str(task_output["stderr"])) except Exception, e: if args.ujs_job_id is not None: error_object["status"] = "ERROR : Conversion between KBase Types failed - {0}".format(e.message)[:handler_utils.UJS_STATUS_MAX] error_object["error_message"] = traceback.format_exc() handler_utils.report_exception(logger, error_object, cleanup_details) ujs.complete_job(args.ujs_job_id, kb_token, "Convert from {0} failed.".format(args.source_workspace_name), traceback.format_exc(), None) sys.exit(1) else: logger.error("Conversion between workspace objects failed") logger.error("Convert from {0} failed.".format(args.source_workspace_name)) raise # Report progress on the overall task being completed if args.ujs_job_id is not None: ujs.complete_job(args.ujs_job_id, kb_token, "Convert to {0} completed".format(args.destination_workspace_name), None,
# if args.jid is not None: # e,v,t = sys.exc_info()[:3] # ujs.complete_job(args.jid, kb_token, 'Failed : data download from Workspace\n', str(v), {}) # else: # traceback.print_exc(file=sys.stderr) # exit(3); #if args.jid is not None: # ujs.update_job_progress(args.jid, kb_token, 'Data downloaded', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') ) try: downloader.download_handler() except: if args.jid is not None: e, v = sys.exc_info()[:2] ujs.complete_job(args.jid, kb_token, 'Failed : data conversion\n', str(v), {}) else: traceback.print_exc(file=sys.stderr) exit(4) if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Data converted', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000')) result = {} try: result = downloader.upload_to_shock() except: e, v = sys.exc_info()[:2] if args.jid is not None: ujs.complete_job(args.jid, kb_token,
if task_output["stderr"] is not None: logger.debug("STDERR : " + str(task_output["stderr"])) except Exception, e: if args.ujs_job_id is not None: error_object[ "status"] = "ERROR : Conversion between KBase Types failed - {0}".format( e.message)[:handler_utils.UJS_STATUS_MAX] error_object["error_message"] = traceback.format_exc() handler_utils.report_exception(logger, error_object, cleanup_details) ujs.complete_job( args.ujs_job_id, kb_token, "Convert from {0} failed.".format( args.source_workspace_name), traceback.format_exc(), None) sys.exit(1) else: logger.error("Conversion between workspace objects failed") logger.error("Convert from {0} failed.".format( args.source_workspace_name)) raise # Report progress on the overall task being completed if args.ujs_job_id is not None: ujs.complete_job( args.ujs_job_id, kb_token, "Convert to {0} completed".format( args.destination_workspace_name), None, { "shocknodes": [],
if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Dispatched', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000')) # main loop args.opt_args = json.loads(args.opt_args) validator = Validator(args) try: validator.download_shock_data() except: e, v = sys.exc_info()[:2] if args.jid is not None: ujs.complete_job( args.jid, kb_token, 'Failed : data download from Shock\n{}:{}'.format( str(e), str(v)), str(e), {}) else: traceback.print_exc(file=sys.stderr) print sys.stderr, 'Failed : data download from Shock\n{}:{}'.format( str(e), str(v)) exit(3) if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Data downloaded', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000')) try: validator.validation_handler() except: e, v = sys.exc_info()[:2]
def runJob(self, job):
    """ Run one distance-matrix job.

    Pipeline: (1) extract sequences from every input (Shock nodes and local
    paths) in a worker pool, (2) drop files that failed extraction and bail
    out if fewer than two remain, (3) sort each sequence file, (4) merge
    every pair of sorted files, (5) xz-compress all sorted and merged files,
    (6) compute the distance matrix, and (7) store the result CSV in Shock
    and mark the UJS job complete.

    @param job Job dictionary with 'id', 'config', 'context', and 'input' entries
    @return Nothing (completion and failure are reported through UJS / raised errors)
    """
    self.config = job['config']
    self.context = job['context']
    self.input = job['input']

    # Create a shock client and authenticate as the user.
    self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])

    # Create a user and job state client and authenticate as the user.
    ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

    # Create a process pool.
    self.pool = Pool(processes=int(self.config['num_pool_processes']))

    # Create a work directory for storing intermediate files.
    self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
    self._log(log.INFO, 'Job ' + job['id'] + ' running with work folder ' + self.jobDirectory)

    # Download input fasta files from Shock and extract sequences to work directory.
    # Every UJS progress update below is best-effort: a UJS hiccup must never
    # abort the job, hence the bare except/pass.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    sequenceList = []
    for nodeId in self.input['node_ids']:
        node = self.shockClient.get_node(nodeId)
        sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
        destFile = '%s.sequence' % (os.path.splitext(sourceFile)[0])
        if PairSeparator in destFile:
            # Check for pair separator string in file name and replace as needed.
            destFile = destFile.replace(PairSeparator, '-')
        sequenceList.append(destFile)
        args = dict()  # Needs to be scoped here so each process gets its own copy
        args['format'] = self.input['format']
        args['shockUrl'] = self.config['shock_url']
        args['auth'] = self.context['token']
        args['sequenceLen'] = self.input['sequence_length']
        args['minReads'] = self.input['min_reads']
        args['maxReads'] = self.input['max_reads']
        args['nodeId'] = nodeId
        args['sourceFile'] = sourceFile
        args['destFile'] = destFile
        result = self.pool.apply_async(extract_seq, (args,))
        resultList.append(result)
    # Shock-sourced extractions signal failure via a nonzero return value.
    for result in resultList:
        if result.get() != 0:
            self._cleanup()
            raise ExtractError("Error extracting sequences from input sequence file, result: %d" % (result.get()))
    # Extract sequences from inputs given as local file paths (no Shock node).
    for path in self.input['file_paths']:
        sourceFile = os.path.basename(path)
        destFile = '%s/%s.sequence' % (self.jobDirectory, os.path.splitext(sourceFile)[0])
        if PairSeparator in destFile:
            # Check for pair separator string in file name and replace as needed.
            destFile = destFile.replace(PairSeparator, '-')
        sequenceList.append(destFile)
        args = dict()  # Needs to be scoped here so each process gets its own copy
        args['format'] = self.input['format']
        args['shockUrl'] = self.config['shock_url']
        args['auth'] = self.context['token']
        args['sequenceLen'] = self.input['sequence_length']
        args['minReads'] = self.input['min_reads']
        args['maxReads'] = self.input['max_reads']
        args['nodeId'] = None
        args['sourceFile'] = path
        args['destFile'] = destFile
        result = self.pool.apply_async(extract_seq, (args,))
        resultList.append(result)
    # NOTE(review): resultList was not reset after the node loop, so the
    # already-checked Shock results are re-fetched here; get() on a finished
    # task just returns the cached result.
    for result in resultList:
        try:
            result.get()
        except Exception as e:
            self._cleanup()
            # NOTE(review): e.message is Python 2 only.
            raise ExtractError("Error extracting sequences from input sequence file: %s" % (e.message))

    # Confirm that each file met the criteria for sequence length and number of sequences.
    filesToRemove = list()
    for index in range(len(sequenceList)):
        # See if the file did not have the minimum number of sequences.
        if not os.path.exists(sequenceList[index]):
            filesToRemove.append(index)
            continue
        # See if the file has no data.
        if os.path.getsize(sequenceList[index]) == 0:
            self._cleanup()
            raise SeqLenError("Sequence file '%s' has no sequences" % (sequenceList[index]))
    filteredList = list()
    for index in range(len(sequenceList)):
        if index not in filesToRemove:
            filteredList.append(sequenceList[index])
    # Need at least two surviving files to form a pair.
    if len(filteredList) < 2:
        self._cleanup()
        raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

    # Sort the sequences.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    sortedList = []
    for sourceFile in filteredList:
        destFile = '%s.sorted' % (os.path.splitext(sourceFile)[0])
        sortedList.append(destFile)
        args = ['/usr/bin/sort', '--output=%s' % (destFile), sourceFile]
        result = self.pool.apply_async(run_command, (args,))
        resultList.append(result)
    for result in resultList:
        try:
            result.get()
        except CommandError as e:
            self._cleanup()
            raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

    # Create combined and sorted files.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    # combinations() snapshots sortedList when called, so the appends inside
    # the loop do not extend this iteration; they make the merged files part
    # of the compression step below.
    for p, q in combinations(sortedList, 2):
        pbase = os.path.basename(p)
        qbase = os.path.basename(q)
        dbase = '%s%s%s.sorted' % (os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
        destFile = os.path.join(self.jobDirectory, dbase)
        sortedList.append(destFile)
        args = ['/usr/bin/sort', '-m', '--output=%s' % (destFile), p, q]
        result = self.pool.apply_async(run_command, (args,))
        resultList.append(result)
    for result in resultList:
        try:
            result.get()
        except CommandError as e:
            self._cleanup()
            raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

    # Compress all sorted files.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
    except:
        pass
    resultList = []
    compressedList = []
    for sourceFile in sortedList:
        compressedList.append(sourceFile + '.xz')
        # '-9e' is xz's extreme preset (slower, better compression).
        if self.input['extreme']:
            level = '-9e'
        else:
            level = '-9'
        args = ['/usr/bin/xz', '--keep', level, '--no-warn', sourceFile]
        result = self.pool.apply_async(run_command, (args,))
        resultList.append(result)
    for result in resultList:
        try:
            result.get()
        except CommandError as e:
            self._cleanup()
            raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

    # Calculate the distance matrix.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
    except:
        pass
    csvFile = os.path.join(self.jobDirectory, '%s.csv' % (job['id']))
    self._cbdCalculator(compressedList, self.input['scale'], csvFile)

    # Store the output file in shock.
    try:
        ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
    except:
        pass
    node = self.shockClient.create_node(csvFile, '')
    if not node['id']:
        # Shock let us down. Save the distance matrix in the work directory for possible recovery.
        os.rename(csvFile, '%s/%s.csv' % (self.config['work_folder_path'], job['id']))
        self._cleanup()
        raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")

    # Mark the job as complete.
    results = {'shocknodes': [node['id']], 'shockurl': self.config['shock_url']}
    ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
    self._log(log.INFO, 'Job ' + job['id'] + ' completed successfully')

    # Cleanup after ourselves.
    self._cleanup()
    return
else: logger.info( "Data is in a KBase format and objects saved to {0}". format(workspace_name)) # Report progress on the overall task being completed if ujs_job_id is not None: ujs.complete_job( ujs_job_id, kb_token, "Upload to {0} completed".format( workspace_name)[:handler_utils.UJS_STATUS_MAX], None, { "shocknodes": [], "shockurl": shock_service_url, "workspaceids": [], "workspaceurl": workspace_service_url, "results": [{ "server_type": "Workspace", "url": workspace_service_url, "id": "{}/{}".format(workspace_name, object_name), "description": "description" }] }) else: logger.info("Upload to {0} completed".format(workspace_name)) # Almost done, remove the working directory if possible if not keep_working_directory: handler_utils.cleanup(logger, working_directory)
ujs.update_job_progress(ujs_job_id, kb_token, 'Data is in a KBase format and objects saved to {0}'.format(workspace_name)[:handler_utils.UJS_STATUS_MAX], 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000')) else: logger.info("Data is in a KBase format and objects saved to {0}".format(workspace_name)) # Report progress on the overall task being completed if ujs_job_id is not None: ujs.complete_job(ujs_job_id, kb_token, "Upload to {0} completed".format(workspace_name)[:handler_utils.UJS_STATUS_MAX], None, {"shocknodes" : [], "shockurl" : shock_service_url, "workspaceids" : [], "workspaceurl" : workspace_service_url, "results" : [{"server_type" : "Workspace", "url" : workspace_service_url, "id" : "{}/{}".format(workspace_name, object_name), "description" : "description"}]}) else: logger.info("Upload to {0} completed".format(workspace_name)) # Almost done, remove the working directory if possible if not keep_working_directory: handler_utils.cleanup(logger, working_directory) except Exception, e: if ujs is None or ujs_job_id is None:
# Report that the job has been dispatched, with an estimated completion time
# a few minutes out (reused for the progress update below).
est = datetime.datetime.utcnow() + datetime.timedelta(minutes=3)
if args.jid is not None:
    ujs.update_job_progress(args.jid, kb_token, 'Dispatched', 1,
                            est.strftime('%Y-%m-%dT%H:%M:%S+0000'))

# main loop
args.optional_arguments = json.loads(args.optional_arguments)

validator = Validator(args)
try:
    validator.validation_handler()
except:
    e, v = sys.exc_info()[:2]
    if args.jid is not None:
        # Tracked job: report the failure through UJS.
        ujs.complete_job(args.jid, kb_token,
                         'Failed : data validation\n{}:{}'.format(str(e), str(v)),
                         str(e), {})
    else:
        traceback.print_exc(file=sys.stderr)
        # BUG FIX: was "print sys.stderr, ...", which printed the file object
        # repr followed by the message to STDOUT; the Python 2 chevron form
        # actually writes the message to stderr.
        print >>sys.stderr, 'Failed : data validation\n{}:{}'.format(str(e), str(v))
    sys.exit(1)

# clean-up
if args.delete_working_directory:
    try:
        # str.format() on the path was a no-op; pass the path directly.
        shutil.rmtree(args.working_directory)
    except:
        pass  # best-effort cleanup; never fail a successful job over it

if args.jid is not None:
    # BUG FIX: was ujs.complete_job(args.ujs_job_id, ...), but this script's
    # job id argument is args.jid — as checked on the line above and used by
    # every other UJS call here; args.ujs_job_id would raise AttributeError.
    ujs.complete_job(args.jid, kb_token, 'Success', None, None)
sys.exit(0)
#! /usr/bin/python
"""Entry point for running a probabilistic annotation job.

Loads jobdata.json from the given job directory, runs the annotate worker on
it, and on any failure marks the job as failed in the User and Job State
service with the formatted traceback.
"""

import argparse
import sys
import os
import json
import traceback
from biokbase.probabilistic_annotation.Worker import ProbabilisticAnnotationWorker
from biokbase.userandjobstate.client import UserAndJobState

if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='pa-runjob')
    parser.add_argument('jobDirectory', help='path to job directory for the job',
                        action='store', default=None)
    args = parser.parse_args()

    # Load the job data written by the service that scheduled this job.
    # FIX: use a context manager so the file handle is closed instead of leaked.
    jobDataPath = os.path.join(args.jobDirectory, "jobdata.json")
    with open(jobDataPath, 'r') as jobDataFile:
        job = json.load(jobDataFile)

    # Run the job.
    try:
        worker = ProbabilisticAnnotationWorker()
        worker.runAnnotate(job)
    except Exception:
        # Mark the job as failed, reporting the traceback through UJS.
        # (The exception binding was unused; the traceback module has it all.)
        tb = traceback.format_exc()
        sys.stderr.write(tb)
        ujsClient = UserAndJobState(job['config']['userandjobstate_url'],
                                    token=job['context']['token'])
        ujsClient.complete_job(job['id'], job['context']['token'], 'failed', tb, {})

    # NOTE(review): exits 0 even on failure — presumably so the scheduler
    # treats the job as handled (failure is reported via UJS); confirm before
    # changing the exit code.
    exit(0)
## main loop args.opt_args = json.loads(args.opt_args) if 'uploader' not in args.opt_args: args.opt_args['uploader'] = {} args.opt_args['uploader']['file'] = args.otmp args.opt_args['uploader']['input'] = args.inobj_id args.opt_args['uploader']['jid'] = args.jid args.opt_args['uploader']['etype'] = args.etype uploader = Uploader(args) try: uploader.download_shock_data() except: if args.jid is not None: e,v,t = sys.exc_info()[:3] ujs.complete_job(args.jid, kb_token, 'Failed : data download from Shock\n', str(v), {}) else: traceback.print_exc(file=sys.stderr) exit(3); if args.jid is not None: ujs.update_job_progress(args.jid, kb_token, 'Data downloaded', 1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') ) try: uploader.validation_handler() except: if args.jid is not None: e,v = sys.exc_info()[:2] ujs.complete_job(args.jid, kb_token, 'Failed : data validation\n', str(v), {}) else: traceback.print_exc(file=sys.stderr)
# FIX: argparse and sys are used below but were never imported, so this script
# failed with NameError at the first line of the __main__ block.
# NOTE(review): this script duplicates pa-runjob elsewhere in the tree —
# consider keeping a single copy.
import argparse
import os
import sys
import json
import traceback

from biokbase.probabilistic_annotation.Worker import ProbabilisticAnnotationWorker
from biokbase.userandjobstate.client import UserAndJobState

if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='pa-runjob')
    parser.add_argument('jobDirectory', help='path to job directory for the job',
                        action='store', default=None)
    args = parser.parse_args()

    # Load the job description written by the server when the job was queued.
    # FIX: close the file handle deterministically with a context manager.
    jobDataPath = os.path.join(args.jobDirectory, "jobdata.json")
    with open(jobDataPath, 'r') as jobDataFile:
        job = json.load(jobDataFile)

    # Run the job.
    try:
        worker = ProbabilisticAnnotationWorker()
        worker.runAnnotate(job)
    except Exception:
        # Mark the job as failed in UJS; the traceback is the error detail.
        tb = traceback.format_exc()
        sys.stderr.write(tb)
        ujsClient = UserAndJobState(job['config']['userandjobstate_url'],
                                    token=job['context']['token'])
        ujsClient.complete_job(job['id'], job['context']['token'], 'failed', tb, {})

    # Exits 0 even on failure; the failure is reported via UJS above.
    exit(0)
def runAnnotate(self, job):
    ''' Run an annotate job to create a ProbAnno typed object.

        A ProbAnno typed object is created in four steps: (1) extract amino acid
        sequences from a Genome typed object to a fasta file, (2) run a BLAST search
        using the amino acid sequences against the subsystem BLAST database,
        (3) calculate annotation likelihood scores for each roleset implied by the
        functions of proteins in subsystems, and (4) save the likelihood scores
        to a ProbAnno typed object.

        The Job dictionary contains three main sections: (1) input parameters to
        the annotate() function, (2) context of server instance running the
        annotate() function, and (3) config variables of server.

        @param job Job dictionary created by server's annotate() function
        @return Nothing (although job is marked as complete)
    '''

    # The input parameters and user context for annotate() were stored in the job data for the job.
    input = job["input"]
    if input['verbose']:
        self.logger.set_log_level(log.DEBUG)
    self.ctx = job["context"]
    self.config = job['config']

    # Create a DataParser object for working with the static database files.
    self.dataParser = DataParser(self.config)

    # FIX: create the UJS client *before* the try block. It used to be created
    # inside the try; when an earlier step (e.g. the database file check) raised,
    # the complete_job() call after the try failed with a NameError instead of
    # marking the job as failed. If this constructor itself raises, the exception
    # propagates to the caller (pa-runjob), which reports it via its own client.
    ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.ctx['token'])

    status = None

    try:
        # Make sure the database files are available.
        self.dataParser.checkIfDatabaseFilesExist()

        # Make sure the job directory exists.
        workFolder = make_job_directory(self.config['work_folder_path'], job['id'])

        # Get the Genome object from the specified workspace. All progress
        # updates below are best-effort: a UJS hiccup must not fail the job.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'getting genome object', 1, timestamp(3600))
        except:
            pass
        wsClient = Workspace(self.config["workspace_url"], token=self.ctx['token'])
        genomeObjectId = make_object_identity(input["genome_workspace"], input["genome"])
        objectList = wsClient.get_objects( [ genomeObjectId ] )
        genomeObject = objectList[0]

        # Convert Genome object to fasta file.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'converting Genome object to fasta file', 1, timestamp(3600))
        except:
            pass
        fastaFile = self._genomeToFasta(input, genomeObject, workFolder)

        # Run blast using the fasta file.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'running blast', 1, timestamp(3600))
        except:
            pass
        blastResultFile = self._runBlast(input, fastaFile, workFolder)

        # Calculate roleset probabilities.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'calculating roleset probabilities', 1, timestamp(300))
        except:
            pass
        rolestringTuples = self._rolesetProbabilitiesMarble(input, blastResultFile, workFolder)

        # Build ProbAnno object and store in the specified workspace.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'building ProbAnno object', 1, timestamp(120))
        except:
            pass
        output = self._buildProbAnnoObject(input, genomeObject, blastResultFile, rolestringTuples, workFolder, wsClient)

        # Mark the job as done.
        status = "done"
        tb = None
        self._log(log.INFO, 'Job '+job['id']+' finished for genome '+input['genome']+' to probanno '+input['probanno'])

    except:
        # Any failure above lands here; record the traceback as the job's
        # error detail and flag the failure.
        tb = traceback.format_exc()
        sys.stderr.write('\n'+tb)
        status = "failed"
        self._log(log.ERR, 'Job '+job['id']+' failed for genome '+input['genome']+' to probanno '+input['probanno'])

    # Mark the job as complete with the given status.
    ujsClient.complete_job(job['id'], self.ctx['token'], status, tb, { })

    # Remove the temporary work directory (only on success and when not
    # running at a debug log level, so failed runs can be inspected).
    if self.logger.get_log_level() < log.DEBUG2 and status == 'done':
        try:
            shutil.rmtree(workFolder)
        except OSError:
            # For some reason deleting the directory was failing in production. Rather than have all jobs look like they failed
            # I catch and log the exception here (since the user still gets the same result if the directory remains intact)
            msg = 'Unable to delete temporary directory %s\n' %(workFolder)
            sys.stderr.write('WARNING: '+msg)
            self._log(log.WARNING, msg)
    return