def loadReadsSet(self):
    if hasattr(self.__class__, 'reads_set_ref'):
        return self.__class__.reads_set_ref
    pe_reads_ref = self.pe_reads_ref
    reads_set_name = 'TophatTestReadsSet'
    # create the set object
    reads_set_data = {
        'description': 'Reads Set for testing Bowtie',
        'items': [
            {'ref': pe_reads_ref, 'label': 'rs1'},
            {'ref': pe_reads_ref, 'label': 'rs2'},
            {'ref': pe_reads_ref, 'label': 'rs3'}
        ]
    }
    # test a save
    set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
    res = set_api.save_reads_set_v1({
        'data': reads_set_data,
        'output_object_name': reads_set_name,
        'workspace': self.getWsName()
    })
    reads_set_ref = res['set_ref']
    print('Loaded ReadsSet: ' + reads_set_ref)
    # cache on the class so repeated calls reuse the same set (matches the hasattr check above)
    self.__class__.reads_set_ref = reads_set_ref
    return reads_set_ref
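# A minimal sketch (test-helper style, helper name hypothetical) of reading back the set
# saved above with SetAPI.get_reads_set_v1, e.g. to assert the items round-trip in a test:
def checkReadsSet(self, reads_set_ref):
    set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
    fetched = set_api.get_reads_set_v1({'ref': reads_set_ref, 'include_item_info': 1})
    # three items were saved in loadReadsSet(), each pointing at the same PE library
    assert len(fetched['data']['items']) == 3
    return fetched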
def __init__(self, config, ctx):
    self.ctx = ctx
    self.scratch = os.path.abspath(config['scratch'])
    self.ws_url = config['workspace-url']
    self.serviceWizardURL = config['srv-wiz-url']
    self.callbackURL = config['SDK_CALLBACK_URL']
    if not os.path.exists(self.scratch):
        os.makedirs(self.scratch)
    self.SE_flag = 'SE'
    self.PE_flag = 'PE'

    SERVICE_VER = 'release'

    # readsUtils_Client
    try:
        self.readsUtils_Client = ReadsUtils(self.callbackURL,
                                            token=self.ctx['token'],
                                            service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate readsUtils_Client with callbackURL: ' +
                         self.callbackURL + ' ERROR: ' + str(e))

    # setAPI_Client
    try:
        # SetAPI must be called as a dynamic service via the service wizard;
        # a local SDK callback URL does not work for SetAPI.
        self.setAPI_Client = SetAPI(url=self.serviceWizardURL, token=self.ctx['token'])
    except Exception as e:
        raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                         self.serviceWizardURL + ' ERROR: ' + str(e))
def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
    """
    Note: adapted from kbaseapps/kb_hisat2 - file_util.py

    From the given object ref, return a list of all reads objects that are a part of that
    object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or
    SingleEndLibrary refs that are a member of that ReadsSet. This is returned as a list
    of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object
    }
    The only key guaranteed to be present is "ref"; the other keys may or may not be
    present, depending on the reads object or the type of the initial ref. E.g. an
    RNASeqSampleSet might have condition info for each reads object, but a single
    PairedEndLibrary may not have that info.
    If ref is already a Reads library, just returns a list with ref as a single element.
    """
    obj_type = self.get_type_from_obj_info(info)
    refs = list()
    refs_for_ws_info = list()
    if "KBaseSets.ReadsSet" in obj_type or "KBaseRNASeq.RNASeqSampleSet" in obj_type:
        print("Looking up reads references in ReadsSet object")
        set_api = SetAPI(self.srv_wiz_url)
        reads_set = set_api.get_reads_set_v1({'ref': ref,
                                              'include_item_info': 0,
                                              'include_set_item_ref_paths': 1})
        for reads in reads_set["data"]["items"]:
            refs.append({'ref': reads['ref_path'],
                         'condition': reads['label']})
            refs_for_ws_info.append({'ref': reads['ref_path']})
    else:
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    # get object info so we can name things properly
    infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

    name_ext = '_alignment'
    if 'output_alignment_suffix' in validated_params \
            and validated_params['output_alignment_suffix'] is not None:
        ext = validated_params['output_alignment_suffix'].replace(' ', '')
        if ext:
            name_ext = ext

    unique_name_lookup = {}
    for k in range(0, len(refs)):
        refs[k]['info'] = infos[k]
        name = infos[k][1]
        if name not in unique_name_lookup:
            unique_name_lookup[name] = 1
        else:
            unique_name_lookup[name] += 1
            name = name + '_' + str(unique_name_lookup[name])
        name = name + name_ext
        refs[k]['alignment_output_name'] = name

    return refs
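# A minimal sketch (hypothetical helper names) of how the list returned by
# fetch_reads_refs_from_sampleset() above might be consumed: each entry carries at least
# 'ref', plus 'condition', 'info', and 'alignment_output_name' as populated there.
# 'run_single_alignment' is an assumed downstream method, not part of this module.
def align_all_reads(self, ref, info, validated_params):
    reads_refs = self.fetch_reads_refs_from_sampleset(ref, info, validated_params)
    results = []
    for reads in reads_refs:
        results.append(self.run_single_alignment(
            reads_ref=reads['ref'],
            condition=reads.get('condition', 'unspecified'),
            output_name=reads['alignment_output_name']))
    return results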
def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
    self.scratch_dir = scratch_dir
    self.rau = ReadsAlignmentUtils(callback_url)
    self.kbr = KBaseReport(callback_url)
    self.dfu = DataFileUtil(callback_url)
    self.set_api = SetAPI(srv_wiz_url)
    self.ws = Workspace(workspace_url)
    self.valid_commands = ['bamqc', 'multi-bamqc']
def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance):
    self.workspace_url = workspace_url
    self.callback_url = callback_url
    self.srv_wiz_url = srv_wiz_url
    self.au = AssemblyUtil(self.callback_url)
    self.dfu = DataFileUtil(self.callback_url, service_ver='beta')
    self.scratch = scratch_dir
    self.working_dir = scratch_dir
    self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch)
    self.provenance = provenance
    self.ws_client = Workspace(self.workspace_url)
    self.parallel_runner = KBParallel(self.callback_url)
    self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
    self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
    self.eu = ExpressionUtils(self.callback_url, service_ver='beta')
def setUpClass(cls):
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_Msuite'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{
            'service': 'kb_Msuite',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_Msuite(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']
    cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

    suffix = int(time.time() * 1000)
    cls.wsName = "test_kb_Msuite_" + str(suffix)
    cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

    cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
    cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'], service_ver='dev')
    cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])

    # stage an input and output directory
    """
    cls.input_dir = os.path.join(cls.scratch, 'input_1')
    cls.output_dir = os.path.join(cls.scratch, 'output_1')
    cls.all_seq_fasta = os.path.join(cls.scratch, 'all_seq.fna')
    shutil.copytree(os.path.join('data', 'example_out', 'input'), cls.input_dir)
    shutil.copytree(os.path.join('data', 'example_out', 'output'), cls.output_dir)
    shutil.copy(os.path.join('data', 'example_out', 'all_seq.fna'), cls.all_seq_fasta)
    """

    # prepare WS data
    cls.prepare_data()
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.shock_url = config['shock-url']
    self.srv_wiz_url = config['srv-wiz-url']
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(self.callback_url)
    self.ke_util = kb_ke_util(self.callback_url, service_ver="dev")
    self.gen_api = GenericsAPI(self.callback_url, service_ver="dev")
    self.ws = Workspace(self.ws_url, token=self.token)
    self.set_client = SetAPI(self.srv_wiz_url)
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.shock_url = config['shock-url']
    self.scratch = config['scratch']
    self.srv_wiz_url = config['srv-wiz-url']
    self.ws = Workspace(self.ws_url, token=self.token)
    self.bt = kb_Bowtie2(self.callback_url)
    self.rau = ReadsAlignmentUtils(self.callback_url)
    self.qualimap = kb_QualiMap(self.callback_url)
    self.ru = ReadsUtils(self.callback_url)
    self.dfu = DataFileUtil(self.callback_url)
    self.set_client = SetAPI(self.srv_wiz_url)
def __init__(self, config):
    self.ws_url = config["workspace-url"]
    self.callback_url = config['SDK_CALLBACK_URL']
    self.token = config['KB_AUTH_TOKEN']
    self.shock_url = config['shock-url']
    self.srv_wiz_url = config['srv-wiz-url']
    self.scratch = config['scratch']
    self.dfu = DataFileUtil(self.callback_url)
    self.gfu = GenomeFileUtil(self.callback_url)
    self.rau = ReadsAlignmentUtils(self.callback_url)
    self.au = AssemblyUtil(self.callback_url)
    self.eu = ExpressionUtils(self.callback_url, service_ver='dev')
    self.ws = Workspace(self.ws_url, token=self.token)
    self.set_client = SetAPI(self.srv_wiz_url)
def setUpClass(cls):
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('kb_kaiju'):
        cls.cfg[nameval[0]] = nameval[1]
    # Getting username from Auth profile for token
    authServiceUrl = cls.cfg['auth-service-url']
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({
        'token': token,
        'user_id': user_id,
        'provenance': [{
            'service': 'kb_kaiju',
            'method': 'please_never_use_it_in_production',
            'method_params': []
        }],
        'authenticated': 1
    })
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL)
    cls.serviceImpl = kb_kaiju(cls.cfg)
    cls.scratch = cls.cfg['scratch']
    cls.callback_url = os.environ['SDK_CALLBACK_URL']

    suffix = int(time.time() * 1000)
    cls.wsName = "test_kb_kaiju_" + str(suffix)
    cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

    cls.kaiju_runner = KaijuUtil(cls.cfg, cls.ctx)
    cls.ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])

    # prepare WS data
    cls.prepare_data()
def __init__(self, config, provenance):
    self.config = config
    self.workspace_url = config['workspace-url']
    self.callback_url = os.environ['SDK_CALLBACK_URL']
    self.scratch = config['scratch']
    self.srv_wiz_url = config['srv-wiz-url']
    self.parallel_runner = KBParallel(self.callback_url)
    self.provenance = provenance
    self.star_utils = STARUtils(self.scratch, self.workspace_url,
                                self.callback_url, self.srv_wiz_url, provenance)
    self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev')
    self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev')
    self.star_idx_dir = None
    self.star_out_dir = None

    # from the provenance, extract out the version to run by exact hash if possible
    self.my_version = 'release'
    if len(provenance) > 0:
        if 'subactions' in provenance[0]:
            self.my_version = self.get_version_from_subactions('kb_STAR',
                                                               provenance[0]['subactions'])
    print('Running kb_STAR version = ' + self.my_version)
def download_short_unpaired(self, console, token, wsname, short_unpaired_libraries):
    try:
        self.log(console, "Getting short unpaired reads.\n")
        ruClient = ReadsUtils(url=self.callbackURL, token=token)

        # first, unpack any ReadsSets into the actual SingleEndLibrary references
        reads_refs = []

        # object info
        try:
            wsClient = Workspace(self.workspaceURL, token=token)
        except Exception as e:
            raise ValueError("unable to instantiate wsClient. " + str(e))

        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I,
         WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple

        for lib in short_unpaired_libraries:
            try:
                obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
                lib_obj_info = wsClient.get_object_info_new({'objects': [obj_id]})[0]
                lib_obj_type = lib_obj_info[TYPE_I]
                # remove trailing version
                lib_obj_type = re.sub('-[0-9]+\.[0-9]+$', "", lib_obj_type)
                lib_ref = str(lib_obj_info[WSID_I]) + '/' + \
                    str(lib_obj_info[OBJID_I]) + '/' + str(lib_obj_info[VERSION_I])
                if lib_obj_type == 'KBaseSets.ReadsSet':
                    # unpack it
                    try:
                        setAPIClient = SetAPI(url=self.serviceWizardURL, token=token)
                        self.log(console, 'getting reads set ' + lib_ref)
                        readsSet = setAPIClient.get_reads_set_v1({'ref': lib_ref,
                                                                  'include_item_info': 1})
                    except Exception as e:
                        raise ValueError(
                            'SetAPI FAILURE: Unable to get read library set object: (' +
                            lib_ref + ')\n' + str(e))
                    for readsLibrary in readsSet['data']['items']:
                        reads_refs.append(readsLibrary['ref'])
                else:
                    # use other reads objects "as is"
                    reads_refs.append(lib_ref)
            except Exception as e:
                raise ValueError('Unable to get read library object: (' +
                                 str(lib) + ')' + str(e))

        result = ruClient.download_reads({'read_libraries': reads_refs,
                                          'interleaved': 'false'})

        # combine outputs
        short_unpaired_path = os.path.join(self.scratch,
                                           "short_unpaired_" + str(uuid.uuid4()) + ".fastq")
        self.log(console, "Combining short unpaired reads.\n")

        for reads_ref in reads_refs:
            files = result['files'][reads_ref]['files']
            if 'fwd' in files:
                path = files['fwd']
                if path.endswith('.gz'):
                    cmd = 'gzip -dc ' + path + ' >> ' + short_unpaired_path
                else:
                    cmd = 'cat ' + path + ' >> ' + short_unpaired_path
                self.log(console, "command: " + cmd)
                cmdProcess = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                              stderr=subprocess.STDOUT, shell=True)
                cmdProcess.wait()
                if cmdProcess.returncode != 0:
                    raise ValueError('Error running ' + cmd)
                os.remove(path)
            else:
                raise ValueError('File ' + reads_ref + ' missing forward reads file')

    except Exception as e:
        raise ValueError('Unable to download short unpaired reads\n' + str(e))

    return short_unpaired_path
def process_batch_result(self, batch_result, validated_params, reads, input_set_info):

    n_jobs = len(batch_result['results'])
    n_success = 0
    n_error = 0
    ran_locally = 0
    ran_njsw = 0

    # reads alignment set items
    items = []
    objects_created = []

    for k in range(0, len(batch_result['results'])):
        job = batch_result['results'][k]
        result_package = job['result_package']
        if job['is_error']:
            n_error += 1
        else:
            n_success += 1
            output_info = result_package['result'][0]['output_info']
            ra_ref = output_info['upload_results']['obj_ref']
            # Note: could add a label to the alignment here?
            items.append({'ref': ra_ref, 'label': reads[k]['condition']})
            objects_created.append({'ref': ra_ref})

        if result_package['run_context']['location'] == 'local':
            ran_locally += 1
        if result_package['run_context']['location'] == 'njsw':
            ran_njsw += 1

    # Save the alignment set
    alignment_set_data = {'description': '', 'items': items}
    alignment_set_save_params = {
        'data': alignment_set_data,
        'workspace': validated_params['output_workspace'],
        'output_object_name': str(input_set_info[1]) + validated_params['output_obj_name_suffix']
    }

    set_api = SetAPI(self.srv_wiz_url)
    save_result = set_api.save_reads_alignment_set_v1(alignment_set_save_params)
    print('Saved ReadsAlignment=')
    pprint(save_result)
    objects_created.append({'ref': save_result['set_ref'],
                            'description': 'Set of all reads alignments generated'})
    set_name = save_result['set_info'][1]

    # run qualimap
    qualimap_report = self.qualimap.run_bamqc({'input_ref': save_result['set_ref']})
    qc_result_zip_info = qualimap_report['qc_result_zip_info']

    # create the report
    report_text = 'Ran on SampleSet or ReadsSet.\n\n'
    report_text += 'Created ReadsAlignmentSet: ' + str(set_name) + '\n\n'
    report_text += 'Total ReadsLibraries = ' + str(n_jobs) + '\n'
    report_text += '    Successful runs = ' + str(n_success) + '\n'
    report_text += '        Failed runs = ' + str(n_error) + '\n'
    report_text += '   Ran on main node = ' + str(ran_locally) + '\n'
    report_text += 'Ran on remote worker = ' + str(ran_njsw) + '\n\n'

    print('Report text=')
    print(report_text)

    kbr = KBaseReport(self.callback_url)
    report_info = kbr.create_extended_report({
        'message': report_text,
        'objects_created': objects_created,
        'report_object_name': 'kb_Bowtie2_' + str(uuid.uuid4()),
        'direct_html_link_index': 0,
        'html_links': [{
            'shock_id': qc_result_zip_info['shock_id'],
            'name': qc_result_zip_info['index_html_file_name'],
            'label': qc_result_zip_info['name']
        }],
        'workspace_name': validated_params['output_workspace']
    })

    result = {'report_info': {'report_name': report_info['name'],
                              'report_ref': report_info['ref']}}
    result['batch_output_info'] = batch_result

    return result
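# A minimal sketch (values hypothetical) of the KBParallel batch_result shape that
# process_batch_result() above reads; only the fields it actually accesses are shown.
example_batch_result = {
    'results': [{
        'is_error': 0,
        'result_package': {
            'run_context': {'location': 'local'},  # or 'njsw' for a remote worker
            'result': [{
                'output_info': {'upload_results': {'obj_ref': '1/2/3'}}
            }]
        }
    }]
}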
class STARUtils: STAR_VERSION = 'STAR 2.5.3a' STAR_BIN = '/kb/deployment/bin/STAR' STAR_IDX_DIR = 'STAR_Genome_index' STAR_OUT_DIR = 'STAR_Output' PARAM_IN_WS = 'output_workspace' PARAM_IN_FASTA_FILES = 'genomeFastaFiles' PARAM_IN_OUTFILE_PREFIX = 'outFileNamePrefix' PARAM_IN_STARMODE = 'runMode' PARAM_IN_THREADN = 'runThreadN' PARAM_IN_READS_FILES = 'readFilesIn' PARAM_IN_READS = 'readsset_ref' PARAM_IN_GENOME = 'genome_ref' SET_READS = 'set_reads_refs' def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url, provenance): self.workspace_url = workspace_url self.callback_url = callback_url self.srv_wiz_url = srv_wiz_url self.au = AssemblyUtil(self.callback_url) self.dfu = DataFileUtil(self.callback_url, service_ver='beta') self.scratch = scratch_dir self.working_dir = scratch_dir self.prog_runner = Program_Runner(self.STAR_BIN, self.scratch) self.provenance = provenance self.ws_client = Workspace(self.workspace_url) self.parallel_runner = KBParallel(self.callback_url) self.qualimap = kb_QualiMap(self.callback_url, service_ver='dev') self.set_api_client = SetAPI(self.srv_wiz_url, service_ver='dev') self.eu = ExpressionUtils(self.callback_url, service_ver='beta') def process_params(self, params): """ process_params: checks params passed to run_star method and set default values """ log('Start validating run_star parameters') # check for required parameters if params.get(self.PARAM_IN_WS, None) is None: raise ValueError(self.PARAM_IN_WS + ' parameter is required') if params.get(self.PARAM_IN_STARMODE, None) is None: params[self.PARAM_IN_STARMODE] = 'alignReads' if params.get(self.PARAM_IN_GENOME, None) is None: raise ValueError(self.PARAM_IN_GENOME + ' parameter is required for generating genome index') if (params.get(self.PARAM_IN_STARMODE, None) is not None and params[self.PARAM_IN_STARMODE] != "genomeGenerate"): if params.get(self.PARAM_IN_READS, None) is None: raise ValueError(self.PARAM_IN_READS + ' parameter is required for reads mapping') if not valid_string(params[self.PARAM_IN_READS], is_ref=True): raise ValueError("Parameter readsset_ref must be a valid Workspace object reference, " "not {}".format(params.get(self.PARAM_IN_READS, None))) if params.get(self.PARAM_IN_THREADN, None) is not None: if not isinstance(params[self.PARAM_IN_THREADN], int): raise ValueError(self.PARAM_IN_HASH_THREADN + ' must be of type int') else: params[self.PARAM_IN_THREADN] = 2 if "alignment_suffix" not in params or not valid_string(params["alignment_suffix"]): raise ValueError("Parameter alignment_suffix must be a valid Workspace object string, " "not {}".format(params.get("alignment_suffix", None))) if params.get(self.PARAM_IN_OUTFILE_PREFIX, None) is not None: if params[self.PARAM_IN_OUTFILE_PREFIX].find('/') != -1: raise ValueError(self.PARAM_IN_OUTFILE_PREFIX + ' cannot contain subfolder(s).') else: params[self.PARAM_IN_OUTFILE_PREFIX] = 'star_' if params.get('create_report', None) is None: params['create_report'] = 0 return self._setDefaultParameters(params) def _setDefaultParameters(self, params_in): """set default for this group of parameters """ params = copy.deepcopy(params_in) if params.get('outFilterType', None) is None: params['outFilterType'] = "\"BySJout\"" if params.get('outFilterMultimapNmax', None) is None: params['outFilterMultimapNmax'] = 20 if params.get('outSAMtype', None) is None: params['outSAMtype'] = 'BAM' if params.get('outSAMattrIHstart', None) is None: params['outSAMattrIHstart'] = 0 if params.get('outSAMstrandField', None) is None: params['outSAMstrandField'] 
= 'intronMotif' if params.get('outFilterIntronMotifs', None) is None: params['outFilterIntronMotifs'] = 'RemoveNoncanonical' if params.get(self.SET_READS, None) is None: params[self.SET_READS] = self._get_reads_refs_from_setref(params) return params def _get_genome_gtf_file(self, gnm_ref, gtf_file_dir): """ Get data from genome object ref and return the GTF filename (with path) for STAR indexing and mapping. STAR uses the reference annotation to guide assembly and for creating alignment """ log("Converting genome {0} to GFF file in folder {1}".format(gnm_ref, gtf_file_dir)) gfu = GenomeFileUtil(self.callback_url) try: gfu_ret = gfu.genome_to_gff({self.PARAM_IN_GENOME: gnm_ref, 'is_gtf': 1, 'target_dir': gtf_file_dir }) except ValueError as egfu: log('GFU getting GTF file raised error:\n') pprint(egfu) return None else:#no exception raised return gfu_ret.get('file_path') def _construct_indexing_cmd(self, params): # STEP 1: construct the command for running `STAR --runMode genomeGenerate...` idx_cmd = [self.STAR_BIN] idx_cmd.append('--genomeDir') idx_cmd.append(params[self.STAR_IDX_DIR]) idx_cmd.append('--' + self.PARAM_IN_STARMODE) idx_cmd.append('genomeGenerate') idx_cmd.append('--' + self.PARAM_IN_THREADN) idx_cmd.append(str(params[self.PARAM_IN_THREADN])) if params.get(self.PARAM_IN_FASTA_FILES, None) is not None: idx_cmd.append('--' + self.PARAM_IN_FASTA_FILES) for fasta_file in params[self.PARAM_IN_FASTA_FILES]: idx_cmd.append(fasta_file) # STEP 2: append the standard optional inputs if params.get('sjdbGTFfile', None) is not None: idx_cmd.append('--sjdbGTFfile') idx_cmd.append(params['sjdbGTFfile']) if (params.get('sjdbOverhang', None) is not None and params['sjdbOverhang'] > 0): idx_cmd.append('--sjdbOverhang') idx_cmd.append(str(params['sjdbOverhang'])) #print ('STAR indexing CMD:') #print ' '.join(idx_cmd) return idx_cmd def _construct_mapping_cmd(self, params): if params.get(self.PARAM_IN_STARMODE, None) is None: params[self.PARAM_IN_STARMODE] = 'alignReads' # STEP 1: set the working folder housing the STAR output results as well as the reads info star_out_dir = '' if params.get('align_output', None) is None: star_out_dir = self.scratch else: star_out_dir = params['align_output'] # STEP 2: construct the command for running STAR mapping mp_cmd = [self.STAR_BIN] mp_cmd.append('--genomeDir') mp_cmd.append(params[self.STAR_IDX_DIR]) mp_cmd.append('--' + self.PARAM_IN_STARMODE) mp_cmd.append(params[self.PARAM_IN_STARMODE]) mp_cmd.append('--' + self.PARAM_IN_THREADN) mp_cmd.append(str(params[self.PARAM_IN_THREADN])) if params.get(self.PARAM_IN_READS_FILES, None) is not None: #print('Input reads files:\n' + pformat(params[self.PARAM_IN_READS_FILES])) mp_cmd.append('--' + self.PARAM_IN_READS_FILES) for reads_file in params[self.PARAM_IN_READS_FILES]: mp_cmd.append(reads_file) readName, readsExtension = os.path.splitext(reads_file) #print ('Reads file name-- {}/extension-- {}:'.format(readName, readsExtension)) if readsExtension == '.gz': mp_cmd.append('--readFilesCommand') mp_cmd.append('gunzip') mp_cmd.append('-c') if readsExtension == '.bz2': mp_cmd.append('--readFilesCommand') mp_cmd.append('bunzip2') mp_cmd.append('-c') # STEP 3: appending the advanced optional inputs mp_cmd.append('--' + self.PARAM_IN_OUTFILE_PREFIX) mp_cmd.append(os.path.join(star_out_dir, params[self.PARAM_IN_OUTFILE_PREFIX])) if params.get('sjdbGTFfile', None) is not None: mp_cmd.append('--sjdbGTFfile') mp_cmd.append(params['sjdbGTFfile']) if (params.get('sjdbOverhang', None) is not None and params['sjdbOverhang'] 
> 0): mp_cmd.append('--sjdbOverhang') mp_cmd.append(str(params['sjdbOverhang'])) if (params.get('outFilterType', None) is not None and isinstance(params['outFilterType'], str)): mp_cmd.append('--outFilterType') mp_cmd.append(params['outFilterType']) if (params.get('outFilterMultimapNmax', None) is not None and isinstance(params['outFilterMultimapNmax'], int) and params['outFilterMultimapNmax'] >= 0): mp_cmd.append('--outFilterMultimapNmax') mp_cmd.append(str(params['outFilterMultimapNmax'])) #output sorted file:Aligned.sortedByCoord.out.bam #allowed values of --outSAMtype are BAM Unsorted or SortedByCoordinate or both if params.get('outSAMtype', None) is not None: mp_cmd.append('--outSAMtype') mp_cmd.append(params['outSAMtype']) if params.get('outSAMtype', None) == 'BAM': mp_cmd.append('SortedByCoordinate') # 'It is recommended to remove the non-canonical junctions for Cnks runs using # --outFilterIntronMotifs RemoveNoncanonical' if params.get('outFilterIntronMotifs', None) is not None: mp_cmd.append('--outFilterIntronMotifs') mp_cmd.append('RemoveNoncanonical') if (params.get('outSAMattrIHstart', None) is not None and isinstance(params['outSAMattrIHstart'], int) and params['outSAMattrIHstart'] >= 0): mp_cmd.append('--outSAMattrIHstart') mp_cmd.append(str(params['outSAMattrIHstart'])) if (params.get('outSAMstrandField', None) is not None and isinstance(params['outSAMstrandField'], str)): mp_cmd.append('--outSAMstrandField') mp_cmd.append(params['outSAMstrandField']) quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"] if (params.get('quantMode', None) is not None and params.get('quantMode', None) in quant_modes): mp_cmd.append('--quantMode') if params['quantMode'] == "Both": mp_cmd.append("TranscriptomeSAM") mp_cmd.append("GeneCounts") else: mp_cmd.append(params['quantMode']) if (params.get('alignSJoverhangMin', None) is not None and isinstance(params['alignSJoverhangMin'], int) and params['alignSJoverhangMin'] > 0): mp_cmd.append('--alignSJoverhangMin') mp_cmd.append(str(params['alignSJoverhangMin'])) if (params.get('alignSJDBoverhangMin', None) is not None and isinstance(params['alignSJDBoverhangMin'], int) and params['alignSJDBoverhangMin'] > 0): mp_cmd.append('--alignSJDBoverhangMin') mp_cmd.append(str(params['alignSJDBoverhangMin'])) if (params.get('outFilterMismatchNmax', None) is not None and isinstance(params['outFilterMismatchNmax'], int) and params['outFilterMismatchNmax'] > 0): mp_cmd.append('--outFilterMismatchNmax') mp_cmd.append(str(params['outFilterMismatchNmax'])) if (params.get('alignIntronMin', None) is not None and isinstance(params['alignIntronMin'], int) and params['alignIntronMin'] > 0): mp_cmd.append('--alignIntronMin') mp_cmd.append(str(params['alignIntronMin'])) if (params.get('alignIntronMax', None) is not None and isinstance(params['alignIntronMax'], int) and params['alignIntronMax'] >= 0): mp_cmd.append('--alignIntronMax') mp_cmd.append(str(params['alignIntronMax'])) if (params.get('alignMatesGapMax', None) is not None and isinstance(params['alignMatesGapMax'], int) and params['alignMatesGapMax'] >= 0): mp_cmd.append('--alignMatesGapMax') mp_cmd.append(str(params['alignMatesGapMax'])) #print ('STAR mapping CMD:') #print ' '.join(mp_cmd) return mp_cmd def _exec_indexing(self, params): log('Running STAR index generating with params:\n' + pformat(params)) idx_cmd = self._construct_indexing_cmd(params) exitCode = self.prog_runner.run(idx_cmd, self.scratch) return exitCode def _exec_mapping(self, params): log('Running STAR mapping with params:\n' + 
pformat(params)) mp_cmd = self._construct_mapping_cmd(params) exitCode = self.prog_runner.run(mp_cmd, self.scratch) return exitCode def _exec_star_pipeline(self, params, rds_files, rds_name, idx_dir, out_dir): # build the parameters params_idx = self._get_indexing_params(params, idx_dir) params_mp = self._get_mapping_params(params, rds_files, rds_name, idx_dir, out_dir) # execute indexing and then mapping retVal = {} try: if params[self.PARAM_IN_STARMODE]=='genomeGenerate': ret = self._exec_indexing(params_idx) else: ret = 0 while( ret != 0 ): time.sleep(1) except ValueError as eidx: log('STAR genome indexing raised error:\n') pprint(eidx) else:#no exception raised by genome indexing and STAR returns 0, then run mapping params_mp[self.PARAM_IN_STARMODE] = 'alignReads' try: ret = self._exec_mapping(params_mp) while( ret != 0 ): time.sleep(1) except ValueError as emp: log('STAR mapping raised error:\n') pprint(emp) else:#no exception raised by STAR mapping and STAR returns 0, then move to saving and reporting ret = {'star_idx': star_idx, 'star_output': params_mp.get('align_output')} return ret def upload_STARalignment(self, input_params, reads_ref, reads_info, output_bam_file): """ Uploads the alignment file + metadata. Returns the STAR alignment reference. """ aligner_opts = dict() for k in input_params: aligner_opts[k] = str(input_params[k]) pprint(reads_info) alignment_name = reads_ref['alignment_output_name'] align_upload_params = { "destination_ref": "{}/{}".format(input_params[self.PARAM_IN_WS], alignment_name), "file_path": output_bam_file, "assembly_or_genome_ref": input_params[self.PARAM_IN_GENOME], "read_library_ref": reads_info['object_ref'], "library_type": reads_info['style'], "condition": reads_info['condition'], "aligned_using": 'STAR', "aligner_version":self.STAR_VERSION, "aligner_opts": aligner_opts } pprint(align_upload_params) ra_util = ReadsAlignmentUtils(self.callback_url, service_ver='beta') rau_upload_ret = ra_util.upload_alignment(align_upload_params) alignment_ref = rau_upload_ret["obj_ref"] print("STAR alignment uploaded as object {}".format(alignment_ref)) return rau_upload_ret def generate_report_for_single_run(self, run_output_info, params): input_ref = run_output_info['upload_results']['obj_ref'] index_dir = run_output_info['index_dir'] output_dir = run_output_info['output_dir'] output_files = self._generate_output_file_list(index_dir, output_dir) # first run qualimap qualimap_report = self.qualimap.run_bamqc({'input_ref': input_ref}) qc_result_zip_info = qualimap_report['qc_result_zip_info'] # create report report_text = 'Ran on a single reads library.\n\n' alignment_info = self.get_obj_infos(input_ref)[0] report_text = 'Created ReadsAlignment: ' + str(alignment_info[1]) + '\n' report_text += ' ' + input_ref + '\n' kbr = KBaseReport(self.callback_url) report_info = kbr.create_extended_report({'message': report_text, 'file_links': output_files, 'objects_created': [{'ref': input_ref, 'description': 'ReadsAlignment'}], 'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4()), 'direct_html_link_index': 0, 'html_links': [{'shock_id': qc_result_zip_info['shock_id'], 'name': qc_result_zip_info['index_html_file_name'], 'label': qc_result_zip_info['name']}], 'html_window_height': 366, 'workspace_name': params['output_workspace'] }) return report_info #{'report_name': report_info['name'], 'report_ref': report_info['ref']} def _get_reads_info(self, reads, readsSet_ref): ''' _get_reads_info:fetches the detailed info for each reads with ref in list reads_refs return an 
object of the following structure: { "style": "paired", "single", or "interleaved", "file_fwd": path_to_file, "name": name of the reads, "file_rev": path_to_file, only if paired end, "object_ref": reads reference for downstream convenience, "condition": the condition for the reads. } ''' try: print("Fetching FASTA file from reads reference {}".format(reads['ref'])) ret_reads_info = fetch_reads_from_reference(reads['ref'], self.callback_url) except ValueError: print("Incorrect object type for fetching a FASTA file!") raise if ret_reads_info.get("file_fwd", None) is None: raise RuntimeError("FASTA file fetched from reads {} doesn't seem to exist!".format(reads['ref'])) else: if reads.get('condition', None) is not None: ret_reads_info['condition'] = reads['condition'] else: ret_reads_info['condition'] = 'unspecified' if reads.get('object_ref', None) != readsSet_ref: ret_reads_info[self.PARAM_IN_READS] = readsSet_ref return ret_reads_info def _get_genome_fasta(self, gnm_ref): genome_fasta_files = list() if gnm_ref is not None: try: print("Fetching FASTA file from object {}".format(gnm_ref)) genome_fasta_file = fetch_fasta_from_object(gnm_ref, self.workspace_url, self.callback_url) print("Done fetching FASTA file! Path = {}".format(genome_fasta_file.get("path", None))) except ValueError: print("Incorrect object type for fetching a FASTA file!") raise if genome_fasta_file.get("path", None) is None: raise RuntimeError("FASTA file fetched from object {} doesn't seem exist!".format(gnm_ref)) else: genome_fasta_files.append(genome_fasta_file["path"]) return genome_fasta_files def convert_params(self, validated_params): """ Convert input parameters with KBase ref format into STAR parameters, and add the advanced options. """ params = copy.deepcopy(validated_params) params['runMode'] = 'genomeGenerate' if validated_params.get('create_report', None) is not None: params['create_report'] = validated_params['create_report'] if validated_params.get('concurrent_local_tasks', None) is not None: params['concurrent_local_tasks'] = validated_params['concurrent_local_tasks'] if validated_params.get('concurrent_njsw_tasks', None) is not None: params['concurrent_njsw_tasks'] = validated_params['concurrent_njsw_tasks'] if validated_params.get('alignmentset_suffix', None) is not None: params['alignmentset_suffix'] = validated_params['alignmentset_suffix'] # Add advanced options from validated_params to params sjdbGTFfile = validated_params.get("sjdbGTFfile", None) if sjdbGTFfile is not None: params['sjdbGTFfile'] = sjdbGTFfile else: params['sjdbGTFfile'] = self._get_genome_gtf_file( params[self.PARAM_IN_GENOME], os.path.join(self.scratch, self.STAR_IDX_DIR)) if validated_params.get('sjdbOverhang', None) is not None : params['sjdbOverhang'] = validated_params['sjdbOverhang'] else: params['sjdbOverhang'] = 100 quant_modes = ["TranscriptomeSAM", "GeneCounts", "Both"] if (validated_params.get('quantMode', None) is not None and validated_params.get('quantMode', None) in quant_modes): params['quantMode'] = validated_params['quantMode'] else: params['quantMode'] = 'Both' return params def _get_indexing_params(self, params, star_idx_dir): params_idx = { 'runMode': 'genomeGenerate', 'runThreadN': params[self.PARAM_IN_THREADN], self.STAR_IDX_DIR: star_idx_dir, 'genomeFastaFiles': params[self.PARAM_IN_FASTA_FILES] } if params.get('sjdbGTFfile', None) is not None: params_idx['sjdbGTFfile'] = params['sjdbGTFfile'] if params.get('sjdbOverhang', None) is not None : params_idx['sjdbOverhang'] = params['sjdbOverhang'] return 
params_idx def _get_mapping_params(self, params, rds_files, rds_name, idx_dir, out_dir): ''' build the mapping parameters''' aligndir = out_dir if rds_name: aligndir = os.path.join(out_dir, rds_name) self._mkdir_p(aligndir) #print '**********STAR output directory created:{}'.format(aligndir) params_mp = copy.deepcopy(params) params_mp['runMode'] = 'alignReads' params_mp['readFilesIn'] = rds_files params_mp[self.STAR_IDX_DIR] = idx_dir params_mp['align_output'] = aligndir return params_mp def determine_input_info(self, validated_params): ''' get info on the readsset_ref object and determine if we run once or run on a set input info provides information on the input and tells us if we should run as a single_library or as a set: input_info = {'run_mode': '', 'info': [..], 'ref': '55/1/2'} ''' info = self.get_obj_infos(validated_params[self.PARAM_IN_READS])[0] obj_type = self.get_type_from_obj_info(info) if obj_type in ['KBaseAssembly.PairedEndLibrary', 'KBaseAssembly.SingleEndLibrary', 'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary']: return {'run_mode': 'single_library', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]} if obj_type == 'KBaseRNASeq.RNASeqSampleSet': return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]} if obj_type == 'KBaseSets.ReadsSet': return {'run_mode': 'sample_set', 'info': info, 'ref': validated_params[self.PARAM_IN_READS]} raise ValueError('Object type of readsset_ref is not valid, was: ' + str(obj_type)) def determine_unique_reads_names(self, validated_params): infos = self.get_obj_infos(validated_params[self.PARAM_IN_READS]) return get_unique_names(infos) def get_type_from_obj_info(self, info): return info[2].split('-')[0] def get_name_from_obj_info(self, info): return info[1] def get_obj_infos(self, ref): return self.ws_client.get_object_info3({'objects': [{'ref': ref}]})['infos'] def get_object_names(self, ref_list): """ From a list of workspace references, returns a mapping from ref -> name of the object. """ obj_ids = list() for ref in ref_list: obj_ids.append({"ref": ref}) info = self.ws_client.get_object_info3({"objects": obj_ids}) name_map = dict() # we already have the refs as passed previously, so use those for mapping, as they're in # the same order as what's returned. 
for i in range(len(info["infos"])): name_map[ref_list[i]] = info["infos"][i][1] return name_map def _mkdir_p(self, dir): """ _mkdir_p: make directory for given path """ log('Creating a new dir: ' + dir) if not dir: return if not os.path.exists(dir): os.makedirs(dir) else: log('{} has existed, so skip creating.'.format(dir)) def create_star_dirs(self, star_home): '''creating the directories for STAR''' # the index directory idxdir = os.path.join(star_home, self.STAR_IDX_DIR) self._mkdir_p(idxdir) # the output directory outdir = os.path.join(star_home, self.STAR_OUT_DIR) self._mkdir_p(outdir) return (idxdir, outdir) def _get_reads_refs_from_setref(self, params): readsSet_ref = params[self.PARAM_IN_READS] reads_refs = list() try: #print("Fetching reads ref(s) from sample/reads set ref {}".format(readsSet_ref)) reads_refs = fetch_reads_refs_from_sampleset( readsSet_ref, self.workspace_url, self.callback_url, params) #print("\nDone fetching reads ref(s) from readsSet {}--\nDetails:\n".format(readsSet_ref)) #pprint(reads_refs) except ValueError: print("Incorrect object type for fetching reads ref(s)!") raise return reads_refs def _generate_output_file_list(self, idx_dir, out_dir): """ _generate_output_file_list: zip result files and generate file_links for report """ log('start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) star_index = os.path.join(output_directory, 'star_index.zip') star_output = os.path.join(output_directory, 'star_output.zip') self.zip_folder(idx_dir, star_index) self.zip_folder(out_dir, star_output) #star_index = self.zip_folder_withDFU(idx_dir, 'star_index') #star_output = self.zip_folder_withDFU(out_dir, 'star_output') output_files.append({'path': star_index, 'name': os.path.basename(star_index), 'label': os.path.basename(star_index), 'description': 'Index file(s) generated by STAR'}) output_files.append({'path': star_output, 'name': os.path.basename(star_output), 'label': os.path.basename(star_output), 'description': 'Output file(s) generated by STAR'}) return output_files def zip_folder_withDFU(self, folder_path, output_name): """Zip the contents of an entire folder (with that folder included in the archive). Empty subfolders will be included in the archive as well. """ output_path = self.dfu.pack_file( {'file_path': folder_path + '/' + output_name, 'pack': 'zip'})['file_path'] print "{} created successfully.".format(output_path) #with zipfile.ZipFile(output_path, "r") as f: #print 'Checking the zipped file......\n' #for info in f.infolist(): # print info.filename, info.date_time, info.file_size, info.compress_size #for fn in f.namelist(): #print fn return output_path def zip_folder(self, folder_path, output_path): """Zip the contents of an entire folder (with that folder included in the archive). Empty subfolders could be included in the archive as well if the commented portion is used. """ with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as ziph: for root, folders, files in os.walk(folder_path): # Include all subfolders, including empty ones. 
#for folder_name in folders: # absolute_path = os.path.join(root, folder_name) # relative_path = os.path.join(os.path.basename(root), folder_name) # print "Adding {} to archive.".format(absolute_path) # ziph.write(absolute_path, relative_path) for f in files: absolute_path = os.path.join(root, f) relative_path = os.path.join(os.path.basename(root), f) #print "Adding {} to archive.".format(absolute_path) ziph.write(absolute_path, relative_path) print "{} created successfully.".format(output_path) #with zipfile.ZipFile(output_path, "r") as f: # print 'Checking the zipped file......\n' # for info in f.infolist(): # print info.filename, info.date_time, info.file_size, info.compress_size def _generate_html_report(self, out_dir, obj_ref): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') star_obj = self.ws_client.get_objects2({'objects': [{'ref': obj_ref}]})['data'][0] star_obj_info = star_obj['info'] star_obj_data = star_obj['data'] star_obj_type = star_obj_info[2] Overview_Content = '' if re.match('KBaseRNASeq.RNASeqAlignment-\d.\d', star_obj_type): Overview_Content += '<br/><table><tr><th>Generated Alignment Object</th>' Overview_Content += '<th></th></tr>' Overview_Content += '<tr><th>Alignment Name</th><th>Condition</th></tr>' Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1],obj_ref) Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition']) Overview_Content += '</table>' elif (re.match('KBaseRNASeq.RNASeqAlignmentSet-\d.\d', star_obj_type) or re.match('KBaseSets.ReadsAlignmentSet-\d.\d', star_obj_type) or re.match('KBaseSet.RNASeqAlignmentSet-\d.\d', star_obj_type)): Overview_Content += '<br/><table><tr><th>Generated AlignmentSet Object</th></tr>' Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1],obj_ref) Overview_Content += '</td></tr></table>' Overview_Content += '<p><br/></p>' Overview_Content += '<table><tr><th>Generated Alignment Objects</th>' Overview_Content += '<th></th></tr>' Overview_Content += self._fill_html_trs('Alignment Name', star_obj_data) Overview_Content += '</table>' elif re.match('KBaseRNASeq.RNASeqExpression-\d.\d', star_obj_type): Overview_Content += '<br/><table><tr><th>Generated Expression Object</th>' Overview_Content += '<th></th></tr>' Overview_Content += '<tr><th>Expression Name</th><th>Condition</th></tr>' Overview_Content += '<tr><td>{} ({})</td>'.format(star_obj_info[1], obj_ref) Overview_Content += '<td>{}</td></tr>'.format(star_obj_data['condition']) Overview_Content += '</table>' elif re.match('KBaseSets.ExpressionSet-\d.\d', star_obj_type): Overview_Content += '<br/><table><tr><th>Generated ExpressionSet Object</th></tr>' Overview_Content += '<tr><td>{} ({})'.format(star_obj_info[1], obj_ref) Overview_Content += '</td></tr></table>' Overview_Content += '<p><br/></p>' Overview_Content += '<table><tr><th>Generated Expression Objects</th>' Overview_Content += '<th></th></tr>' Overview_Content += self._fill_html_trs('Expression Name', star_obj_data) Overview_Content += '</table>' with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('<p>Overview_Content</p>', Overview_Content) 
result_file.write(report_template) html_report.append({'path': result_file_path, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for STAR'}) return html_report def _fill_html_trs(self, col_caption, obj_data): ''' _fill_html_trs: simple creates an html string that has rows (tr) of td for a table ''' tr_html_str = '<tr><th>{}</th><th>Condition</th></tr>'.format(col_caption) for item in obj_data['items']: item_obj = self.ws_client.get_objects2({'objects':[{'ref': item['ref']}]})['data'][0] item_obj_info = item_obj['info'] item_obj_data = item_obj['data'] obj_name = item_obj_info[1] tr_html_str += '<tr><td>{} ({})</td>'.format(obj_name, item['ref']) tr_html_str += '<td>{}</td></tr>'.format(item_obj_data['condition']) return tr_html_str def _generate_star_report(self, obj_ref, report_text, html_links, workspace_name, index_dir, output_dir): """ _generate_star_report: generate summary report """ log('creating STAR report') output_files = self._generate_output_file_list(index_dir, output_dir) output_html_files = self._generate_html_report(output_dir, obj_ref) output_html_files += html_links star_obj = self.ws_client.get_objects2({'objects':[{'ref': obj_ref}]})['data'][0] star_obj_info = star_obj['info'] star_obj_data = star_obj['data'] star_obj_type = star_obj_info[2] if re.match('KBaseRNASeq.RNASeqAlignment-\d+.\d+', star_obj_type): objects_created = [{'ref': obj_ref, 'description': 'RNASeqAlignment generated by STAR'}] elif (re.match('KBaseRNASeq.RNASeqAlignmentSet-\d+.\d+', star_obj_type) or re.match('KBaseSets.ReadsAlignmentSet-\d+.\d+', star_obj_type) or re.match('KBaseSet.RNASeqAlignmentSet-\d+.\d+', star_obj_type)): objects_created = [{'ref': obj_ref, 'description': '{} generated by STAR'.format(re.sub(r"-\d+.\d+", "",star_obj_type))}] items = star_obj_data['items'] for item in items: objects_created.append({'ref': item['ref'], 'description': 'Alignment generated by STAR'}) elif re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+', star_obj_type): objects_created = [{'ref': obj_ref, 'description': 'Expression generated by STAR'}] elif re.match('KBaseSets.ExpressionSet-\d+.\d+', star_obj_type): objects_created = [{'ref': obj_ref, 'description': 'ExpressionSet generated by STAR'}] items = star_obj_data['items'] for item in items: objects_created.append({'ref': item['ref'], 'description': 'Expression generated by STAR'}) report_params = {'message': report_text, 'workspace_name': workspace_name, 'file_links': output_files, 'objects_created': objects_created, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 366, 'report_object_name': 'kb_STAR_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) report_output = kbase_report_client.create_extended_report(report_params) return report_output def upload_alignment_set(self, alignment_items, alignmentset_name, ws_name): """ Compiles and saves a set of alignment references (+ other stuff) into a KBaseRNASeq.RNASeqAlignmentSet. Returns the reference to the new alignment set. alignment_items: [{ "ref": alignment_ref, "label": condition label. }] """ print("Uploading completed alignment set") alignment_set = { "description": "Alignments using STAR, v.{}".format(self.STAR_VERSION), "items": alignment_items } set_info = self.set_api_client.save_reads_alignment_set_v1({ "workspace": ws_name, "output_object_name": alignmentset_name, "data": alignment_set }) return set_info
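# A minimal sketch (values hypothetical) of the required run_star parameter keys that
# STARUtils.process_params() above validates; the remaining STAR options are filled in
# with defaults by _setDefaultParameters().
star_params = {
    'output_workspace': 'my_workspace',  # PARAM_IN_WS, required
    'genome_ref': '123/4/5',             # PARAM_IN_GENOME, required for index generation
    'readsset_ref': '123/6/7',           # PARAM_IN_READS, required unless runMode == 'genomeGenerate'
    'alignment_suffix': '_alignment',    # required, must pass valid_string()
    'runThreadN': 2,                     # optional, defaults to 2
}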
def prepare_data(cls): workspace_id = cls.dfu.ws_name_to_id(cls.wsName) # upload genome object genbank_file_name = 'minimal.gbff' genbank_file_path = os.path.join(cls.scratch, genbank_file_name) shutil.copy(os.path.join('data', genbank_file_name), genbank_file_path) genome_object_name = 'test_Genome' cls.genome_ref = cls.gfu.genbank_to_genome({ 'file': { 'path': genbank_file_path }, 'workspace_name': cls.wsName, 'genome_name': genome_object_name })['genome_ref'] print('TEST genome_ref=' + cls.genome_ref) # upload assembly object file_name = 'test.fna' fasta_path = os.path.join(cls.scratch, file_name) shutil.copy(os.path.join('data', file_name), fasta_path) assembly_name = 'test_assembly' cls.assembly_ref = cls.au.save_assembly_from_fasta({ 'file': { 'path': fasta_path }, 'workspace_name': cls.wsName, 'assembly_name': assembly_name }) print('TEST assembly_ref=' + cls.assembly_ref) # upload reads object reads_file_name = 'Sample1.fastq' reads_file_path = os.path.join(cls.scratch, reads_file_name) shutil.copy(os.path.join('data', reads_file_name), reads_file_path) reads_object_name_1 = 'test_Reads_1' cls.reads_ref_1 = cls.ru.upload_reads({ 'fwd_file': reads_file_path, 'wsname': cls.wsName, 'sequencing_tech': 'Unknown', 'interleaved': 0, 'name': reads_object_name_1 })['obj_ref'] print('TEST reads_ref_1=' + cls.reads_ref_1) reads_object_name_2 = 'test_Reads_2' cls.reads_ref_2 = cls.ru.upload_reads({ 'fwd_file': reads_file_path, 'wsname': cls.wsName, 'sequencing_tech': 'Unknown', 'interleaved': 0, 'name': reads_object_name_2 })['obj_ref'] print('TEST reads_ref_2=' + cls.reads_ref_2) # upload alignment object alignment_file_name = 'accepted_hits.bam' alignment_file_path = os.path.join(cls.scratch, alignment_file_name) shutil.copy(os.path.join('data', alignment_file_name), alignment_file_path) alignment_object_name_1 = 'test_Alignment_1' cls.condition_1 = 'test_condition_1' cls.alignment_ref_1 = cls.rau.upload_alignment({ 'file_path': alignment_file_path, 'destination_ref': cls.wsName + '/' + alignment_object_name_1, 'read_library_ref': cls.reads_ref_1, 'condition': cls.condition_1, 'library_type': 'single_end', 'assembly_or_genome_ref': cls.genome_ref })['obj_ref'] print('TEST alignment_ref_1=' + cls.alignment_ref_1) alignment_object_name_2 = 'test_Alignment_2' cls.condition_2 = 'test_condition_2' cls.alignment_ref_2 = cls.rau.upload_alignment({ 'file_path': alignment_file_path, 'destination_ref': cls.wsName + '/' + alignment_object_name_2, 'read_library_ref': cls.reads_ref_2, 'condition': cls.condition_2, 'library_type': 'single_end', 'assembly_or_genome_ref': cls.genome_ref })['obj_ref'] print('TEST alignment_ref_2=' + cls.alignment_ref_2) alignment_object_name_3 = 'test_Alignment_3' cls.condition_3 = 'test_condition_3' cls.alignment_ref_3 = cls.rau.upload_alignment({ 'file_path': alignment_file_path, 'destination_ref': cls.wsName + '/' + alignment_object_name_3, 'read_library_ref': cls.reads_ref_2, 'condition': cls.condition_3, 'library_type': 'single_end', 'assembly_or_genome_ref': cls.assembly_ref })['obj_ref'] print('TEST alignment_ref_3=' + cls.alignment_ref_3) # upload sample_set object sample_set_object_name = 'test_Sample_Set' sample_set_data = { 'sampleset_id': sample_set_object_name, 'sample_ids': [cls.reads_ref_1, cls.reads_ref_2], 'sampleset_desc': 'test sampleset object', 'Library_type': 'SingleEnd', 'condition': [cls.condition_1, cls.condition_2], 'domain': 'Unknown', 'num_samples': 2, 'platform': 'Unknown' } save_object_params = { 'id': workspace_id, 'objects': [{ 'type': 
'KBaseRNASeq.RNASeqSampleSet', 'data': sample_set_data, 'name': sample_set_object_name }] } dfu_oi = cls.dfu.save_objects(save_object_params)[0] cls.sample_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str( dfu_oi[4]) print('TEST sample_set_ref=' + cls.sample_set_ref) # upload alignment_set object object_type = 'KBaseRNASeq.RNASeqAlignmentSet' alignment_set_object_name = 'test_Alignment_Set' alignment_set_data = { 'genome_id': cls.genome_ref, 'read_sample_ids': [reads_object_name_1, reads_object_name_2], 'mapped_rnaseq_alignments': [{ reads_object_name_1: alignment_object_name_1 }, { reads_object_name_2: alignment_object_name_2 }], 'mapped_alignments_ids': [{ reads_object_name_1: cls.alignment_ref_1 }, { reads_object_name_2: cls.alignment_ref_2 }], 'sample_alignments': [cls.alignment_ref_1, cls.alignment_ref_2], 'sampleset_id': cls.sample_set_ref } save_object_params = { 'id': workspace_id, 'objects': [{ 'type': object_type, 'data': alignment_set_data, 'name': alignment_set_object_name }] } dfu_oi = cls.dfu.save_objects(save_object_params)[0] cls.old_alignment_set_ref = str(dfu_oi[6]) + '/' + str( dfu_oi[0]) + '/' + str(dfu_oi[4]) print('TEST (legacy) KBaseRNASeq.alignment_set_ref=' + cls.old_alignment_set_ref) # Save the alignment set items = [{ 'ref': cls.alignment_ref_1, 'label': 'c1' }, { 'ref': cls.alignment_ref_2, 'label': 'c2' }] alignment_set_data = {'description': '', 'items': items} alignment_set_save_params = { 'data': alignment_set_data, 'workspace': cls.wsName, 'output_object_name': 'MyReadsAlignmentSet' } set_api = SetAPI(cls.srv_wiz_url) save_result = set_api.save_reads_alignment_set_v1( alignment_set_save_params) cls.new_alignment_set_ref = save_result['set_ref'] print('TEST KBaseSet.alignment_set_ref=') print(cls.new_alignment_set_ref)
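# A minimal sketch (test-helper style, helper name hypothetical) of reading back the set
# saved above; SetAPI is assumed to expose get_reads_alignment_set_v1 as the counterpart
# to the save_reads_alignment_set_v1 call used in prepare_data().
def fetch_new_alignment_set(cls):
    set_api = SetAPI(cls.srv_wiz_url)
    res = set_api.get_reads_alignment_set_v1({'ref': cls.new_alignment_set_ref,
                                              'include_item_info': 1})
    # two alignments were saved with labels 'c1' and 'c2'
    return [item['ref'] for item in res['data']['items']]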
def exec_megahit(self, ctx, params): """ :param params: instance of type "ExecMegaHitParams" (exec_megahit() Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as Input Creates Assembly object(s) as output. Will eventually also create AssemblySet object if input is a ReadsSet and not running a combined assembly Other vars same as run_megahit()) -> structure: parameter "workspace_name" of String, parameter "input_reads_ref" of String, parameter "output_contigset_name" of String, parameter "combined_assembly_flag" of Long, parameter "megahit_parameter_preset" of String, parameter "min_count" of Long, parameter "k_min" of Long, parameter "k_max" of Long, parameter "k_step" of Long, parameter "k_list" of list of Long, parameter "min_contig_len" of Long :returns: instance of type "ExecMegaHitOutput" -> structure: parameter "report_text" of String, parameter "output_contigset_ref" of list of String """ # ctx is the context object # return variables are: output #BEGIN exec_megahit console = [] self.log(console, 'Running exec_megahit() with params=') self.log(console, "\n" + pformat(params)) #SERVICE_VER = 'dev' # DEBUG SERVICE_VER = 'release' ### STEP 0: init token = ctx['token'] wsClient = workspaceService(self.workspaceURL, token=token) headers = {'Authorization': 'OAuth ' + token} env = os.environ.copy() env['KB_AUTH_TOKEN'] = token ### STEP 1: basic parameter checks + parsing required_params = [ 'workspace_name', 'input_reads_ref', 'output_contigset_name' ] for required_param in required_params: if required_param not in params or params[required_param] == None: raise ValueError("Must define required param: '" + required_param + "'") ### STEP 2: determine if input is a ReadsLibrary or ReadsSet input_reads_ref = params['input_reads_ref'] input_reads_name = None try: [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple input_reads_obj_info = wsClient.get_object_info_new( {'objects': [{ 'ref': input_reads_ref }]})[0] input_reads_obj_type = re.sub( '-[0-9]+\.[0-9]+$', "", input_reads_obj_info[TYPE_I]) # remove trailing version input_reads_name = input_reads_obj_info[NAME_I] except Exception as e: raise ValueError('Unable to get reads object from workspace: (' + input_reads_ref + ')' + str(e)) accepted_input_types = [ "KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary" ] if input_reads_obj_type not in accepted_input_types: raise ValueError("Input reads of type '" + input_reads_obj_type + "' not accepted. 
Must be one of " + ", ".join(accepted_input_types)) if input_reads_obj_type == "KBaseSets.ReadsSet": required_param = 'combined_assembly_flag' if required_param not in params or params[required_param] == None: raise ValueError("Must define required param: '" + required_param + "'") ### STEP 3: get the list of library references if input_reads_obj_type == "KBaseFile.PairedEndLibrary": readsSet_ref_list = [input_reads_ref] readsSet_names_list = [input_reads_name] elif input_reads_obj_type == "KBaseSets.ReadsSet": readsSet_ref_list = [] readsSet_names_list = [] try: setAPI_Client = SetAPI( url=self.serviceWizardURL, token=ctx['token']) # for dynamic service #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token']) # SDK local method except Exception as e: raise ValueError( "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '" + self.serviceWizardURL + "' token: '" + ctx['token'] + "'" + str(e)) #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"' token: '"+ctx['token']+"'" + str(e)) try: input_readsSet_obj = setAPI_Client.get_reads_set_v1({ 'ref': input_reads_ref, 'include_item_info': 1 }) except Exception as e: raise ValueError( 'SetAPI FAILURE: Unable to get read library set object from workspace: (' + str(input_reads_ref) + ")\n" + str(e)) for readsLibrary_obj in input_readsSet_obj['data']['items']: readsSet_ref_list.append(readsLibrary_obj['ref']) NAME_I = 1 readsSet_names_list.append(readsLibrary_obj['info'][NAME_I]) else: raise ValueError("Input reads of type '" + input_reads_obj_type + "' not accepted. Must be one of " + ", ".join(accepted_input_types)) ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine if input_reads_obj_type == "KBaseSets.ReadsSet" and params[ 'combined_assembly_flag'] != 0: self.log( console, "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES" ) # make dir timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) input_dir = os.path.join(self.scratch, 'input.' + str(timestamp)) if self.mac_mode: # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there input_dir = os.path.join(self.host_scratch, 'input.' 
+ str(timestamp)) if not os.path.exists(input_dir): os.makedirs(input_dir) # connect to ReadsUtils Client try: readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token']) # SDK local except: raise ValueError("Unable to get readsUtils_Client\n" + str(e)) # start combined file read_buf_size = 65536 write_buf_size = 65536 combined_input_fwd_path = os.path.join(input_dir, 'input_reads_fwd.fastq') combined_input_rev_path = os.path.join(input_dir, 'input_reads_rev.fastq') combined_input_fwd_handle = open(combined_input_fwd_path, 'w', write_buf_size) combined_input_rev_handle = open(combined_input_rev_path, 'w', write_buf_size) # add libraries, one at a time for this_input_reads_ref in readsSet_ref_list: self.log( console, "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: " + str(this_input_reads_ref)) try: readsLibrary = readsUtils_Client.download_reads({ 'read_libraries': [this_input_reads_ref], 'interleaved': 'false' }) except Exception as e: raise ValueError( 'Unable to get reads object from workspace: (' + this_input_reads_ref + ")\n" + str(e)) this_input_fwd_path = readsLibrary['files'][ this_input_reads_ref]['files']['fwd'] this_input_rev_path = readsLibrary['files'][ this_input_reads_ref]['files']['rev'] # append fwd self.log( console, "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: " + str(this_input_reads_ref)) this_input_path = this_input_fwd_path cat_file_handle = combined_input_fwd_handle with open(this_input_path, 'r', read_buf_size) as this_input_handle: while True: read_data = this_input_handle.read(read_buf_size) if read_data: cat_file_handle.write(read_data) else: break os.remove( this_input_path ) # create space since we no longer need the piece file # append rev this_input_path = this_input_rev_path cat_file_handle = combined_input_rev_handle with open(this_input_path, 'r', read_buf_size) as this_input_handle: while True: read_data = this_input_handle.read(read_buf_size) if read_data: cat_file_handle.write(read_data) else: break os.remove( this_input_path ) # create space since we no longer need the piece file combined_input_fwd_handle.close() combined_input_rev_handle.close() ### STEP 5: finally run MegaHit_Sets exec_megahit_single_library_params = params output_assemblyset_contigset_paths = [] output_contigset_path = None # PairedEndLibrary if input_reads_obj_type == "KBaseFile.PairedEndLibrary": self.log( console, "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: " + str(input_reads_ref)) try: readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token']) # SDK local readsLibrary = readsUtils_Client.download_reads({ 'read_libraries': [input_reads_ref], 'interleaved': 'false' }) except Exception as e: raise ValueError( 'Unable to get reads object from workspace: (' + input_reads_ref + ")\n" + str(e)) input_fwd_path = readsLibrary['files'][input_reads_ref]['files'][ 'fwd'] input_rev_path = readsLibrary['files'][input_reads_ref]['files'][ 'rev'] exec_megahit_single_library_params[ 'input_fwd_path'] = input_fwd_path exec_megahit_single_library_params[ 'input_rev_path'] = input_rev_path # the key line output_contigset_path = self.exec_megahit_single_library( exec_megahit_single_library_params) output_assemblyset_contigset_paths.append(output_contigset_path) os.remove(input_fwd_path) # files can be really big os.remove(input_rev_path) # ReadsSet combined (already downloaded and combined fastqs) elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[ 'combined_assembly_flag'] != 0: 
input_fwd_path = combined_input_fwd_path input_rev_path = combined_input_rev_path exec_megahit_single_library_params[ 'input_fwd_path'] = input_fwd_path exec_megahit_single_library_params[ 'input_rev_path'] = input_rev_path # the key line output_contigset_path = self.exec_megahit_single_library( exec_megahit_single_library_params) output_assemblyset_contigset_paths.append(output_contigset_path) os.remove(input_fwd_path) # files can be really big os.remove(input_rev_path) # ReadsSet uncombined (still have to download) elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[ 'combined_assembly_flag'] == 0: # connect to ReadsUtils Client try: readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token']) # SDK local except: raise ValueError("Unable to get readsUtils_Client\n" + str(e)) # get libraries, one at a time, and run MegaHit_Sets output_assemblyset_contigset_paths = [] for this_input_reads_ref in readsSet_ref_list: self.log( console, "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: " + str(this_input_reads_ref)) try: readsLibrary = readsUtils_Client.download_reads({ 'read_libraries': [this_input_reads_ref], 'interleaved': 'false' }) except Exception as e: raise ValueError( 'Unable to get reads object from workspace: (' + this_input_reads_ref + ")\n" + str(e)) this_input_fwd_path = readsLibrary['files'][ this_input_reads_ref]['files']['fwd'] this_input_rev_path = readsLibrary['files'][ this_input_reads_ref]['files']['rev'] exec_megahit_single_library_params[ 'input_fwd_path'] = this_input_fwd_path exec_megahit_single_library_params[ 'input_rev_path'] = this_input_rev_path # the key line this_output_contigset_path = self.exec_megahit_single_library( exec_megahit_single_library_params) output_assemblyset_contigset_paths.append( this_output_contigset_path) os.remove(this_input_fwd_path) # files can be really big os.remove(this_input_rev_path) # just in case we've confused ourselves else: raise ValueError("error in logic") ### STEP 6: save the resulting assembly assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER) output_contigset_refs = [] output_contigset_names = [] for i, this_output_contigset_path in enumerate( output_assemblyset_contigset_paths): if len(output_assemblyset_contigset_paths) == 1: assembly_name = params['output_contigset_name'] else: assembly_name = readsSet_names_list[i] + '-' + params[ 'output_contigset_name'] this_output_data_ref = assemblyUtil.save_assembly_from_fasta({ 'file': { 'path': this_output_contigset_path }, 'workspace_name': params['workspace_name'], 'assembly_name': assembly_name }) output_contigset_refs.append(this_output_data_ref) output_contigset_names.append(assembly_name) ### STEP 7: generate the report text # compute a simple contig length distribution for the report report = '' for i, this_output_contigset_path in enumerate( output_assemblyset_contigset_paths): report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[ i] + "\n" report += "-------------------------------------------------------------\n" report += "\n" lengths = [] for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'): lengths.append(len(seq_record.seq)) report += 'ContigSet saved to: ' + params[ 'workspace_name'] + '/' + output_contigset_names[i] + '\n' report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n' report += 'Avg Length: ' + str( sum(lengths) / float(len(lengths))) + ' bp.\n' bins = 10 counts, edges = np.histogram(lengths, bins) report += 'Contig Length 
Distribution (# of contigs -- min to max basepairs):\n' for c in range(bins): report += ' ' + str(counts[c]) + '\t--\t' + str( edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n' ### STEP 8: construct the output to send back output = { 'report_text': report, 'output_contigset_refs': output_contigset_refs } #END exec_megahit # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method exec_megahit return value ' + 'output is not type dict as required.') # return the results return [output]
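The combined-assembly path in STEP 4 builds single forward and reverse FASTQ files by appending each downloaded member library in fixed-size chunks and deleting each piece file as soon as it has been copied. Below is a minimal sketch of that buffered-append pattern; concat_fastq_files is an illustrative name, not a method of this module, and it assumes plain uncompressed FASTQ paths like those returned by ReadsUtils.download_reads.

import os

def concat_fastq_files(part_paths, combined_path, buf_size=65536):
    """Append each FASTQ part onto one combined file in fixed-size chunks,
    removing every part after it is copied to free scratch space."""
    with open(combined_path, 'w', buf_size) as out_handle:
        for part_path in part_paths:
            with open(part_path, 'r', buf_size) as in_handle:
                while True:
                    chunk = in_handle.read(buf_size)
                    if not chunk:
                        break
                    out_handle.write(chunk)
            os.remove(part_path)  # piece file no longer needed
    return combined_path

In the module this is done twice per run, once for the forward files and once for the reverse files, before the combined paths are handed to exec_megahit_single_library.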
def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params): """ :param params: instance of type "KButil_Build_InSilico_Metagenomes_with_Grinder_Params" (KButil_Build_InSilico_Metagenomes_with_Grinder() ** ** Use Grinder to generate in silico shotgun metagenomes) -> structure: parameter "workspace_name" of type "workspace_name" (** The workspace object refs are of form: ** ** objects = ws.get_objects([{'ref': params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means the entire name combining the workspace id and the object name ** "id" is a numerical identifier of the workspace or object, and should just be used for workspace ** "name" is a string identifier of a workspace or object. This is received from Narrative.), parameter "input_refs" of type "data_obj_ref", parameter "output_name" of type "data_obj_name", parameter "desc" of String, parameter "num_reads_per_lib" of Long, parameter "population_percs" of String, parameter "read_len_mean" of Long, parameter "read_len_stddev" of Double, parameter "pairs_flag" of Long, parameter "mate_orientation" of String, parameter "insert_len_mean" of Long, parameter "insert_len_stddev" of Double, parameter "mutation_dist" of String, parameter "mutation_ratio" of String, parameter "qual_good" of Long, parameter "qual_bad" of Long, parameter "len_bias_flag" of Long, parameter "random_seed" of Long :returns: instance of type "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" -> structure: parameter "report_name" of type "data_obj_name", parameter "report_ref" of type "data_obj_ref" """ # ctx is the context object # return variables are: returnVal #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder #### STEP 0: basic init ## console = [] invalid_msgs = [] report_text = '' self.log(console, 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ') self.log(console, "\n" + pformat(params)) # Auth token = ctx['token'] headers = {'Authorization': 'OAuth ' + token} env = os.environ.copy() env['KB_AUTH_TOKEN'] = token # API Clients #SERVICE_VER = 'dev' # DEBUG SERVICE_VER = 'release' wsClient = workspaceService(self.workspaceURL, token=token) readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token']) # SDK local #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token']) # for SDK local. 
local doesn't work for SetAPI setAPI_Client = SetAPI(url=self.serviceWizardURL, token=ctx['token']) # for dynamic service auClient = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER) dfu = DFUClient(self.callbackURL) # param checks required_params = [ 'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib', 'population_percs', 'read_len_mean', 'read_len_stddev', 'pairs_flag', 'mate_orientation', 'insert_len_mean', 'insert_len_stddev', 'mutation_dist', 'mutation_ratio', 'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed' ] for arg in required_params: if arg not in params or params[arg] == None or params[arg] == '': raise ValueError("Must define required param: '" + arg + "'") # cast to str unpredictable numerical params (mostly used in string context) numerical_params = [ 'num_reads_per_lib', 'read_len_mean', 'read_len_stddev', 'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed' ] for arg in numerical_params: if arg not in params or params[arg] == None or params[arg] == '': continue params[arg] = str(params[arg]) # load provenance provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] provenance[0]['input_ws_objects'] = [] for input_ref in params['input_refs']: provenance[0]['input_ws_objects'].append(input_ref) # set the output paths timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) output_dir = os.path.join(self.scratch, 'output.' + str(timestamp)) if not os.path.exists(output_dir): os.makedirs(output_dir) html_output_dir = os.path.join(output_dir, 'html') if not os.path.exists(html_output_dir): os.makedirs(html_output_dir) #### STEP 1: Parse population_percs and write to file ## abundance_str = params['population_percs'].strip() abundance_file_path = os.path.join(output_dir, 'my_abundances.txt') abundance_config_num_libs = 0 abundance_config_num_libs_set = False grinder_genome_ids = [] header = [] out_buf = [] for row in abundance_str.split("\n"): cols = re.split(r'\s+', row) if cols[0].upper() == "GENOME": for col in cols: if col == '': continue header.append(col) continue grinder_genome_ids.append(cols[0]) self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'") # DEBUG out_row = [] for col in cols: if col == '': continue elif col == '%': continue elif col.endswith('%'): col = col.rstrip('%') out_row.append(col) out_buf.append("\t".join(out_row)) num_samples = len(out_row) - 1 # first col is genome id if not abundance_config_num_libs_set: abundance_config_num_libs_set = True abundance_config_num_libs = num_samples elif num_samples != abundance_config_num_libs: invalid_msgs.append( "inconsistent number of samples in population_percs input field" ) # data validation if abundance_config_num_libs == 0: invalid_msgs.append( "unable to find sample percentages in population_percs input field" ) sample_sums = [] for row_i, abund_row_str in enumerate(out_buf): abund_row = abund_row_str.split() for sample_i, abund in enumerate(abund_row[1:]): if row_i == 0: sample_sums.append(0) #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i)) # DEBUG sample_sums[sample_i] += float(abund) for sample_i, sample_sum in enumerate(sample_sums): if sample_sum < 99.5 or sample_sum > 100.5: self.log( invalid_msgs, "Sample: " + str(sample_i + 1) + " " + header[sample_i + 1] + " proportions is not summing to 100.0. 
Summing to: " + str(sample_sum)) if len(invalid_msgs) == 0: with open(abundance_file_path, 'w') as abundance_fh: for out_line in out_buf: abundance_fh.write(out_line + "\n") # DEBUG with open(abundance_file_path, 'r') as abundance_fh: for out_line in abundance_fh.readlines(): out_line = out_line.rstrip() self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'") #### STEP 2: get genome scaffold sequences ## if len(invalid_msgs) == 0: genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna') read_buf_size = 65536 write_buf_size = 65536 accepted_input_types = ["KBaseGenomes.Genome"] genome_refs = params['input_refs'] genome_obj_names = [] genome_sci_names = [] assembly_refs = [] for i, input_ref in enumerate(genome_refs): # genome obj info try: [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple input_obj_info = wsClient.get_object_info_new( {'objects': [{ 'ref': input_ref }]})[0] input_obj_type = re.sub( '-[0-9]+\.[0-9]+$', "", input_obj_info[TYPE_I]) # remove trailing version genome_obj_names.append(input_obj_info[NAME_I]) except Exception as e: raise ValueError('Unable to get object from workspace: (' + input_ref + ')' + str(e)) if input_obj_type not in accepted_input_types: raise ValueError("Input object of type '" + input_obj_type + "' not accepted. Must be one of " + ", ".join(accepted_input_types)) # genome obj data try: genome_obj = wsClient.get_objects([{ 'ref': input_ref }])[0]['data'] genome_sci_names.append(genome_obj['scientific_name']) except: raise ValueError("unable to fetch genome: " + input_ref) # Get assembly_refs if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \ and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None): msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting." 
self.log(console, msg) self.log(invalid_msgs, msg) continue elif 'assembly_ref' in genome_obj and genome_obj[ 'assembly_ref'] != None: msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " USING assembly_ref: " + str( genome_obj['assembly_ref']) self.log(console, msg) assembly_refs.append(genome_obj['assembly_ref']) elif 'contigset_ref' in genome_obj and genome_obj[ 'contigset_ref'] != None: msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " USING contigset_ref: " + str( genome_obj['contigset_ref']) self.log(console, msg) assembly_refs.append(genome_obj['contigset_ref']) # get fastas for scaffolds if len(invalid_msgs) == 0: contig_file_paths = [] for genome_i, input_ref in enumerate(genome_refs): contig_file = auClient.get_assembly_as_fasta({ 'ref': assembly_refs[genome_i] }).get('path') sys.stdout.flush() contig_file_path = dfu.unpack_file({'file_path': contig_file})['file_path'] contig_file_paths.append(contig_file_path) # reformat FASTA IDs for Grinder with open(genomes_src_db_file_path, 'w', write_buf_size) as genomes_src_db_fh: for genome_i, contig_file_path in enumerate(contig_file_paths): #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path) # DEBUG #contig_ids = [] with open(contig_file_path, 'r', read_buf_size) as contig_fh: genome_seq = '' contig_seq = '' contig_seqs = [] for contig_line in contig_fh.readlines(): contig_line = contig_line.rstrip() if contig_line.startswith('>'): #contig_id = contig_line.strip()[1:].split(' ')[0] #contig_ids.append(contig_id) #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n") if contig_seq != '': contig_seqs.append(contig_seq) contig_seq = '' continue else: #genomes_src_db_fh.write(contig_line) contig_seq += contig_line if contig_seq != '': contig_seqs.append(contig_seq) contig_seq = '' # write joined contigs to file genome_seq = "NNNNNNNNNN".join( contig_seqs ) # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins genome_seq = genome_seq.upper( ) # grinder might require upper case? genomes_src_db_fh.write(">" + grinder_genome_ids[genome_i] + "\n") genomes_src_db_fh.write(genome_seq + "\n") genome_seq = '' contig_seqs = [] # DEBUG #for contig_id in contig_ids: # self.log(console, "\tCONTIG_ID: "+contig_id) # DEBUG # DEBUG toggle = 0 with open(genomes_src_db_file_path, 'r', write_buf_size) as genomes_src_db_fh: for contig_line in genomes_src_db_fh.readlines(): contig_line = contig_line.rstrip() if contig_line.startswith('>'): self.log(console, 'GENOMES_SRC_DB: ' + contig_line) genome_id = contig_line[1:] toggle = 0 elif toggle == 0: #elif genome_id == 'G3': self.log( console, 'GENOMES_SRC_DB: ' + contig_line[0:50] + '...') toggle += 1 #### STEP 3: Run Grinder ## if len(invalid_msgs) == 0: cmd = [] cmd.append(self.GRINDER) # output cmd.append('-base_name') cmd.append(params['output_name']) cmd.append('-output_dir') cmd.append(output_dir) # contigs input cmd.append('-reference_file') cmd.append(genomes_src_db_file_path) # abundances cmd.append('-abundance_file') cmd.append(abundance_file_path) # library size cmd.append('-total_reads') cmd.append(str(params['num_reads_per_lib'])) # num libraries (overridden by abundance file?) 
cmd.append('-num_libraries') cmd.append(str(abundance_config_num_libs)) # read and insert lens cmd.append('-read_dist') cmd.append(str(params['read_len_mean'])) cmd.append('normal') cmd.append(str(params['read_len_stddev'])) if str(params['pairs_flag']) == '1': cmd.append('-insert_dist') cmd.append(str(params['insert_len_mean'])) cmd.append('normal') cmd.append(str(params['insert_len_stddev'])) # mate orientation cmd.append('-mate_orientation') cmd.append(params['mate_orientation']) # genome len bias cmd.append('-length_bias') cmd.append(str(params['len_bias_flag'])) # mutation model cmd.append('-mutation_dist') cmd.append(str(params['mutation_dist'])) cmd.append('-mutation_ratio') cmd.append(str(params['mutation_ratio'])) # qual scores cmd.append('-fastq_output') cmd.append('1') cmd.append('-qual_levels') cmd.append(str(params['qual_good'])) cmd.append(str(params['qual_bad'])) # skip contig joins cmd.append('-exclude_chars') cmd.append('NX') # explicitly request bidirectional cmd.append('-unidirectional') cmd.append('0') # random seed if 'random_seed' in params and params[ 'random_seed'] != None and params['random_seed'] != '': cmd.append('-random_seed') cmd.append(str(params['random_seed'])) # RUN cmd_str = " ".join(cmd) self.log(console, "===========================================") self.log(console, "RUNNING: " + cmd_str) self.log(console, "===========================================") cmdProcess = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) outputlines = [] while True: line = cmdProcess.stdout.readline() outputlines.append(line) if not line: break self.log(console, line.replace('\n', '')) cmdProcess.stdout.close() cmdProcess.wait() self.log(console, 'return code: ' + str(cmdProcess.returncode) + '\n') if cmdProcess.returncode != 0: raise ValueError('Error running kb_grinder, return code: ' + str(cmdProcess.returncode) + '\n') #report_text += "\n".join(outputlines) #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr # capture output for report and paths to out files report_text_buf = [] struct_file_paths = [] struct_file_names = [] fastq_file_paths = [] for out_line in outputlines: out_line = out_line.rstrip() if 'Community structure' in out_line: clean_line = out_line.lstrip() struct_file_path = re.split(r'\s+', clean_line)[3] struct_file_paths.append(struct_file_path) struct_file_names.append(struct_file_path.split('/')[-1]) self.log(console, "STRUCT_FILE_NAME: '" + struct_file_path.split('/')[-1]) # DEBUG elif 'FASTQ file' in out_line: clean_line = out_line.lstrip() fastq_file_paths.append(re.split(r'\s+', clean_line)[3]) else: report_text_buf.append(out_line) report_text += "\n".join(report_text_buf) #### STEP 4: Upload Read Libs and create reads set ## if len(invalid_msgs) == 0: lib_obj_refs = [] lib_obj_names = [] readsSet_items = [] for sample_i, fastq_file_path in enumerate(fastq_file_paths): if not os.path.isfile (fastq_file_path) \ or os.path.getsize (fastq_file_path) == 0: raise ValueError("empty read lib generated: " + fastq_file_path) else: # lib obj name if len(fastq_file_paths) == 0: output_obj_name = params['output_name'] else: if str(params['pairs_flag']) == '1': output_obj_name = params[ 'output_name'] + '-sample' + str( sample_i + 1) + ".PairedEndLib" else: output_obj_name = params[ 'output_name'] + '-sample' + str( sample_i + 1) + ".SingleEndLib" lib_obj_names.append(output_obj_name) # upload lib and get obj ref self.log( console, 'Uploading trimmed paired reads: ' + output_obj_name) 
sequencing_tech = 'artificial reads' if str(params['pairs_flag']) == '1': interleaved = 1 else: interleaved = 0 lib_obj_ref = readsUtils_Client.upload_reads({ 'wsname': str(params['workspace_name']), 'name': output_obj_name, 'fwd_file': fastq_file_path, 'interleaved': interleaved, 'sequencing_tech': sequencing_tech })['obj_ref'] lib_obj_refs.append(lib_obj_ref) os.remove(fastq_file_path) # free up disk # add to readsSet readsSet_items.append({ 'ref': lib_obj_ref, 'label': output_obj_name }) # create readsset readsSet_obj_ref = None if len(lib_obj_refs) > 1: readsSet_obj = { 'description': "Grinder Metagenome from " + " ".join(genome_obj_names), 'items': readsSet_items } readsSet_obj_name = params['output_name'] readsSet_obj_ref = setAPI_Client.save_reads_set_v1({ 'workspace_name': params['workspace_name'], 'output_object_name': readsSet_obj_name, 'data': readsSet_obj })['set_ref'] #### STEP 5: Build report ## reportName = 'kb_grinder_report_' + str(uuid.uuid4()) reportObj = { 'objects_created': [], #'text_message': '', # or is it 'message'? 'message': '', # or is it 'text_message'? 'direct_html': '', #'direct_html_link_index': 0, 'file_links': [], 'html_links': [], 'workspace_name': params['workspace_name'], 'report_object_name': reportName } # message if len(invalid_msgs) > 0: report_text = "\n".join(invalid_msgs) reportObj['message'] = report_text if len(invalid_msgs) == 0: # objs if readsSet_obj_ref != None: reportObj['objects_created'].append({ 'ref': readsSet_obj_ref, 'desc': params['output_name'] + " ReadsSet" }) for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs): reportObj['objects_created'].append({ 'ref': lib_obj_ref, 'desc': lib_obj_names[lib_obj_i] }) # downloadable data for data_i, data_path in enumerate(struct_file_paths): try: upload_ret = dfu.file_to_shock({ 'file_path': data_path, #'pack': 'zip'}) 'make_handle': 0 }) except: raise ValueError('error uploading ' + data_path + ' file to shock') reportObj['file_links'].append({ 'shock_id': upload_ret['shock_id'], 'name': struct_file_names[data_i], 'label': struct_file_names[data_i] }) # html report """ try: html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir, 'make_handle': 0, 'pack': 'zip'}) except: raise ValueError ('error uploading html report to shock') reportObj['direct_html_link_index'] = 0 reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'], 'name': html_file, 'label': params['output_name']+' HTML' } ] """ # save report object # SERVICE_VER = 'release' reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER) #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']}) report_info = reportClient.create_extended_report(reportObj) returnVal = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END KButil_Build_InSilico_Metagenomes_with_Grinder # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError( 'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
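STEP 1 of the Grinder wrapper turns the free-text population_percs field into a tab-delimited abundance file and flags any sample column that does not sum to roughly 100%. The helper below sketches that parse-and-validate step under the same tolerance used in the code (99.5 to 100.5); parse_population_percs is a hypothetical name used only for illustration.

import re

def parse_population_percs(table_str, tolerance=0.5):
    """Parse a whitespace-delimited abundance table (first column is a genome id,
    remaining columns are per-sample percentages, optionally suffixed with '%')
    and report any sample column whose percentages do not sum to ~100."""
    header, rows = [], []
    for line in table_str.strip().split("\n"):
        cols = [c.rstrip('%') for c in re.split(r'\s+', line.strip()) if c and c != '%']
        if not cols:
            continue
        if cols[0].upper() == 'GENOME':
            header = cols
            continue
        rows.append([cols[0]] + [float(c) for c in cols[1:]])
    num_samples = len(rows[0]) - 1 if rows else 0
    problems = []
    for sample_i in range(num_samples):
        total = sum(row[sample_i + 1] for row in rows)
        if abs(total - 100.0) > tolerance:
            problems.append('sample %d sums to %.2f' % (sample_i + 1, total))
    return header, rows, problems

The returned problems list plays the role of invalid_msgs above: anything in it should stop the run before Grinder is invoked.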
class QualiMapRunner: QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap' JAVA_MEM_DEFAULT_SIZE = '16G' LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024 # 20 GB TIMEOUT = 72 * 60 * 60 # 72 hours def _get_file_size(self, file_path): file_size = os.path.getsize(file_path) print('File size: {} -- {}'.format(file_size, file_path)) return file_size def _large_file(self, file_path): filename, file_extension = os.path.splitext(file_path) multiplier = 0 if file_extension == '.txt': total_file_size = 0 with open(file_path, 'r') as f: for line in f: bam_file_path = line.split('\t')[1] total_file_size += self._get_file_size(bam_file_path) print('Total file size: {}'.format(total_file_size)) multiplier = int(total_file_size) / int(self.LARGE_BAM_FILE_SIZE) else: multiplier = int(self._get_file_size(file_path)) / int( self.LARGE_BAM_FILE_SIZE) print('setting number of windows multiplier to: {}'.format(multiplier)) return multiplier def _timeout_handler(self, signum, frame): print('Signal handler called with signal', signum) raise ValueError('QualiMap takes too long') def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url): self.scratch_dir = scratch_dir self.rau = ReadsAlignmentUtils(callback_url) self.kbr = KBaseReport(callback_url) self.dfu = DataFileUtil(callback_url) self.gfu = GenomeFileUtil(callback_url) self.set_api = SetAPI(srv_wiz_url) self.ws = Workspace(workspace_url) self.valid_commands = ['bamqc', 'multi-bamqc'] def run_app(self, params): self.validate_params(params) print('Validated Params = ') pprint(params) run_info = self.get_run_info(params) if run_info.get('mode') not in ['single', 'multi']: raise ValueError( 'Error in fetching the type to determine run settings.') run_error = False try: signal.signal(signal.SIGALRM, self._timeout_handler) signal.alarm(self.TIMEOUT) if run_info['mode'] == 'single': result = self.run_bamqc(params['input_ref'], run_info['input_info']) elif run_info['mode'] == 'multi': result = self.run_multi_sample_qc(params['input_ref'], run_info['input_info']) signal.alarm(0) except Exception: run_error = True workdir = os.path.join(self.scratch_dir, 'qualimap_' + str(int(time.time() * 10000))) os.makedirs(workdir) with open(os.path.join(workdir, 'qualimapReport.html'), 'w') as report: report.write('<html><body><p></p></body></html>') package_info = self.package_output_folder( workdir, 'QualiMap_report', 'EMPTY HTML report directory for QualiMap BAM QC', 'qualimapReport.html') result = { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info, 'shock_id': None } error_msg = 'Running QualiMap returned an error:\n{}\n'.format( traceback.format_exc()) error_msg += 'Generating simple report instead\n' print(error_msg) if params['create_report']: result = self.create_report(result, params['output_workspace'], run_error, params['input_ref']) return result def create_report(self, result, output_workspace, run_error=None, input_ref=None): if run_error: objects_created = [] info = self.get_obj_info(input_ref) obj_type = self.get_type_from_obj_info(info) if obj_type in ['KBaseRNASeq.RNASeqAlignment']: objects_created.append({ 'ref': input_ref, 'description': 'Alignment' }) if obj_type in [ 'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet' ]: objects_created.append({ 'ref': input_ref, 'description': 'AlignmentSet' }) reads_alignment_info = self.get_alignments_from_set(input_ref) for alignment in reads_alignment_info: alignment_ref = alignment.get('ref') objects_created.append({ 'ref': alignment_ref, 'description': 'Alignment' }) 
report_info = self.kbr.create_extended_report({ 'message': ' ', 'objects_created': objects_created, 'report_object_name': 'qualimap_report' + str(uuid.uuid4()), 'workspace_name': output_workspace }) result['report_name'] = report_info['name'] result['report_ref'] = report_info['ref'] return result qc_result_zip_info = result['qc_result_zip_info'] report_info = self.kbr.create_extended_report({ 'message': '', 'objects_created': [], 'direct_html_link_index': 0, 'html_links': [{ 'shock_id': qc_result_zip_info['shock_id'], 'name': qc_result_zip_info['index_html_file_name'], 'label': qc_result_zip_info['name'] }], 'report_object_name': 'qualimap_report' + str(uuid.uuid4()), 'workspace_name': output_workspace }) result['report_name'] = report_info['name'] result['report_ref'] = report_info['ref'] return result def get_gtf_file(self, input_ref, set_op=False): print('Start fetching GFF file from genome') if set_op: set_data = self.set_api.get_reads_alignment_set_v1({ 'ref': input_ref, 'include_item_info': 1 }) input_ref = set_data['data']['items'][0]['ref'] obj_data = self.dfu.get_objects({"object_refs": [input_ref]})['data'][0]['data'] genome_ref = obj_data.get('genome_id') if not genome_ref: raise ValueError( 'Alignment is not associated with a Genome object') result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4())) os.makedirs(result_directory) genome_gtf_file = self.gfu.genome_to_gff({ 'genome_ref': genome_ref, 'is_gtf': True, 'target_dir': result_directory })['file_path'] return genome_gtf_file def run_bamqc(self, input_ref, input_info): # download the input and setup a working dir alignment_info = self.rau.download_alignment({'source_ref': input_ref}) bam_file_path = self.find_my_bam_file( alignment_info['destination_dir']) try: gtf_file = self.get_gtf_file(input_ref) except: gtf_file = '' workdir = os.path.join(self.scratch_dir, 'qualimap_' + str(int(time.time() * 10000))) options = [ '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat', 'html' ] if gtf_file: options += ['-gff', gtf_file] options.append('--java-mem-size={}'.format( self.JAVA_MEM_DEFAULT_SIZE)) # always use large mem multiplier = self._large_file(bam_file_path) if multiplier: window_size = multiplier * 400 print('using larger window size: {} and Java memory: {}'.format( window_size, self.JAVA_MEM_DEFAULT_SIZE)) options.append( '-nw {}'.format(window_size)) # increase size of windows self.run_cli_command('bamqc', options) package_info = self.package_output_folder( workdir, 'QualiMap_report', 'HTML report directory for QualiMap BAM QC', 'qualimapReport.html') return { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info } def run_multi_sample_qc(self, input_ref, input_info): # download the input and setup a working dir reads_alignment_info = self.get_alignments_from_set(input_ref) try: gtf_file = self.get_gtf_file(input_ref, set_op=True) except: gtf_file = '' suffix = 'qualimap_' + str(int(time.time() * 10000)) workdir = os.path.join(self.scratch_dir, suffix) os.makedirs(workdir) input_file_path = self.create_multi_qualimap_cfg( reads_alignment_info, workdir) options = [ '-d', input_file_path, '-r', '-c', '-outdir', workdir, '-outformat', 'html' ] if gtf_file: options += ['-gff', gtf_file] multiplier = self._large_file(input_file_path) if multiplier: window_size = multiplier * 400 print('using larger window size: {} and Java memory: {}'.format( window_size, self.JAVA_MEM_DEFAULT_SIZE)) options.append( '-nw {}'.format(window_size)) # increase size of windows 
options.append('--java-mem-size={}'.format( self.JAVA_MEM_DEFAULT_SIZE)) self.run_cli_command('multi-bamqc', options) package_info = self.package_output_folder( workdir, 'QualiMap_report', 'HTML report directory for QualiMap Multi-sample BAM QC', 'multisampleBamQcReport.html') return { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info } def get_alignments_from_set(self, alignment_set_ref): set_data = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 1 }) items = set_data['data']['items'] reads_alignment_data = [] for alignment in items: alignment_info = self.rau.download_alignment( {'source_ref': alignment['ref']}) bam_file_path = self.find_my_bam_file( alignment_info['destination_dir']) label = None if 'label' in alignment: label = alignment['label'] reads_alignment_data.append({ 'bam_file_path': bam_file_path, 'ref': alignment['ref'], 'label': label, 'info': alignment['info'] }) return reads_alignment_data def create_multi_qualimap_cfg(self, reads_alignment_info, workdir): # Group by labels if there is at least one defined use_labels = False for alignment in reads_alignment_info: if alignment['label']: use_labels = True break # write the file input_file_path = os.path.join(workdir, 'multi_input.txt') input_file = open(input_file_path, 'w') name_lookup = {} for alignment in reads_alignment_info: name = alignment['info'][1] if name in name_lookup: name_lookup[name] += 1 name = name + '_' + str(name_lookup[name]) else: name_lookup[name] = 1 input_file.write(name + '\t' + alignment['bam_file_path']) if use_labels: if alignment['label']: input_file.write('\t' + alignment['label']) else: input_file.write('\tunlabeled') input_file.write('\n') input_file.close() return input_file_path def get_run_info(self, params): info = self.get_obj_info(params['input_ref']) obj_type = self.get_type_from_obj_info(info) if obj_type in ['KBaseRNASeq.RNASeqAlignment']: return {'mode': 'single', 'input_info': info} if obj_type in [ 'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet' ]: return {'mode': 'multi', 'input_info': info} raise ValueError('Object type of input_ref is not valid, was: ' + str(obj_type)) def validate_params(self, params): if 'input_ref' not in params: raise ValueError( 'required parameter field "input_ref" was not set') create_report = False if 'create_report' in params: if int(params['create_report']) == 1: if 'output_workspace' not in params: raise ValueError( 'If "create_report" was set, then "output_workspace" is required' ) if not params['output_workspace']: raise ValueError( 'If "create_report" was set, then "output_workspace" is required' ) create_report = True params['create_report'] = create_report def run_cli_command(self, command, options, cwd=None): if command not in self.valid_commands: raise ValueError('Invalid QualiMap command: ' + str(command)) command = [self.QUALIMAP_PATH, command] + options print('Running: ' + ' '.join(command)) if not cwd: cwd = self.scratch_dir p = subprocess.Popen(command, cwd=cwd, shell=False) exitCode = p.wait() if (exitCode == 0): print('Success, exit code was: ' + str(exitCode)) else: raise ValueError('Error running command: ' + ' '.join(command) + '\n' + 'Exit Code: ' + str(exitCode)) def find_my_bam_file(self, dirpath): bam_path = None for f in os.listdir(dirpath): fullpath = os.path.join(dirpath, f) if os.path.isfile(fullpath) and f.lower().endswith('.bam'): if bam_path is not None: raise ValueError( 'Error! Too many BAM files were downloaded for this alignment!' 
) bam_path = fullpath if bam_path is None: raise ValueError( 'Error! No BAM files were downloaded for this alignment!') return bam_path def package_output_folder(self, folder_path, zip_file_name, zip_file_description, index_html_file): ''' Simple utility for packaging a folder and saving to shock ''' output = self.dfu.file_to_shock({ 'file_path': folder_path, 'make_handle': 0, 'pack': 'zip' }) return { 'shock_id': output['shock_id'], 'name': zip_file_name, 'description': zip_file_description, 'index_html_file_name': index_html_file } def get_type_from_obj_info(self, info): return info[2].split('-')[0] def get_obj_info(self, ref): return self.ws.get_object_info3({'objects': [{ 'ref': ref }]})['infos'][0]
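run_app wraps both QualiMap modes in a SIGALRM-based wall-clock limit (the 72-hour TIMEOUT constant) so a stalled bamqc run cannot hang the job indefinitely. The generic pattern is sketched below; run_with_timeout is an illustrative wrapper, not part of QualiMapRunner, and it only works in the main thread on POSIX systems.

import signal

def _timeout_handler(signum, frame):
    # Mirrors QualiMapRunner._timeout_handler: abort with an error when the alarm fires.
    raise ValueError('task exceeded the allotted wall-clock time')

def run_with_timeout(func, timeout_sec, *args, **kwargs):
    """Run func(*args, **kwargs), raising if it takes longer than timeout_sec seconds."""
    signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(int(timeout_sec))
    try:
        return func(*args, **kwargs)
    finally:
        signal.alarm(0)  # always disarm the pending alarm

QualiMapRunner uses the same idea inline: install the handler, arm the alarm before run_bamqc or run_multi_sample_qc, and disarm it with signal.alarm(0) once the command returns; on failure it falls back to writing an empty HTML report.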
class DataStagingUtils(object): def __init__(self, config, ctx): self.ctx = ctx self.scratch = os.path.abspath(config['scratch']) self.ws_url = config['workspace-url'] self.serviceWizardURL = config['srv-wiz-url'] self.callbackURL = config['SDK_CALLBACK_URL'] if not os.path.exists(self.scratch): os.makedirs(self.scratch) self.SE_flag = 'SE' self.PE_flag = 'PE' SERVICE_VER = 'release' # readsUtils_Client try: self.readsUtils_Client = ReadsUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER) except Exception as e: raise ValueError( 'Unable to instantiate readsUtils_Client with callbackURL: ' + self.callbackURL + ' ERROR: ' + str(e)) # setAPI_Client try: #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token']) # for SDK local. local doesn't work for SetAPI self.setAPI_Client = SetAPI( url=self.serviceWizardURL, token=self.ctx['token']) # for dynamic service except Exception as e: raise ValueError( 'Unable to instantiate setAPI_Client with serviceWizardURL: ' + self.serviceWizardURL + ' ERROR: ' + str(e)) def expand_input(self, input_refs): ''' Expand input based on an input data reference for Kaiju input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet ''' # config #SERVICE_VER = 'dev' SERVICE_VER = 'release' # expand any sets and build a non-redundant list of reads input objs ws = Workspace(self.ws_url) expanded_input = [] input_ref_seen = dict() SE_types = [ 'KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary' ] PE_types = [ 'KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary' ] [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple for input_ref in input_refs: input_info = ws.get_object_info3({'objects': [{ 'ref': input_ref }]})['infos'][0] obj_name = input_info[NAME_I] type_name = input_info[TYPE_I].split('-')[0] # ReadsSet if type_name in ['KBaseSets.ReadsSet']: try: input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({ 'ref': input_ref, 'include_item_info': 1 }) except Exception as e: raise ValueError( 'SetAPI FAILURE: Unable to get read library set object from workspace: (' + str(input_ref) + ")\n" + str(e)) for readsLibrary_obj in input_readsSet_obj['data']['items']: this_reads_ref = readsLibrary_obj['ref'] if this_reads_ref in input_ref_seen: continue input_ref_seen[this_reads_ref] = True this_reads_name = readsLibrary_obj['info'][NAME_I] reads_item_type = readsLibrary_obj['info'][TYPE_I] reads_item_type = re.sub( '-[0-9]+\.[0-9]+$', "", reads_item_type) # remove trailing version if reads_item_type in PE_types: this_reads_type = self.PE_flag elif reads_item_type in SE_types: this_reads_type = self.SE_flag else: raise ValueError("Can't handle read item type '" + reads_item_type + "' obj_name: '" + this_reads_name + " in Set: '" + str(input_ref) + "'") expanded_input.append({ 'ref': this_reads_ref, 'name': this_reads_name, 'type': this_reads_type }) # SingleEnd Library elif type_name in SE_types: this_reads_ref = input_ref if this_reads_ref in input_ref_seen: continue input_ref_seen[this_reads_ref] = True this_reads_name = obj_name this_reads_type = self.SE_flag expanded_input.append({ 'ref': this_reads_ref, 'name': this_reads_name, 'type': this_reads_type }) # PairedEnd Library elif type_name in PE_types: this_reads_ref = input_ref if this_reads_ref in input_ref_seen: continue input_ref_seen[this_reads_ref] = True this_reads_name = obj_name this_reads_type = self.PE_flag expanded_input.append({ 
'ref': this_reads_ref, 'name': this_reads_name, 'type': this_reads_type }) else: raise ValueError("Illegal type in input_refs: " + str(obj_name) + " (" + str(input_ref) + ") is of type: '" + str(type_name) + "'") return expanded_input def stage_input(self, input_item=None, subsample_percent=10, subsample_replicates=1, subsample_seed=1, fasta_file_extension='fastq'): ''' Stage input based on an input data reference for Kaiju input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet This method creates a directory in the scratch area with the set of Fasta/Fastq files, names will have the fasta_file_extension parameter tacked on. ex: staged_input = stage_input({'ref':<ref>,'name':<name>,'type':<type>}, subsample_percent, subsample_replicates, subsample_seed, 'fastq') staged_input {"input_dir": '...'} ''' # init staged_input = dict() replicate_input = [] # config #SERVICE_VER = 'dev' SERVICE_VER = 'release' # generate a folder in scratch to hold the input suffix = str(int(time.time() * 1000)) input_dir = os.path.join(self.scratch, 'input_reads_' + suffix) if not os.path.exists(input_dir): os.makedirs(input_dir) # # Download reads # # Paired End Lib if input_item['type'] == self.PE_flag: try: readsLibrary = self.readsUtils_Client.download_reads({ 'read_libraries': [input_item['ref']], 'interleaved': 'false' }) except Exception as e: raise ValueError( 'Unable to get read library object from workspace: (' + str(input_item['ref']) + ")\n" + str(e)) input_fwd_file_path = readsLibrary['files'][ input_item['ref']]['files']['fwd'] input_rev_file_path = readsLibrary['files'][ input_item['ref']]['files']['rev'] fwd_filename = os.path.join( input_dir, input_item['name'] + '.fwd.' + fasta_file_extension) rev_filename = os.path.join( input_dir, input_item['name'] + '.rev.' + fasta_file_extension) if input_fwd_file_path != fwd_filename: shutil.move(input_fwd_file_path, fwd_filename) if input_rev_file_path != rev_filename: shutil.move(input_rev_file_path, rev_filename) input_item['fwd_file'] = fwd_filename input_item['rev_file'] = rev_filename if not os.path.isfile(fwd_filename): raise ValueError('Error generating reads file ' + fwd_filename) if not os.path.isfile(rev_filename): raise ValueError('Error generating reads file ' + rev_filename) # make sure fasta file isn't empty min_fasta_len = 1 if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len): raise ValueError('Reads Library is empty in filename: ' + str(fwd_filename)) if not self._fasta_seq_len_at_least(rev_filename, min_fasta_len): raise ValueError('Reads Library is empty in filename: ' + str(rev_filename)) # Single End Lib elif input_item['type'] == self.SE_flag: try: readsLibrary = self.readsUtils_Client.download_reads( {'read_libraries': [input_item['ref']]}) except Exception as e: raise ValueError( 'Unable to get read library object from workspace: (' + str(input_item['ref']) + ")\n" + str(e)) input_fwd_file_path = readsLibrary['files'][ input_item['ref']]['files']['fwd'] fwd_filename = os.path.join( input_dir, input_item['name'] + '.fwd.' 
+ fasta_file_extension) if input_fwd_file_path != fwd_filename: shutil.move(input_fwd_file_path, fwd_filename) input_item['fwd_file'] = fwd_filename if not os.path.isfile(fwd_filename): raise ValueError('Error generating reads file ' + fwd_filename) # make sure fasta file isn't empty min_fasta_len = 1 if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len): raise ValueError('Reads Library is empty in filename: ' + str(fwd_filename)) else: raise ValueError("No type set for input library " + str(input_item['name']) + " (" + str(input_item['ref']) + ")") # # Subsample # if subsample_percent == 100: replicate_input = [input_item] else: replicate_input = self._randomly_subsample_reads( input_item, subsample_percent=subsample_percent, subsample_replicates=subsample_replicates, subsample_seed=subsample_seed) # free up disk os.remove(input_item['fwd_file']) if input_item['type'] == self.PE_flag: os.remove(input_item['rev_file']) # return input file info #staged_input['input_dir'] = input_dir #staged_input['folder_suffix'] = suffix staged_input['replicate_input'] = replicate_input return staged_input def _randomly_subsample_reads(self, input_item=None, subsample_percent=100, subsample_replicates=1, subsample_seed=1): replicate_files = [] split_num = subsample_replicates # for now can only do percentage instead of raw cnt of reads per subsample use_reads_num = False use_reads_perc = True reads_num = 0 # not used. subsample_percent used instead # init randomizer random.seed(subsample_seed) # Paired End # if input_item['type'] == self.PE_flag: print("SUBSAMPLING PE library " + input_item['name']) # DEBUG # file paths input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file']) input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path) input_rev_path = re.sub("\.fastq$", "", input_item['rev_file']) input_rev_path = re.sub("\.FASTQ$", "", input_rev_path) output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired" output_rev_paired_file_path_base = input_rev_path + "_rev_paired" # set up for file io total_paired_reads = 0 total_unpaired_fwd_reads = 0 total_unpaired_rev_reads = 0 total_paired_reads_by_set = [] fwd_ids = dict() paired_ids = dict() paired_ids_list = [] paired_lib_i = dict() paired_buf_size = 100000 recs_beep_n = 1000000 # read fwd file to get fwd ids # rec_cnt = 0 # DEBUG print("GETTING IDS") # DEBUG with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle: rec_line_i = -1 for line in input_reads_file_handle: rec_line_i += 1 if rec_line_i == 3: rec_line_i = -1 elif rec_line_i == 0: if not line.startswith('@'): raise ValueError("badly formatted rec line: '" + line + "'") read_id = line.rstrip('\n') read_id = re.sub("[ \t]+.*$", "", read_id) read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id) fwd_ids[read_id] = True # DEBUG # if rec_cnt % 100 == 0: # print ("read_id: '"+str(read_id)+"'") # rec_cnt += 1 # read reverse to determine paired print("DETERMINING PAIRED IDS") # DEBUG with open(input_item['rev_file'], 'r', 0) as input_reads_file_handle: rec_line_i = -1 for line in input_reads_file_handle: rec_line_i += 1 if rec_line_i == 3: rec_line_i = -1 elif rec_line_i == 0: if not line.startswith('@'): raise ValueError("badly formatted rec line: '" + line + "'") read_id = line.rstrip('\n') read_id = re.sub("[ \t]+.*$", "", read_id) read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id) if fwd_ids[read_id]: paired_ids[read_id] = True paired_ids_list.append(read_id) # DEBUG # if rec_cnt % 100 == 0: # print ("read_id: '"+str(read_id)+"'") # 
rec_cnt += 1 total_paired_reads = len(paired_ids_list) print("TOTAL PAIRED READS CNT: " + str(total_paired_reads)) # DEBUG # Determine sublibrary sizes if use_reads_num: reads_per_lib = reads_num if reads_per_lib > total_paired_reads // split_num: raise ValueError( "must specify reads_num <= total_paired_reads_cnt / split_num. You have reads_num:" + str(reads_num) + " > total_paired_reads_cnt:" + str(total_paired_reads) + " / split_num:" + str(split_num) + ". Instead try reads_num <= " + str(total_paired_reads // split_num)) elif use_reads_perc: reads_per_lib = int( (subsample_percent / 100.0) * total_paired_reads) if reads_per_lib > total_paired_reads // split_num: raise ValueError( "must specify reads_perc <= 1 / split_num. You have reads_perc:" + str(subsample_percent) + " > 1 / split_num:" + str(split_num) + ". Instead try reads_perc <= " + str(int(100 * 1 / split_num))) else: raise ValueError( "error in logic reads_num vs. reads_perc logic") # Determine random membership in each sublibrary print("GETTING RANDOM SUBSAMPLES") # DEBUG for i, read_id in enumerate( random.sample(paired_ids_list, reads_per_lib * split_num)): lib_i = i % split_num paired_lib_i[read_id] = lib_i # split fwd paired print("WRITING FWD SPLIT PAIRED") # DEBUG paired_output_reads_file_handles = [] for lib_i in range(split_num): paired_output_reads_file_handles.append( open( output_fwd_paired_file_path_base + "-" + str(lib_i) + ".fastq", 'w', paired_buf_size)) total_paired_reads_by_set.append(0) rec_buf = [] last_read_id = None paired_cnt = 0 capture_type_paired = False with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle: rec_line_i = -1 for line in input_reads_file_handle: rec_line_i += 1 if rec_line_i == 3: rec_line_i = -1 elif rec_line_i == 0: if not line.startswith('@'): raise ValueError("badly formatted rec line: '" + line + "'") if last_read_id != None: if capture_type_paired: lib_i = paired_lib_i[last_read_id] paired_output_reads_file_handles[ lib_i].writelines(rec_buf) paired_cnt += 1 total_paired_reads_by_set[lib_i] += 1 if paired_cnt != 0 and paired_cnt % recs_beep_n == 0: print("\t" + str(paired_cnt) + " recs processed") else: #unpaired_fwd_buf.extend(rec_buf) pass rec_buf = [] read_id = line.rstrip('\n') read_id = re.sub("[ \t]+.*$", "", read_id) read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id) last_read_id = read_id try: found = paired_lib_i[read_id] capture_type_paired = True except: total_unpaired_fwd_reads += 1 capture_type_paired = False rec_buf.append(line) # last rec if len(rec_buf) > 0: if capture_type_paired: lib_i = paired_lib_i[last_read_id] paired_output_reads_file_handles[lib_i].writelines( rec_buf) paired_cnt += 1 if paired_cnt != 0 and paired_cnt % recs_beep_n == 0: print("\t" + str(paired_cnt) + " recs processed") else: #unpaired_fwd_buf.extend(rec_buf) pass rec_buf = [] for output_handle in paired_output_reads_file_handles: output_handle.close() print("\t" + str(paired_cnt) + " FWD recs processed") # split rev paired print("WRITING REV SPLIT PAIRED") # DEBUG paired_output_reads_file_handles = [] for lib_i in range(split_num): paired_output_reads_file_handles.append( open( output_rev_paired_file_path_base + "-" + str(lib_i) + ".fastq", 'w', paired_buf_size)) rec_buf = [] last_read_id = None paired_cnt = 0 capture_type_paired = False with open(input_item['rev_file'], 'r', 0) as input_reads_file_handle: rec_line_i = -1 for line in input_reads_file_handle: rec_line_i += 1 if rec_line_i == 3: rec_line_i = -1 elif rec_line_i == 0: if not line.startswith('@'): 
raise ValueError("badly formatted rec line: '" + line + "'") if last_read_id != None: if capture_type_paired: lib_i = paired_lib_i[last_read_id] paired_output_reads_file_handles[ lib_i].writelines(rec_buf) paired_cnt += 1 if paired_cnt != 0 and paired_cnt % recs_beep_n == 0: print("\t" + str(paired_cnt) + " recs processed") else: #unpaired_fwd_buf.extend(rec_buf) pass rec_buf = [] read_id = line.rstrip('\n') read_id = re.sub("[ \t]+.*$", "", read_id) read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id) last_read_id = read_id try: found = paired_lib_i[read_id] capture_type_paired = True except: total_unpaired_rev_reads += 1 capture_type_paired = False rec_buf.append(line) # last rec if len(rec_buf) > 0: if capture_type_paired: lib_i = paired_lib_i[last_read_id] paired_output_reads_file_handles[lib_i].writelines( rec_buf) paired_cnt += 1 if paired_cnt != 0 and paired_cnt % recs_beep_n == 0: print("\t" + str(paired_cnt) + " recs processed") else: #unpaired_fwd_buf.extend(rec_buf) pass rec_buf = [] for output_handle in paired_output_reads_file_handles: output_handle.close() print("\t" + str(paired_cnt) + " REV recs processed") # summary report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[ 'name'] + "\n" report += "TOTAL PAIRED READS: " + str(total_paired_reads) + "\n" report += "TOTAL UNPAIRED FWD READS (discarded): " + str( total_unpaired_fwd_reads) + "\n" report += "TOTAL UNPAIRED REV READS (discarded): " + str( total_unpaired_rev_reads) + "\n" report += "\n" for lib_i in range(split_num): report += "PAIRED READS IN SET " + str(lib_i) + ": " + str( total_paired_reads_by_set[lib_i]) + "\n" print(report) # make replicate objects to return # for replicate_i,replicate_item in enumerate(replicate_files): # replicate_input.append({'fwd_file': replicate_item['fwd_file'], # 'type': input_item['type'], # 'name': input_item['name']+"-"+str(replicate_i) # }) # if input_item['type'] == self.PE_flag: # replicate_input[replicate_i]['rev_file'] = replicate_item['rev_file'] print("MAKING REPLICATE OBJECT") # DEBUG paired_obj_refs = [] for lib_i in range(split_num): output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str( lib_i) + ".fastq" output_rev_paired_file_path = output_rev_paired_file_path_base + "-" + str( lib_i) + ".fastq" if not os.path.isfile (output_fwd_paired_file_path) \ or os.path.getsize (output_fwd_paired_file_path) == 0 \ or not os.path.isfile (output_rev_paired_file_path) \ or os.path.getsize (output_rev_paired_file_path) == 0: raise ValueError("failed to create paired output") else: zero_pad = '0' * (len(str(split_num)) - len(str(lib_i + 1))) replicate_files.append({ 'fwd_file': output_fwd_paired_file_path, 'rev_file': output_rev_paired_file_path, 'ref': input_item[ 'ref'], # note: this is for the src, not the subsample which is not saved 'type': input_item['type'], 'name': input_item['name'] + '-' + zero_pad + str(lib_i + 1) }) # SingleEndLibrary # elif input_item['type'] == self.SE_flag: print("SUBSAMPLING SE library " + input_item['name']) # file paths input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file']) input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path) output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired" # get "paired" ids print("DETERMINING IDS") # DEBUG paired_ids = dict() paired_ids_list = [] paired_lib_i = dict() paired_buf_size = 100000 recs_beep_n = 100000 with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle: rec_line_i = -1 for line in input_reads_file_handle: rec_line_i += 1 if 
rec_line_i == 3: rec_line_i = -1 elif rec_line_i == 0: if not line.startswith('@'): raise ValueError("badly formatted rec line: '" + line + "'") read_id = line.rstrip('\n') read_id = re.sub("[ \t]+.*$", "", read_id) read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id) if read_id in paired_ids: raise ValueError("repeat read_id: " + read_id) paired_ids[read_id] = True paired_ids_list.append(read_id) # DEBUG # if rec_cnt % 100 == 0: # print ("read_id: '"+str(read_id)+"'") # rec_cnt += 1 total_paired_reads = len(paired_ids_list) print("TOTAL READS CNT: " + str(total_paired_reads)) # DEBUG # Determine sublibrary sizes if use_reads_num: reads_per_lib = reads_num if reads_per_lib > total_paired_reads // split_num: raise ValueError( "must specify reads_num <= total_paired_reads_cnt / split_num. You have reads_num:" + str(reads_num) + " > total_paired_reads_cnt:" + str(total_paired_reads) + " / split_num:" + str(split_num) + ". Instead try reads_num <= " + str(total_paired_reads // split_num)) elif use_reads_perc: reads_per_lib = int( (subsample_percent / 100.0) * total_paired_reads) if reads_per_lib > total_paired_reads // split_num: raise ValueError( "must specify reads_perc <= 1 / split_num. You have reads_perc:" + str(subsample_percent) + " > 1 / split_num:" + str(split_num) + ". Instead try reads_perc <= " + str(int(100 * 1 / split_num))) else: raise ValueError( "error in logic reads_num vs. reads_perc logic") # Determine random membership in each sublibrary print("GETTING RANDOM SUBSAMPLES") # DEBUG for i, read_id in enumerate( random.sample(paired_ids_list, reads_per_lib * split_num)): lib_i = i % split_num paired_lib_i[read_id] = lib_i # set up for file io total_paired_reads = 0 total_paired_reads_by_set = [] paired_buf_size = 1000000 # split reads print("WRITING SPLIT SINGLE END READS") # DEBUG paired_output_reads_file_handles = [] for lib_i in range(split_num): paired_output_reads_file_handles.append( open( output_fwd_paired_file_path_base + "-" + str(lib_i) + ".fastq", 'w', paired_buf_size)) total_paired_reads_by_set.append(0) rec_buf = [] last_read_id = None paired_cnt = 0 recs_beep_n = 1000000 with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle: rec_line_i = -1 for line in input_reads_file_handle: rec_line_i += 1 if rec_line_i == 3: rec_line_i = -1 elif rec_line_i == 0: if not line.startswith('@'): raise ValueError("badly formatted rec line: '" + line + "'") total_paired_reads += 1 if last_read_id != None: try: lib_i = paired_lib_i[last_read_id] total_paired_reads_by_set[lib_i] += 1 paired_output_reads_file_handles[ lib_i].writelines(rec_buf) paired_cnt += 1 except: pass if paired_cnt != 0 and paired_cnt % recs_beep_n == 0: print("\t" + str(paired_cnt) + " recs processed") rec_buf = [] read_id = line.rstrip('\n') read_id = re.sub("[ \t]+.*$", "", read_id) read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id) last_read_id = read_id rec_buf.append(line) # last rec if len(rec_buf) > 0: if last_read_id != None: try: lib_i = paired_lib_i[last_read_id] total_paired_reads_by_set[lib_i] += 1 paired_output_reads_file_handles[lib_i].writelines( rec_buf) paired_cnt += 1 except: pass if paired_cnt != 0 and paired_cnt % recs_beep_n == 0: print("\t" + str(paired_cnt) + " recs processed") rec_buf = [] for output_handle in paired_output_reads_file_handles: output_handle.close() # summary report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[ 'name'] + "\n" report += "TOTAL READS: " + str(total_paired_reads) + "\n" for lib_i in range(split_num): 
report += "SINGLE END READS IN SET " + str(lib_i) + ": " + str( total_paired_reads_by_set[lib_i]) + "\n" print(report) # make replicate objects to return print("MAKING REPLICATE OBJECTS") # DEBUG paired_obj_refs = [] for lib_i in range(split_num): output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str( lib_i) + ".fastq" if not os.path.isfile (output_fwd_paired_file_path) \ or os.path.getsize (output_fwd_paired_file_path) == 0: raise ValueError("failed to create paired output") else: zero_pad = '0' * (len(str(split_num)) - len(str(lib_i + 1))) replicate_files.append({ 'fwd_file': output_fwd_paired_file_path, 'ref': input_item[ 'ref'], # note: this is for the src, not the subsample which is not saved 'type': input_item['type'], 'name': input_item['name'] + '-' + zero_pad + str(lib_i + 1) }) else: raise ValueError("unknown ReadLibrary type:" + str(input_item['type']) + " for readslibrary: " + input_item['name']) return replicate_files def _fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1): ''' counts the number of non-header, non-whitespace characters in a FASTA file ''' seq_len = 0 with open(fasta_path, 'r', 0) as fasta_handle: for line in fasta_handle: line = line.strip() if line.startswith('>'): continue line = line.replace(' ', '') seq_len += len(line) if seq_len >= min_fasta_len: return True return False
class QualiMapRunner: QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap' def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url): self.scratch_dir = scratch_dir self.rau = ReadsAlignmentUtils(callback_url) self.kbr = KBaseReport(callback_url) self.dfu = DataFileUtil(callback_url) self.set_api = SetAPI(srv_wiz_url) self.ws = Workspace(workspace_url) self.valid_commands = ['bamqc', 'multi-bamqc'] def run_app(self, params): self.validate_params(params) print('Validated Params = ') pprint(params) run_info = self.get_run_info(params) if run_info['mode'] == 'single': result = self.run_bamqc(params['input_ref'], run_info['input_info']) elif run_info['mode'] == 'multi': result = self.run_multi_sample_qc(params['input_ref'], run_info['input_info']) else: raise ValueError( 'Error in fetching the type to determine run settings.') if params['create_report']: result = self.create_report(result, params['output_workspace']) return result def create_report(self, result, output_workspace): qc_result_zip_info = result['qc_result_zip_info'] report_info = self.kbr.create_extended_report({ 'message': '', 'objects_created': [], 'direct_html_link_index': 0, 'html_links': [{ 'shock_id': qc_result_zip_info['shock_id'], 'name': qc_result_zip_info['index_html_file_name'], 'label': qc_result_zip_info['name'] }], 'report_object_name': 'qualimap_report' + str(uuid.uuid4()), 'workspace_name': output_workspace }) result['report_name'] = report_info['name'] result['report_ref'] = report_info['ref'] return result def run_bamqc(self, input_ref, input_info): # download the input and setup a working dir alignment_info = self.rau.download_alignment({'source_ref': input_ref}) bam_file_path = self.find_my_bam_file( alignment_info['destination_dir']) workdir = os.path.join(self.scratch_dir, 'qualimap_' + str(int(time.time() * 10000))) options = [ '-bam', bam_file_path, '-outdir', workdir, '-outformat', 'html' ] self.run_cli_command('bamqc', options) package_info = self.package_output_folder( workdir, 'QualiMap_report', 'HTML report directory for QualiMap BAM QC', 'qualimapReport.html') return { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info } def run_multi_sample_qc(self, input_ref, input_info): # download the input and setup a working dir reads_alignment_info = self.get_alignments_from_set(input_ref) suffix = 'qualimap_' + str(int(time.time() * 10000)) workdir = os.path.join(self.scratch_dir, suffix) os.makedirs(workdir) input_file_path = self.create_multi_qualimap_cfg( reads_alignment_info, workdir) options = [ '-d', input_file_path, '-r', '-outdir', workdir, '-outformat', 'html' ] self.run_cli_command('multi-bamqc', options) package_info = self.package_output_folder( workdir, 'QualiMap_report', 'HTML report directory for QualiMap Multi-sample BAM QC', 'multisampleBamQcReport.html') return { 'qc_result_folder_path': workdir, 'qc_result_zip_info': package_info } def get_alignments_from_set(self, alignment_set_ref): set_data = self.set_api.get_reads_alignment_set_v1({ 'ref': alignment_set_ref, 'include_item_info': 1 }) items = set_data['data']['items'] reads_alignment_data = [] for alignment in items: alignment_info = self.rau.download_alignment( {'source_ref': alignment['ref']}) bam_file_path = self.find_my_bam_file( alignment_info['destination_dir']) label = None if 'label' in alignment: label = alignment['label'] reads_alignment_data.append({ 'bam_file_path': bam_file_path, 'ref': alignment['ref'], 'label': label, 'info': alignment['info'] }) return reads_alignment_data def 
    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the config file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1

            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()
        return input_file_path

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in ['KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet']:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' + str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError('required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError('If "create_report" was set, then "output_workspace" is required')
                if not params['output_workspace']:
                    raise ValueError('If "create_report" was set, then "output_workspace" is required')
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if exitCode == 0:
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) + '\n' +
                             'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError('Error! Too many BAM files were downloaded for this alignment!')
                bam_path = fullpath
        if bam_path is None:
            raise ValueError('Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name, zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving it to Shock '''
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0]
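# Illustrative only: a minimal sketch of how the QualiMapRunner class above might be
# driven from an SDK implementation module. The config keys used here ('scratch',
# 'SDK_CALLBACK_URL', 'workspace-url', 'srv-wiz-url', 'alignment_ref',
# 'output_workspace') are assumptions for this example, not fields guaranteed by the
# original module.
def _example_run_qualimap(runner_config):
    runner = QualiMapRunner(runner_config['scratch'],            # scratch_dir
                            runner_config['SDK_CALLBACK_URL'],   # callback_url
                            runner_config['workspace-url'],      # workspace_url
                            runner_config['srv-wiz-url'])        # srv_wiz_url
    # 'input_ref' may be a single RNASeqAlignment or a ReadsAlignmentSet reference;
    # run_app() picks bamqc or multi-bamqc accordingly.
    return runner.run_app({
        'input_ref': runner_config['alignment_ref'],
        'create_report': 1,
        'output_workspace': runner_config['output_workspace']
    })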
def exec_remove_adapters(self, ctx, params):
    """
    :param params: instance of type "RemoveAdaptersParams" -> structure:
       parameter "output_workspace" of String, parameter "output_object_name" of
       String, parameter "input_reads" of type "ws_ref" (@ref ws), parameter
       "five_prime" of type "FivePrimeOptions" (unfortunately, we have to name the
       fields uniquely between 3' and 5' options due to the current implementation
       of grouped parameters) -> structure: parameter "adapter_sequence_5P" of
       String, parameter "anchored_5P" of type "boolean" (@range (0, 1)),
       parameter "three_prime" of type "ThreePrimeOptions" -> structure: parameter
       "adapter_sequence_3P" of String, parameter "anchored_3P" of type "boolean"
       (@range (0, 1)), parameter "error_tolerance" of Double, parameter
       "min_overlap_length" of Long, parameter "min_read_length" of Long,
       parameter "discard_untrimmed" of type "boolean" (@range (0, 1))
    :returns: instance of type "exec_RemoveAdaptersResult" -> structure:
       parameter "report" of String, parameter "output_reads_ref" of String
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN exec_remove_adapters
    console = []
    self.log(console, 'Running exec_remove_adapters() with parameters: ')
    self.log(console, "\n" + pformat(params))
    self.log(console, "-----------------------------------------------\n")

    report = ''
    returnVal = dict()
    returnVal['output_reads_ref'] = None

    token = ctx['token']
    wsClient = workspaceService(self.config['workspace-url'], token=token)
    ws = Workspace(self.config['workspace-url'], token=token)
    #setAPI_Client = SetAPI(url=self.config['SDK_CALLBACK_URL'], token=token)  # for SDK local; doesn't work for SetAPI
    setAPI_Client = SetAPI(url=self.config['service-wizard-url'], token=token)  # for dynamic service
    headers = {'Authorization': 'OAuth ' + token}
    env = os.environ.copy()
    env['KB_AUTH_TOKEN'] = token

    # 0. param checks
    required_params = ['output_workspace', 'input_reads', 'output_object_name']
    for arg in required_params:
        if arg not in params or params[arg] is None or params[arg] == '':
            raise ValueError("Must define required param: '" + arg + "'")

    # 1. load provenance
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

    # 2. Determine whether read library, ReadsSet, or RNASeqSampleSet is the input object
    #
    try:
        # object_info tuple
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)

        input_reads_obj_info = wsClient.get_object_info_new(
            {'objects': [{'ref': params['input_reads']}]})[0]
        input_reads_obj_type = input_reads_obj_info[TYPE_I]
        # remove trailing version
        input_reads_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", input_reads_obj_type)
        #input_reads_obj_version = input_reads_obj_info[VERSION_I]  # this is object version, not type version
    except Exception as e:
        raise ValueError('Unable to get read library object from workspace: (' +
                         str(params['input_reads']) + ')' + str(e))

    acceptable_types = [
        "KBaseSets.ReadsSet",
        "KBaseRNASeq.RNASeqSampleSet",
        "KBaseFile.PairedEndLibrary",
        "KBaseFile.SingleEndLibrary",
        "KBaseAssembly.PairedEndLibrary",
        "KBaseAssembly.SingleEndLibrary"
    ]
    if input_reads_obj_type not in acceptable_types:
        raise ValueError("Input reads of type: '" + input_reads_obj_type +
                         "'. Must be one of " + ", ".join(acceptable_types))
    # 3. Retrieve the set details
    #
    readsSet_ref_list = []
    readsSet_names_list = []
    readsSet_types_list = []
    if "KBaseSets.ReadsSet" in input_reads_obj_type:
        try:
            input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                'ref': params['input_reads'],
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError(
                'SetAPI FAILURE: Unable to get read library set object from workspace: (' +
                str(params['input_reads']) + ")\n" + str(e))

        for readsLibrary_obj in input_readsSet_obj['data']['items']:
            readsSet_ref_list.append(readsLibrary_obj['ref'])
            NAME_I = 1
            TYPE_I = 2
            readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
            this_type = readsLibrary_obj['info'][TYPE_I]
            this_type = re.sub(r'-[0-9]+\.[0-9]+$', "", this_type)  # remove trailing version
            readsSet_types_list.append(this_type)

    elif "KBaseRNASeq.RNASeqSampleSet" in input_reads_obj_type:
        sample_set = ws.get_objects2(
            {"objects": [{"ref": params['input_reads']}]})["data"][0]["data"]
        sample_refs = list()
        for i in range(len(sample_set["sample_ids"])):
            readsSet_ref_list.append(sample_set["sample_ids"][i])
            sample_refs.append({"ref": sample_set["sample_ids"][i]})

        info = ws.get_object_info3({"objects": sample_refs})
        for j in range(len(info["infos"])):
            NAME_I = 1
            TYPE_I = 2
            readsSet_names_list.append(info["infos"][j][NAME_I])
            sample_type = info["infos"][j][TYPE_I]
            sample_type = re.sub(r'-[0-9]+\.[0-9]+$', "", sample_type)  # remove trailing version
            readsSet_types_list.append(sample_type)

    else:
        readsSet_ref_list = [params['input_reads']]
        readsSet_names_list = [params['output_object_name']]
        readsSet_types_list = [input_reads_obj_type]

    # 4. Iterate through readsLibrary members of the set
    #
    report = ''
    cutadapt_readsSet_ref = None
    cutadapt_readsLib_refs = []

    for reads_item_i, input_reads_library_ref in enumerate(readsSet_ref_list):
        exec_remove_adapters_OneLibrary_params = {
            'output_workspace': params['output_workspace'],
            'input_reads': input_reads_library_ref,
            'reads_type': readsSet_types_list[reads_item_i]
        }
        if (input_reads_obj_type != "KBaseSets.ReadsSet"
                and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
            exec_remove_adapters_OneLibrary_params['output_object_name'] = \
                params['output_object_name']
        else:
            exec_remove_adapters_OneLibrary_params['output_object_name'] = \
                readsSet_names_list[reads_item_i] + "_cutadapt"

        optional_params = [
            'error_tolerance', 'min_overlap_length', 'min_read_length', 'discard_untrimmed'
        ]
        optional_g_params = {
            'five_prime': ['adapter_sequence_5P', 'anchored_5P'],
            'three_prime': ['adapter_sequence_3P', 'anchored_3P']
        }
        for arg in optional_params:
            if arg in params and params[arg] is not None:
                exec_remove_adapters_OneLibrary_params[arg] = params[arg]
        for group in optional_g_params.keys():
            if group in params and params[group] is not None:
                exec_remove_adapters_OneLibrary_params[group] = dict()
                for arg in optional_g_params[group]:
                    if arg in params[group] and params[group][arg] is not None:
                        exec_remove_adapters_OneLibrary_params[group][arg] = params[group][arg]

        msg = "\n\nRUNNING exec_remove_adapters_OneLibrary() ON LIBRARY: " + \
            str(input_reads_library_ref) + " " + str(readsSet_names_list[reads_item_i]) + "\n"
        msg += "----------------------------------------------------------------------------\n"
        report += msg
        self.log(console, msg)

        # RUN
        exec_remove_adapters_OneLibrary_retVal = self.exec_remove_adapters_OneLibrary(
            ctx, exec_remove_adapters_OneLibrary_params)[0]
        report += exec_remove_adapters_OneLibrary_retVal['report'] + "\n\n"
        cutadapt_readsLib_refs.append(
            exec_remove_adapters_OneLibrary_retVal['output_reads_ref'])
    # 5. Conclude
    #
    # Just one Library
    if (input_reads_obj_type != "KBaseSets.ReadsSet"
            and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
        # create return output object
        result = {
            'report': report,
            'output_reads_ref': cutadapt_readsLib_refs[0],
        }
    # ReadsSet or SampleSet
    else:
        # save cutadapt readsSet
        some_cutadapt_output_created = False
        items = []
        for i, lib_ref in enumerate(cutadapt_readsLib_refs):
            if lib_ref is None:
                #items.append(None)  # can't have 'None' items in a ReadsSet
                continue
            else:
                some_cutadapt_output_created = True
                try:
                    label = input_readsSet_obj['data']['items'][i]['label']
                except:
                    NAME_I = 1
                    label = ws.get_object_info3(
                        {'objects': [{'ref': lib_ref}]})['infos'][0][NAME_I]
                label = label + "_cutadapt"

                items.append({
                    'ref': lib_ref,
                    'label': label
                    #'data_attachment': ,
                    #'info':
                })

        if some_cutadapt_output_created:
            reads_desc_ext = " + Cutadapt"
            #reads_name_ext = "_cutadapt"
            descText = ""
            reads_name_ext = ""
            try:
                descText = input_readsSet_obj['data']['description']
            except:
                NAME_I = 1
                descText = ws.get_object_info3(
                    {'objects': [{'ref': params['input_reads']}]})['infos'][0][NAME_I]
            descText = descText + reads_desc_ext

            output_readsSet_obj = {'description': descText, 'items': items}
            output_readsSet_name = str(params['output_object_name']) + reads_name_ext
            cutadapt_readsSet_ref = setAPI_Client.save_reads_set_v1({
                'workspace_name': params['output_workspace'],
                'output_object_name': output_readsSet_name,
                'data': output_readsSet_obj
            })['set_ref']
        else:
            raise ValueError("No cutadapt output created")

        # create return output object
        result = {
            'report': report,
            'output_reads_ref': cutadapt_readsSet_ref
        }
    #END exec_remove_adapters

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method exec_remove_adapters return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
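# Illustrative only: one possible shape for the 'params' argument to
# exec_remove_adapters(), following the docstring above. Every value below (the
# workspace, object names, reference, adapter sequences, and thresholds) is a
# made-up placeholder for this sketch, not data from the original module.
EXAMPLE_REMOVE_ADAPTERS_PARAMS = {
    'output_workspace': 'my_workspace',
    'output_object_name': 'my_reads_cutadapt',
    'input_reads': '123/4/5',   # reads library, ReadsSet, or RNASeqSampleSet ref
    'five_prime': {'adapter_sequence_5P': 'GATCGGAAGAGC', 'anchored_5P': 0},
    'three_prime': {'adapter_sequence_3P': 'AGATCGGAAGAGC', 'anchored_3P': 0},
    'error_tolerance': 0.1,
    'min_overlap_length': 3,
    'min_read_length': 20,
    'discard_untrimmed': 0
}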
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage input based on an input data reference for CheckM

    input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

    This method creates a directory in the scratch area with the set of Fasta files;
    names will have the fasta_file_extension parameter tacked on.

        ex:

        staged_input = stage_input('124/15/1', 'fna')

        staged_input
        {"input_dir": '...'}
    '''
    # config
    #SERVICE_VER = 'dev'
    SERVICE_VER = 'release'

    # generate a folder in scratch to hold the input
    suffix = str(int(time.time() * 1000))
    input_dir = os.path.join(self.scratch, 'bins_' + suffix)
    all_seq_fasta = os.path.join(
        self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # 2) based on type, download the files
    ws = Workspace(self.ws_url)
    input_info = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0]

    # object_info tuple:
    #  0 obj_id objid - the numerical id of the object.
    #  1 obj_name name - the name of the object.
    #  2 type_string type - the type of the object.
    #  3 timestamp save_date - the save date of the object.
    #  4 obj_ver ver - the version of the object.
    #  5 username saved_by - the user that saved or copied the object.
    #  6 ws_id wsid - the workspace containing the object.
    #  7 ws_name workspace - the workspace containing the object.
    #  8 string chsum - the md5 checksum of the object.
    #  9 int size - the size of the object in bytes.
    # 10 usermeta meta - arbitrary user-supplied metadata about the object.
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
     WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple

    obj_name = input_info[NAME_I]
    type_name = input_info[TYPE_I].split('-')[0]

    # auClient
    try:
        auClient = AssemblyUtil(self.callbackURL,
                                token=self.ctx['token'],
                                service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callbackURL: ' +
                         self.callbackURL + ' ERROR: ' + str(e))

    # setAPI_Client
    try:
        #setAPI_Client = SetAPI(url=self.callbackURL, token=self.ctx['token'])  # for SDK local. local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=self.ctx['token'])  # for dynamic service
    except Exception as e:
        raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                         self.serviceWizardURL + ' ERROR: ' + str(e))

    # mguClient
    try:
        mguClient = MetagenomeUtils(self.callbackURL,
                                    token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate mguClient with callbackURL: ' +
                         self.callbackURL + ' ERROR: ' + str(e))

    # Standard Single Assembly
    #
    if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
        # create file data
        filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
        auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
        if not os.path.isfile(filename):
            raise ValueError(
                'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
        # make sure fasta file isn't empty
        min_fasta_len = 1
        if not self.fasta_seq_len_at_least(filename, min_fasta_len):
            raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # AssemblySet
    #
    elif type_name == 'KBaseSets.AssemblySet':

        # read assemblySet
        try:
            assemblySet_obj = setAPI_Client.get_assembly_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + input_ref + ')' + str(e))

        assembly_refs = []
        assembly_names = []
        for assembly_item in assemblySet_obj['data']['items']:
            this_assembly_ref = assembly_item['ref']
            # assembly obj info
            try:
                this_assembly_info = ws.get_object_info_new(
                    {'objects': [{'ref': this_assembly_ref}]})[0]
                this_assembly_name = this_assembly_info[NAME_I]
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 this_assembly_ref + '): ' + str(e))
            assembly_refs.append(this_assembly_ref)
            assembly_names.append(this_assembly_name)

        # create file data (name for file is what's reported in results)
        for ass_i, assembly_ref in enumerate(assembly_refs):
            this_name = assembly_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError(
                    'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Binned Contigs
    #
    elif type_name == 'KBaseMetagenomes.BinnedContigs':

        # download the bins as fasta and set the input folder name
        bin_file_dir = mguClient.binned_contigs_to_file({
            'input_ref': input_ref,
            'save_to_shock': 0
        })['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        # make sure each fasta file isn't empty
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        for (dirpath, dirnames, filenames) in os.walk(input_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(input_dir, fasta_file)
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                    raise ValueError('Binned Assembly is empty for fasta_path: ' + str(fasta_path))
            break

    # Genome and GenomeSet
    #
    elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
        genome_obj_names = []
        genome_sci_names = []
        genome_assembly_refs = []

        if type_name == 'KBaseGenomes.Genome':
            genomeSet_refs = [input_ref]
        else:  # get genomeSet_refs from GenomeSet object
            genomeSet_refs = []
            try:
                genomeSet_object = ws.get_objects2(
                    {'objects': [{'ref': input_ref}]})['data'][0]['data']
            except Exception as e:
                raise ValueError('Unable to fetch ' + str(input_ref) +
                                 ' object from workspace: ' + str(e))
                #to get the full stack trace: traceback.format_exc()

            # iterate through genomeSet members
            for genome_id in genomeSet_object['elements'].keys():
                if 'ref' not in genomeSet_object['elements'][genome_id] or \
                        genomeSet_object['elements'][genome_id]['ref'] is None or \
                        genomeSet_object['elements'][genome_id]['ref'] == '':
                    raise ValueError('genome_ref not found for genome_id: ' + str(genome_id) +
                                     ' in genomeSet: ' + str(input_ref))
                else:
                    genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])
        # genome obj data
        for i, this_input_ref in enumerate(genomeSet_refs):
            try:
                objects = ws.get_objects2({'objects': [{'ref': this_input_ref}]})['data']
                genome_obj = objects[0]['data']
                genome_obj_info = objects[0]['info']
                genome_obj_names.append(genome_obj_info[NAME_I])
                genome_sci_names.append(genome_obj['scientific_name'])
            except:
                raise ValueError("unable to fetch genome: " + this_input_ref)

            # get genome_assembly_ref
            if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                    and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                    genome_sci_names[i] + \
                    " MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting."
                raise ValueError(msg)
            elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] is not None:
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                    genome_sci_names[i] + " USING assembly_ref: " + \
                    str(genome_obj['assembly_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['assembly_ref'])
            elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] is not None:
                msg = "Genome " + genome_obj_names[i] + " (ref:" + input_ref + ") " + \
                    genome_sci_names[i] + " USING contigset_ref: " + \
                    str(genome_obj['contigset_ref'])
                print(msg)
                genome_assembly_refs.append(genome_obj['contigset_ref'])

        # create file data (name for file is what's reported in results)
        for ass_i, assembly_ref in enumerate(genome_assembly_refs):
            this_name = genome_obj_names[ass_i]
            filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError(
                    'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Unknown type slipped through
    #
    else:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {
        'input_dir': input_dir,
        'folder_suffix': suffix,
        'all_seq_fasta': all_seq_fasta
    }
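# Illustrative only: a minimal sketch of calling stage_input() from a CheckM run
# method. 'checkm_util' stands in for an instance of the class that owns
# stage_input(); the reference '124/15/1' and extension 'fna' simply mirror the
# docstring example above and are not real data.
def _example_stage_checkm_input(checkm_util):
    staged = checkm_util.stage_input('124/15/1', 'fna')
    # staged is a dict of the form:
    #   {'input_dir': '<scratch>/bins_<suffix>',
    #    'folder_suffix': '<suffix>',
    #    'all_seq_fasta': '<scratch>/all_sequences_<suffix>.fna'}
    return staged['input_dir']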