def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
    """
    Note: adapted from kbaseapps/kb_hisat2 - file_util.py

    From the given object ref, return a list of all reads objects that are a
    part of that object. E.g., if ref is a ReadsSet, return a list of all
    PairedEndLibrary or SingleEndLibrary refs that are a member of that
    ReadsSet. This is returned as a list of dictionaries as follows:
    {
        "ref": reads object reference,
        "condition": condition string associated with that reads object,
        "info": workspace object_info tuple for the reads object,
        "alignment_output_name": de-duplicated name for the alignment output
    }
    The only one required is "ref", all other keys may or may not be present,
    based on the reads object or object type in initial ref variable. E.g. a
    RNASeqSampleSet might have condition info for each reads object, but a
    single PairedEndLibrary may not have that info.

    NOTE(review): unlike the upstream kb_hisat2 version this was adapted
    from, a plain reads library ref is NOT passed through — any type other
    than a ReadsSet/RNASeqSampleSet raises ValueError (see below).

    :param ref: workspace reference of the set object
    :param info: object_info tuple for ref (as returned by the workspace)
    :param validated_params: dict; may carry 'output_alignment_suffix'
    :returns: list of dicts as described above
    :raises ValueError: if ref is not a ReadsSet or RNASeqSampleSet
    """
    obj_type = self.get_type_from_obj_info(info)
    # Guard clause: only set-like containers are supported here.
    if ("KBaseSets.ReadsSet" not in obj_type
            and "KBaseRNASeq.RNASeqSampleSet" not in obj_type):
        raise ValueError("Unable to fetch reads reference from object {} "
                         "which is a {}".format(ref, obj_type))

    print("Looking up reads references in ReadsSet object")
    set_api = SetAPI(self.srv_wiz_url)
    reads_set = set_api.get_reads_set_v1({'ref': ref,
                                          'include_item_info': 0,
                                          'include_set_item_ref_paths': 1
                                          })
    refs = [{'ref': reads['ref_path'],
             'condition': reads['label']
             } for reads in reads_set["data"]["items"]]
    refs_for_ws_info = [{'ref': r['ref']} for r in refs]

    # get object info so we can name things properly
    infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

    # A user-supplied suffix (with spaces stripped) overrides the default
    # '_alignment' extension, but an all-whitespace suffix is ignored.
    name_ext = '_alignment'
    suffix = validated_params.get('output_alignment_suffix')
    if suffix is not None:
        ext = suffix.replace(' ', '')
        if ext:
            name_ext = ext

    # De-duplicate output names: first occurrence keeps the bare name,
    # later duplicates get a _2, _3, ... counter before the suffix.
    unique_name_lookup = {}
    for reads_ref, obj_info in zip(refs, infos):
        reads_ref['info'] = obj_info
        name = obj_info[1]
        if name not in unique_name_lookup:
            unique_name_lookup[name] = 1
        else:
            unique_name_lookup[name] += 1
            name = name + '_' + str(unique_name_lookup[name])
        reads_ref['alignment_output_name'] = name + name_ext
    return refs
def exec_remove_adapters(self, ctx, params):
    """
    :param params: instance of type "RemoveAdaptersParams" -> structure:
       parameter "output_workspace" of String, parameter
       "output_object_name" of String, parameter "input_reads" of type
       "ws_ref" (@ref ws), parameter "five_prime" of type
       "FivePrimeOptions" (unfortunately, we have to name the fields
       uniquely between 3' and 5' options due to the current
       implementation of grouped parameters) -> structure: parameter
       "adapter_sequence_5P" of String, parameter "anchored_5P" of type
       "boolean" (@range (0, 1)), parameter "three_prime" of type
       "ThreePrimeOptions" -> structure: parameter "adapter_sequence_3P"
       of String, parameter "anchored_3P" of type "boolean" (@range (0,
       1)), parameter "error_tolerance" of Double, parameter
       "min_overlap_length" of Long, parameter "min_read_length" of
       Long, parameter "discard_untrimmed" of type "boolean" (@range (0,
       1))
    :returns: instance of type "exec_RemoveAdaptersResult" -> structure:
       parameter "report" of String, parameter "output_reads_ref" of
       String
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN exec_remove_adapters
    console = []
    self.log(console, 'Running exec_remove_adapters() with parameters: ')
    self.log(console, "\n" + pformat(params))
    self.log(console, "-----------------------------------------------\n")
    report = ''
    returnVal = dict()
    returnVal['output_reads_ref'] = None
    token = ctx['token']
    wsClient = workspaceService(self.config['workspace-url'], token=token)
    ws = Workspace(self.config['workspace-url'], token=token)
    #setAPI_Client = SetAPI (url=self.config['SDK_CALLBACK_URL'], token=token)  # for SDK local, doesn't work for SetAPI
    setAPI_Client = SetAPI(url=self.config['service-wizard-url'],
                           token=token)  # for dynamic service
    headers = {'Authorization': 'OAuth ' + token}
    env = os.environ.copy()
    env['KB_AUTH_TOKEN'] = token

    # 0. param checks
    required_params = [
        'output_workspace', 'input_reads', 'output_object_name'
    ]
    for arg in required_params:
        # identity check ('is None') instead of '== None' per PEP 8
        if arg not in params or params[arg] is None or params[arg] == '':
            raise ValueError("Must define required param: '" + arg + "'")

    # 1. load provenance
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

    # 2. Determine whether read library, ReadsSet or RNASeqSampleSet is input object
    #
    try:
        # object_info tuple
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)
        input_reads_obj_info = wsClient.get_object_info_new(
            {'objects': [{'ref': params['input_reads']}]})[0]
        input_reads_obj_type = input_reads_obj_info[TYPE_I]
        # remove trailing version (raw string so '\.' is a literal dot)
        input_reads_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                      input_reads_obj_type)
        #input_reads_obj_version = input_reads_obj_info[VERSION_I]  # this is object version, not type version
    except Exception as e:
        raise ValueError(
            'Unable to get read library object from workspace: (' +
            str(params['input_reads']) + ')' + str(e))

    acceptable_types = [
        "KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet",
        "KBaseFile.PairedEndLibrary", "KBaseFile.SingleEndLibrary",
        "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary"
    ]
    if input_reads_obj_type not in acceptable_types:
        raise ValueError("Input reads of type: '" + input_reads_obj_type +
                         "'. Must be one of " + ", ".join(acceptable_types))

    # 3. Retrieve the set details
    #
    readsSet_ref_list = []
    readsSet_names_list = []
    readsSet_types_list = []
    if "KBaseSets.ReadsSet" in input_reads_obj_type:
        try:
            input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                'ref': params['input_reads'],
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError(
                'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                + str(params['input_reads']) + ")\n" + str(e))
        for readsLibrary_obj in input_readsSet_obj['data']['items']:
            readsSet_ref_list.append(readsLibrary_obj['ref'])
            NAME_I = 1
            TYPE_I = 2
            readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
            this_type = readsLibrary_obj['info'][TYPE_I]
            this_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                               this_type)  # remove trailing version
            readsSet_types_list.append(this_type)
    elif "KBaseRNASeq.RNASeqSampleSet" in input_reads_obj_type:
        sample_set = ws.get_objects2(
            {"objects": [{"ref": params['input_reads']}]})["data"][0]["data"]
        sample_refs = list()
        for i in range(len(sample_set["sample_ids"])):
            readsSet_ref_list.append(sample_set["sample_ids"][i])
            sample_refs.append({"ref": sample_set["sample_ids"][i]})

        info = ws.get_object_info3({"objects": sample_refs})
        for j in range(len(info["infos"])):
            NAME_I = 1
            TYPE_I = 2
            readsSet_names_list.append(info["infos"][j][NAME_I])
            sample_type = info["infos"][j][TYPE_I]
            sample_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                 sample_type)  # remove trailing version
            readsSet_types_list.append(sample_type)
    else:
        # single library: treat it as a one-element "set"
        readsSet_ref_list = [params['input_reads']]
        readsSet_names_list = [params['output_object_name']]
        readsSet_types_list = [input_reads_obj_type]

    # 4. Iterate through readsLibrary members of set
    #
    report = ''
    cutadapt_readsSet_ref = None
    cutadapt_readsLib_refs = []

    for reads_item_i, input_reads_library_ref in enumerate(
            readsSet_ref_list):
        exec_remove_adapters_OneLibrary_params = {
            'output_workspace': params['output_workspace'],
            'input_reads': input_reads_library_ref,
            'reads_type': readsSet_types_list[reads_item_i]
        }
        if (input_reads_obj_type != "KBaseSets.ReadsSet"
                and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
            exec_remove_adapters_OneLibrary_params[
                'output_object_name'] = params['output_object_name']
        else:
            exec_remove_adapters_OneLibrary_params[
                'output_object_name'] = readsSet_names_list[
                    reads_item_i] + "_cutadapt"
        # BUG FIX: this list previously contained 'float error_tolerance'
        # (a stray KIDL type prefix), so the user's error_tolerance value
        # was never forwarded to exec_remove_adapters_OneLibrary().
        optional_params = [
            'error_tolerance', 'min_overlap_length', 'min_read_length',
            'discard_untrimmed'
        ]
        optional_g_params = {
            'five_prime': ['adapter_sequence_5P', 'anchored_5P'],
            'three_prime': ['adapter_sequence_3P', 'anchored_3P']
        }
        for arg in optional_params:
            if arg in params and params[arg] is not None:
                exec_remove_adapters_OneLibrary_params[arg] = params[arg]
        for group in optional_g_params.keys():
            if group in params and params[group] is not None:
                exec_remove_adapters_OneLibrary_params[group] = dict()
                for arg in optional_g_params[group]:
                    if arg in params[group] and params[group][arg] is not None:
                        exec_remove_adapters_OneLibrary_params[group][
                            arg] = params[group][arg]

        msg = "\n\nRUNNING exec_remove_adapters_OneLibrary() ON LIBRARY: " + str(
            input_reads_library_ref) + " " + str(
                readsSet_names_list[reads_item_i]) + "\n"
        msg += "----------------------------------------------------------------------------\n"
        report += msg
        self.log(console, msg)

        # RUN
        exec_remove_adapters_OneLibrary_retVal = self.exec_remove_adapters_OneLibrary(
            ctx, exec_remove_adapters_OneLibrary_params)[0]
        report += exec_remove_adapters_OneLibrary_retVal['report'] + "\n\n"
        cutadapt_readsLib_refs.append(
            exec_remove_adapters_OneLibrary_retVal['output_reads_ref'])

    # 5. Conclude
    # Just one Library
    if (input_reads_obj_type != "KBaseSets.ReadsSet"
            and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet"):
        # create return output object
        result = {
            'report': report,
            'output_reads_ref': cutadapt_readsLib_refs[0],
        }
    # ReadsSet or SampleSet
    else:
        # save cutadapt readsSet
        some_cutadapt_output_created = False
        items = []
        for i, lib_ref in enumerate(cutadapt_readsLib_refs):
            if lib_ref is None:
                #items.append(None)  # can't have 'None' items in ReadsSet
                continue
            else:
                some_cutadapt_output_created = True
                # prefer the original set item's label; fall back to the
                # object name if the label lookup fails (e.g. SampleSet input)
                try:
                    label = input_readsSet_obj['data']['items'][i]['label']
                except Exception:
                    NAME_I = 1
                    label = ws.get_object_info3(
                        {'objects': [{'ref': lib_ref}]})['infos'][0][NAME_I]
                label = label + "_cutadapt"

                items.append({
                    'ref': lib_ref,
                    'label': label
                    #'data_attachment': ,
                    #'info':
                })
        if some_cutadapt_output_created:
            reads_desc_ext = " + Cutadapt"
            #reads_name_ext = "_cutadapt"
            descText = ""
            reads_name_ext = ""
            try:
                descText = input_readsSet_obj['data']['description']
            except Exception:
                NAME_I = 1
                descText = ws.get_object_info3(
                    {'objects': [{'ref': params['input_reads']}]})['infos'][0][NAME_I]
            descText = descText + reads_desc_ext

            output_readsSet_obj = {'description': descText, 'items': items}
            output_readsSet_name = str(
                params['output_object_name']) + reads_name_ext
            cutadapt_readsSet_ref = setAPI_Client.save_reads_set_v1({
                'workspace_name': params['output_workspace'],
                'output_object_name': output_readsSet_name,
                'data': output_readsSet_obj
            })['set_ref']
        else:
            raise ValueError("No cutadapt output created")

        # create return output object
        result = {
            'report': report,
            'output_reads_ref': cutadapt_readsSet_ref
        }
    #END exec_remove_adapters

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method exec_remove_adapters return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def download_short_unpaired(self, console, token, wsname,
                            short_unpaired_libraries):
    """
    Download single-end (unpaired) read libraries and concatenate them into
    one local FASTQ file in scratch.

    Any ReadsSet in short_unpaired_libraries is expanded into its member
    libraries first. Each downloaded file is appended (gzip-decompressed if
    needed) to the combined file and then deleted to free scratch space.

    :param console: log accumulator passed to self.log
    :param token: auth token for workspace/ReadsUtils clients
    :param wsname: workspace name used to resolve bare object names
    :param short_unpaired_libraries: list of refs or object names
    :returns: path of the combined FASTQ file
    :raises ValueError: on any download/lookup failure (wrapped)
    """
    # local imports: only this method needs in-process (de)compression
    import gzip
    import shutil
    try:
        self.log(console, "Getting short unpaired reads.\n")
        ruClient = ReadsUtils(url=self.callbackURL, token=token)

        # first, unpack any ReadsSets into the actual SingleEndLibrary referencs
        reads_refs = []
        # object info
        try:
            wsClient = Workspace(self.workspaceURL, token=token)
        except Exception as e:
            raise ValueError("unable to instantiate wsClient. " + str(e))

        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        for lib in short_unpaired_libraries:
            try:
                # accept either a full ref ('ws/obj') or a bare object name
                obj_id = {
                    'ref': lib if '/' in lib else (wsname + '/' + lib)
                }
                lib_obj_info = wsClient.get_object_info_new(
                    {'objects': [obj_id]})[0]
                lib_obj_type = lib_obj_info[TYPE_I]
                # remove trailing version
                lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)
                lib_ref = str(lib_obj_info[WSID_I]) + '/' + \
                    str(lib_obj_info[OBJID_I]) + '/' + str(lib_obj_info[VERSION_I])
                if lib_obj_type == 'KBaseSets.ReadsSet':
                    # unpack it
                    try:
                        setAPIClient = SetAPI(url=self.serviceWizardURL,
                                              token=token)
                        self.log(console, 'getting reads set ' + lib_ref)
                        readsSet = setAPIClient.get_reads_set_v1({
                            'ref': lib_ref,
                            'include_item_info': 1
                        })
                    except Exception as e:
                        raise ValueError(
                            'SetAPI FAILURE: Unable to get read library set object: ('
                            + lib_ref + ')\n' + str(e))
                    for readsLibrary in readsSet['data']['items']:
                        reads_refs.append(readsLibrary['ref'])
                else:
                    # use other reads objects "as is"
                    reads_refs.append(lib_ref)
            except Exception as e:
                raise ValueError('Unable to get read library object: (' +
                                 str(lib) + ')' + str(e))

        result = ruClient.download_reads({
            'read_libraries': reads_refs,
            'interleaved': 'false'
        })
        # combine outputs
        short_unpaired_path = os.path.join(
            self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq")

        self.log(console, "Combining short unpaired reads.\n")

        # SECURITY/IDIOM FIX: the original shelled out with shell=True
        # ('gzip -dc <path> >> <out>' / 'cat <path> >> <out>'), building the
        # command by string concatenation. Do the same work in-process.
        with open(short_unpaired_path, 'ab') as out_handle:
            for reads_ref in reads_refs:
                files = result['files'][reads_ref]['files']
                if 'fwd' not in files:
                    raise ValueError('File ' + reads_ref +
                                     ' missing forward reads file')
                path = files['fwd']
                opener = gzip.open if path.endswith('.gz') else open
                with opener(path, 'rb') as in_handle:
                    shutil.copyfileobj(in_handle, out_handle)
                os.remove(path)  # free scratch space as we go
    except Exception as e:
        raise ValueError('Unable to download short unpaired reads\n' +
                         str(e))
    return short_unpaired_path
def exec_megahit(self, ctx, params):
    """
    :param params: instance of type "ExecMegaHitParams" (exec_megahit()
       Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as
       Input Creates Assembly object(s) as output. Will eventually also
       create AssemblySet object if input is a ReadsSet and not running
       a combined assembly Other vars same as run_megahit()) ->
       structure: parameter "workspace_name" of String, parameter
       "input_reads_ref" of String, parameter "output_contigset_name" of
       String, parameter "combined_assembly_flag" of Long, parameter
       "megahit_parameter_preset" of String, parameter "min_count" of
       Long, parameter "k_min" of Long, parameter "k_max" of Long,
       parameter "k_step" of Long, parameter "k_list" of list of Long,
       parameter "min_contig_len" of Long
    :returns: instance of type "ExecMegaHitOutput" -> structure:
       parameter "report_text" of String, parameter
       "output_contigset_ref" of list of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN exec_megahit
    console = []
    self.log(console, 'Running exec_megahit() with params=')
    self.log(console, "\n" + pformat(params))

    #SERVICE_VER = 'dev'  # DEBUG
    SERVICE_VER = 'release'

    ### STEP 0: init
    token = ctx['token']
    wsClient = workspaceService(self.workspaceURL, token=token)
    headers = {'Authorization': 'OAuth ' + token}
    env = os.environ.copy()
    env['KB_AUTH_TOKEN'] = token

    ### STEP 1: basic parameter checks + parsing
    required_params = [
        'workspace_name', 'input_reads_ref', 'output_contigset_name'
    ]
    for required_param in required_params:
        # identity check ('is None') instead of '== None' per PEP 8
        if required_param not in params or params[required_param] is None:
            raise ValueError("Must define required param: '" +
                             required_param + "'")

    ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
    input_reads_ref = params['input_reads_ref']
    input_reads_name = None
    try:
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        input_reads_obj_info = wsClient.get_object_info_new(
            {'objects': [{'ref': input_reads_ref}]})[0]
        input_reads_obj_type = re.sub(
            r'-[0-9]+\.[0-9]+$', "",
            input_reads_obj_info[TYPE_I])  # remove trailing version
        input_reads_name = input_reads_obj_info[NAME_I]
    except Exception as e:
        raise ValueError('Unable to get reads object from workspace: (' +
                         input_reads_ref + ')' + str(e))

    accepted_input_types = ["KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"]
    if input_reads_obj_type not in accepted_input_types:
        raise ValueError("Input reads of type '" + input_reads_obj_type +
                         "' not accepted. Must be one of " +
                         ", ".join(accepted_input_types))

    if input_reads_obj_type == "KBaseSets.ReadsSet":
        required_param = 'combined_assembly_flag'
        if required_param not in params or params[required_param] is None:
            raise ValueError("Must define required param: '" +
                             required_param + "'")

    ### STEP 3: get the list of library references
    if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
        readsSet_ref_list = [input_reads_ref]
        readsSet_names_list = [input_reads_name]
    elif input_reads_obj_type == "KBaseSets.ReadsSet":
        readsSet_ref_list = []
        readsSet_names_list = []
        try:
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=ctx['token'])  # for dynamic service
            #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
        except Exception as e:
            raise ValueError(
                "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '"
                + self.serviceWizardURL + "' token: '" + ctx['token'] + "'" +
                str(e))
            #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"' token: '"+ctx['token']+"'" + str(e))
        try:
            input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                'ref': input_reads_ref,
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError(
                'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                + str(input_reads_ref) + ")\n" + str(e))

        for readsLibrary_obj in input_readsSet_obj['data']['items']:
            readsSet_ref_list.append(readsLibrary_obj['ref'])
            NAME_I = 1
            readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
    else:
        raise ValueError("Input reads of type '" + input_reads_obj_type +
                         "' not accepted. Must be one of " +
                         ", ".join(accepted_input_types))

    ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine
    if input_reads_obj_type == "KBaseSets.ReadsSet" and params[
            'combined_assembly_flag'] != 0:
        self.log(
            console,
            "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES"
        )

        # make dir
        timestamp = int(
            (datetime.utcnow() -
             datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
        if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
            input_dir = os.path.join(self.host_scratch,
                                     'input.' + str(timestamp))
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # connect to ReadsUtils Client
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
        except Exception as e:
            # BUG FIX: was a bare 'except:' that then referenced an
            # undefined name 'e' (NameError on the failure path)
            raise ValueError("Unable to get readsUtils_Client\n" + str(e))

        # start combined file
        read_buf_size = 65536
        write_buf_size = 65536
        combined_input_fwd_path = os.path.join(input_dir,
                                               'input_reads_fwd.fastq')
        combined_input_rev_path = os.path.join(input_dir,
                                               'input_reads_rev.fastq')
        combined_input_fwd_handle = open(combined_input_fwd_path, 'w',
                                         write_buf_size)
        combined_input_rev_handle = open(combined_input_rev_path, 'w',
                                         write_buf_size)

        # add libraries, one at a time
        for this_input_reads_ref in readsSet_ref_list:
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                + str(this_input_reads_ref))
            try:
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [this_input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    this_input_reads_ref + ")\n" + str(e))
            this_input_fwd_path = readsLibrary['files'][
                this_input_reads_ref]['files']['fwd']
            this_input_rev_path = readsLibrary['files'][
                this_input_reads_ref]['files']['rev']

            # append fwd
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: "
                + str(this_input_reads_ref))
            this_input_path = this_input_fwd_path
            cat_file_handle = combined_input_fwd_handle
            with open(this_input_path, 'r',
                      read_buf_size) as this_input_handle:
                while True:
                    read_data = this_input_handle.read(read_buf_size)
                    if read_data:
                        cat_file_handle.write(read_data)
                    else:
                        break
            os.remove(
                this_input_path
            )  # create space since we no longer need the piece file

            # append rev
            this_input_path = this_input_rev_path
            cat_file_handle = combined_input_rev_handle
            with open(this_input_path, 'r',
                      read_buf_size) as this_input_handle:
                while True:
                    read_data = this_input_handle.read(read_buf_size)
                    if read_data:
                        cat_file_handle.write(read_data)
                    else:
                        break
            os.remove(
                this_input_path
            )  # create space since we no longer need the piece file

        combined_input_fwd_handle.close()
        combined_input_rev_handle.close()

    ### STEP 5: finally run MegaHit_Sets
    # shallow copy so the extra fwd/rev path keys added below do not
    # mutate the caller's params dict (the original aliased it)
    exec_megahit_single_library_params = dict(params)
    output_assemblyset_contigset_paths = []
    output_contigset_path = None

    # PairedEndLibrary
    if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
        self.log(
            console,
            "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: "
            + str(input_reads_ref))
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [input_reads_ref],
                'interleaved': 'false'
            })
        except Exception as e:
            raise ValueError(
                'Unable to get reads object from workspace: (' +
                input_reads_ref + ")\n" + str(e))
        input_fwd_path = readsLibrary['files'][input_reads_ref]['files']['fwd']
        input_rev_path = readsLibrary['files'][input_reads_ref]['files']['rev']
        exec_megahit_single_library_params['input_fwd_path'] = input_fwd_path
        exec_megahit_single_library_params['input_rev_path'] = input_rev_path

        # the key line
        output_contigset_path = self.exec_megahit_single_library(
            exec_megahit_single_library_params)
        output_assemblyset_contigset_paths.append(output_contigset_path)

        os.remove(input_fwd_path)  # files can be really big
        os.remove(input_rev_path)

    # ReadsSet combined (already downloaded and combined fastqs)
    elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
            'combined_assembly_flag'] != 0:
        input_fwd_path = combined_input_fwd_path
        input_rev_path = combined_input_rev_path
        exec_megahit_single_library_params['input_fwd_path'] = input_fwd_path
        exec_megahit_single_library_params['input_rev_path'] = input_rev_path

        # the key line
        output_contigset_path = self.exec_megahit_single_library(
            exec_megahit_single_library_params)
        output_assemblyset_contigset_paths.append(output_contigset_path)

        os.remove(input_fwd_path)  # files can be really big
        os.remove(input_rev_path)

    # ReadsSet uncombined (still have to download)
    elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
            'combined_assembly_flag'] == 0:
        # connect to ReadsUtils Client
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
        except Exception as e:
            # BUG FIX: same bare-'except:'-with-undefined-'e' as in STEP 4
            raise ValueError("Unable to get readsUtils_Client\n" + str(e))

        # get libraries, one at a time, and run MegaHit_Sets
        output_assemblyset_contigset_paths = []
        for this_input_reads_ref in readsSet_ref_list:
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                + str(this_input_reads_ref))
            try:
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [this_input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    this_input_reads_ref + ")\n" + str(e))
            this_input_fwd_path = readsLibrary['files'][
                this_input_reads_ref]['files']['fwd']
            this_input_rev_path = readsLibrary['files'][
                this_input_reads_ref]['files']['rev']
            exec_megahit_single_library_params[
                'input_fwd_path'] = this_input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = this_input_rev_path

            # the key line
            this_output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(
                this_output_contigset_path)

            os.remove(this_input_fwd_path)  # files can be really big
            os.remove(this_input_rev_path)

    # just in case we've confused ourselves
    else:
        raise ValueError("error in logic")

    ### STEP 6: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
    output_contigset_refs = []
    output_contigset_names = []
    for i, this_output_contigset_path in enumerate(
            output_assemblyset_contigset_paths):
        if len(output_assemblyset_contigset_paths) == 1:
            assembly_name = params['output_contigset_name']
        else:
            assembly_name = readsSet_names_list[i] + '-' + params[
                'output_contigset_name']
        this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': this_output_contigset_path
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_name
        })
        output_contigset_refs.append(this_output_data_ref)
        output_contigset_names.append(assembly_name)

    ### STEP 7: generate the report text

    # compute a simple contig length distribution for the report
    report = ''
    for i, this_output_contigset_path in enumerate(
            output_assemblyset_contigset_paths):
        report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[
            i] + "\n"
        report += "-------------------------------------------------------------\n"
        report += "\n"
        lengths = []
        for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
            lengths.append(len(seq_record.seq))

        report += 'ContigSet saved to: ' + params[
            'workspace_name'] + '/' + output_contigset_names[i] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        # ROBUSTNESS FIX: guard against an empty assembly — the original
        # divided by len(lengths) unconditionally (ZeroDivisionError)
        if lengths:
            report += 'Avg Length: ' + str(
                sum(lengths) / float(len(lengths))) + ' bp.\n'

            bins = 10
            counts, edges = np.histogram(lengths, bins)
            report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
            for c in range(bins):
                report += ' ' + str(counts[c]) + '\t--\t' + str(
                    edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

    ### STEP 8: contruct the output to send back
    output = {
        'report_text': report,
        'output_contigset_refs': output_contigset_refs
    }
    #END exec_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method exec_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class DataStagingUtils(object): def __init__(self, config, ctx): self.ctx = ctx self.scratch = os.path.abspath(config['scratch']) self.ws_url = config['workspace-url'] self.serviceWizardURL = config['srv-wiz-url'] self.callbackURL = config['SDK_CALLBACK_URL'] if not os.path.exists(self.scratch): os.makedirs(self.scratch) self.SE_flag = 'SE' self.PE_flag = 'PE' SERVICE_VER = 'release' # readsUtils_Client try: self.readsUtils_Client = ReadsUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER) except Exception as e: raise ValueError( 'Unable to instantiate readsUtils_Client with callbackURL: ' + self.callbackURL + ' ERROR: ' + str(e)) # setAPI_Client try: #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token']) # for SDK local. local doesn't work for SetAPI self.setAPI_Client = SetAPI( url=self.serviceWizardURL, token=self.ctx['token']) # for dynamic service except Exception as e: raise ValueError( 'Unable to instantiate setAPI_Client with serviceWizardURL: ' + self.serviceWizardURL + ' ERROR: ' + str(e)) def expand_input(self, input_refs): ''' Expand input based on an input data reference for Kaiju input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet ''' # config #SERVICE_VER = 'dev' SERVICE_VER = 'release' # expand any sets and build a non-redundant list of reads input objs ws = Workspace(self.ws_url) expanded_input = [] input_ref_seen = dict() SE_types = [ 'KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary' ] PE_types = [ 'KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary' ] [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple for input_ref in input_refs: input_info = ws.get_object_info3({'objects': [{ 'ref': input_ref }]})['infos'][0] obj_name = input_info[NAME_I] type_name = input_info[TYPE_I].split('-')[0] # ReadsSet if type_name in ['KBaseSets.ReadsSet']: try: 
input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({ 'ref': input_ref, 'include_item_info': 1 }) except Exception as e: raise ValueError( 'SetAPI FAILURE: Unable to get read library set object from workspace: (' + str(input_ref) + ")\n" + str(e)) for readsLibrary_obj in input_readsSet_obj['data']['items']: this_reads_ref = readsLibrary_obj['ref'] if this_reads_ref in input_ref_seen: continue input_ref_seen[this_reads_ref] = True this_reads_name = readsLibrary_obj['info'][NAME_I] reads_item_type = readsLibrary_obj['info'][TYPE_I] reads_item_type = re.sub( '-[0-9]+\.[0-9]+$', "", reads_item_type) # remove trailing version if reads_item_type in PE_types: this_reads_type = self.PE_flag elif reads_item_type in SE_types: this_reads_type = self.SE_flag else: raise ValueError("Can't handle read item type '" + reads_item_type + "' obj_name: '" + this_reads_name + " in Set: '" + str(input_ref) + "'") expanded_input.append({ 'ref': this_reads_ref, 'name': this_reads_name, 'type': this_reads_type }) # SingleEnd Library elif type_name in SE_types: this_reads_ref = input_ref if this_reads_ref in input_ref_seen: continue input_ref_seen[this_reads_ref] = True this_reads_name = obj_name this_reads_type = self.SE_flag expanded_input.append({ 'ref': this_reads_ref, 'name': this_reads_name, 'type': this_reads_type }) # PairedEnd Library elif type_name in PE_types: this_reads_ref = input_ref if this_reads_ref in input_ref_seen: continue input_ref_seen[this_reads_ref] = True this_reads_name = obj_name this_reads_type = self.PE_flag expanded_input.append({ 'ref': this_reads_ref, 'name': this_reads_name, 'type': this_reads_type }) else: raise ValueError("Illegal type in input_refs: " + str(obj_name) + " (" + str(input_ref) + ") is of type: '" + str(type_name) + "'") return expanded_input def stage_input(self, input_item=None, subsample_percent=10, subsample_replicates=1, subsample_seed=1, fasta_file_extension='fastq'): ''' Stage input based on an input data reference for Kaiju 
input_refs can be a list of references to a PairedEndLibrary, a
        SingleEndLibrary, or a ReadsSet.

        This method creates a directory in the scratch area with the set of
        Fasta/Fastq files; names will have the fasta_file_extension parameter
        tacked on.

        ex: staged_input = stage_input({'ref':<ref>,'name':<name>,'type':<type>}, subsample_percent, subsample_replicates, subsample_seed, 'fastq')
        staged_input
        {"input_dir": '...'}
        '''
        # init
        staged_input = dict()
        replicate_input = []

        # config
        # NOTE(review): SERVICE_VER is assigned but not used anywhere in this
        # method's visible body — confirm whether it is dead config.
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        # (millisecond timestamp suffix keeps concurrent runs from colliding)
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'input_reads_' + suffix)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        #
        # Download reads
        #

        # Paired End Lib
        if input_item['type'] == self.PE_flag:
            try:
                # download fwd/rev as two separate files (not interleaved)
                readsLibrary = self.readsUtils_Client.download_reads({
                    'read_libraries': [input_item['ref']],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            input_rev_file_path = readsLibrary['files'][
                input_item['ref']]['files']['rev']
            fwd_filename = os.path.join(
                input_dir,
                input_item['name'] + '.fwd.' + fasta_file_extension)
            rev_filename = os.path.join(
                input_dir, input_item['name'] + '.rev.'
                + fasta_file_extension)
            # move downloads into our scratch folder under predictable names
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            if input_rev_file_path != rev_filename:
                shutil.move(input_rev_file_path, rev_filename)
            input_item['fwd_file'] = fwd_filename
            input_item['rev_file'] = rev_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            if not os.path.isfile(rev_filename):
                raise ValueError('Error generating reads file ' + rev_filename)
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))
            if not self._fasta_seq_len_at_least(rev_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(rev_filename))

        # Single End Lib
        elif input_item['type'] == self.SE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads(
                    {'read_libraries': [input_item['ref']]})
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.'
                + fasta_file_extension)
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            input_item['fwd_file'] = fwd_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))

        else:
            raise ValueError("No type set for input library " +
                             str(input_item['name']) + " (" +
                             str(input_item['ref']) + ")")

        #
        # Subsample
        #
        if subsample_percent == 100:
            # no subsampling requested: the downloaded library IS the single
            # replicate, so the file must NOT be deleted below
            replicate_input = [input_item]
        else:
            replicate_input = self._randomly_subsample_reads(
                input_item,
                subsample_percent=subsample_percent,
                subsample_replicates=subsample_replicates,
                subsample_seed=subsample_seed)
            # free up disk (replicates were written as separate files)
            os.remove(input_item['fwd_file'])
            if input_item['type'] == self.PE_flag:
                os.remove(input_item['rev_file'])

        # return input file info
        #staged_input['input_dir'] = input_dir
        #staged_input['folder_suffix'] = suffix
        staged_input['replicate_input'] = replicate_input

        return staged_input

    def _randomly_subsample_reads(self,
                                  input_item=None,
                                  subsample_percent=100,
                                  subsample_replicates=1,
                                  subsample_seed=1):
        """
        Split a staged reads library into subsample_replicates random,
        non-overlapping subsamples, each holding subsample_percent percent of
        the reads, written as FASTQ files next to the input.

        input_item must carry 'type' (self.PE_flag or self.SE_flag), 'name',
        'ref', 'fwd_file' and (for paired-end) 'rev_file', as produced by
        stage_input above.

        Returns a list of dicts, one per replicate, with keys 'fwd_file',
        ('rev_file' for paired-end), 'ref', 'type', 'name'.

        Raises ValueError on malformed FASTQ records, on a requested sample
        size larger than total_reads / subsample_replicates, or when an output
        replicate file comes out missing/empty.
        """
        replicate_files = []
        split_num = subsample_replicates

        # for now can only do percentage instead of raw cnt of reads per subsample
        # NOTE(review): with these constants the use_reads_num branches below
        # are unreachable and the trailing else is dead — confirm intent.
        use_reads_num = False
        use_reads_perc = True
        reads_num = 0  # not used. subsample_percent used instead

        # init randomizer (fixed seed => reproducible subsamples)
        random.seed(subsample_seed)

        # Paired End
        #
        if input_item['type'] == self.PE_flag:
            print("SUBSAMPLING PE library " + input_item['name'])  # DEBUG

            # file paths (strip a trailing .fastq/.FASTQ before adding suffixes)
            # NOTE(review): regex literals here are not raw strings — works,
            # but emits DeprecationWarning on modern Python; verify.
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            input_rev_path = re.sub("\.fastq$", "", input_item['rev_file'])
            input_rev_path = re.sub("\.FASTQ$", "", input_rev_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"
            output_rev_paired_file_path_base = input_rev_path + "_rev_paired"

            # set up for file io
            total_paired_reads = 0
            total_unpaired_fwd_reads = 0
            total_unpaired_rev_reads = 0
            total_paired_reads_by_set = []
            fwd_ids = dict()
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()      # read_id -> replicate index
            paired_buf_size = 100000   # output buffering, bytes
            recs_beep_n = 1000000      # progress print interval

            # read fwd file to get fwd ids
            # FASTQ records are 4 lines; rec_line_i cycles 0..3, line 0 is the
            # '@' header. The id is the header up to the first whitespace, with
            # any /1 /2 .f .r style pair suffix stripped so fwd/rev ids match.
            # NOTE(review): open(..., 'r', 0) (unbuffered text mode) is
            # Python 2 only — it raises ValueError on Python 3; confirm the
            # target interpreter.
            # rec_cnt = 0  # DEBUG
            print("GETTING IDS")  # DEBUG
            with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        fwd_ids[read_id] = True
                        # DEBUG
                        # if rec_cnt % 100 == 0:
                        #     print ("read_id: '"+str(read_id)+"'")
                        # rec_cnt += 1

            # read reverse to determine paired
            print("DETERMINING PAIRED IDS")  # DEBUG
            with open(input_item['rev_file'], 'r', 0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        # NOTE(review): fwd_ids[read_id] raises KeyError for a
                        # rev read with no fwd mate — looks like it should be
                        # `if read_id in fwd_ids:`; confirm inputs are always
                        # fully paired.
                        if fwd_ids[read_id]:
                            paired_ids[read_id] = True
                            paired_ids_list.append(read_id)
                        # DEBUG
                        # if rec_cnt % 100 == 0:
                        #     print ("read_id: '"+str(read_id)+"'")
                        # rec_cnt += 1
            total_paired_reads = len(paired_ids_list)
            print("TOTAL PAIRED READS CNT: " +
                  str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num. You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ". Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num. You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ". Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            # one draw of reads_per_lib*split_num ids, dealt round-robin so the
            # replicates are disjoint
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list,
                                  reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # split fwd paired
            print("WRITING FWD SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            # stream records: buffer the current 4-line record in rec_buf and
            # flush it to the owning replicate when the NEXT header is seen
            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False
            with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                total_paired_reads_by_set[lib_i] += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        # EAFP membership probe: selected reads are in
                        # paired_lib_i; `found` itself is unused
                        try:
                            found = paired_lib_i[read_id]
                            capture_type_paired = True
                        except:
                            total_unpaired_fwd_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()
            print("\t" + str(paired_cnt) + " FWD recs processed")

            # split rev paired (same streaming scheme as the fwd pass)
            print("WRITING REV SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_rev_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False
            with open(input_item['rev_file'], 'r', 0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        try:
                            found = paired_lib_i[read_id]
                            capture_type_paired = True
                        except:
                            total_unpaired_rev_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()
            print("\t" + str(paired_cnt) + " REV recs processed")

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL PAIRED READS: " + str(total_paired_reads) + "\n"
            report += "TOTAL UNPAIRED FWD READS (discarded): " + str(
                total_unpaired_fwd_reads) + "\n"
            report += "TOTAL UNPAIRED REV READS (discarded): " + str(
                total_unpaired_rev_reads) + "\n"
            report += "\n"
            for lib_i in range(split_num):
                report += "PAIRED READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            # for replicate_i,replicate_item in enumerate(replicate_files):
            #     replicate_input.append({'fwd_file': replicate_item['fwd_file'],
            #                             'type': input_item['type'],
            #                             'name': input_item['name']+"-"+str(replicate_i)
            #                            })
            #     if input_item['type'] == self.PE_flag:
            #         replicate_input[replicate_i]['rev_file'] = replicate_item['rev_file']

            print("MAKING REPLICATE OBJECT")  # DEBUG
            # NOTE(review): paired_obj_refs is never used — dead variable?
            paired_obj_refs = []
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                output_rev_paired_file_path = output_rev_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                        or os.path.getsize (output_fwd_paired_file_path) == 0 \
                        or not os.path.isfile (output_rev_paired_file_path) \
                        or os.path.getsize (output_rev_paired_file_path) == 0:
                    raise ValueError("failed to create paired output")
                else:
                    # zero-pad replicate numbers so names sort lexicographically
                    zero_pad = '0' * (len(str(split_num)) - len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file': output_fwd_paired_file_path,
                        'rev_file': output_rev_paired_file_path,
                        'ref': input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type': input_item['type'],
                        'name': input_item['name'] + '-' + zero_pad +
                                str(lib_i + 1)
                    })

        # SingleEndLibrary
        #
        elif input_item['type'] == self.SE_flag:
            print("SUBSAMPLING SE library " + input_item['name'])

            # file paths
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"

            # get "paired" ids (single-end: every read id, duplicates rejected)
            print("DETERMINING IDS")  # DEBUG
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 100000
            with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        if read_id in paired_ids:
                            raise ValueError("repeat read_id: " + read_id)
                        paired_ids[read_id] = True
                        paired_ids_list.append(read_id)
                        # DEBUG
                        # if rec_cnt % 100 == 0:
                        #     print ("read_id: '"+str(read_id)+"'")
                        # rec_cnt += 1
            total_paired_reads = len(paired_ids_list)
            print("TOTAL READS CNT: " + str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num. You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ". Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num. You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ". Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list,
                                  reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # set up for file io
            total_paired_reads = 0
            total_paired_reads_by_set = []
            paired_buf_size = 1000000

            # split reads (same buffered-record streaming as the PE branch,
            # but unselected reads are skipped via the except: pass)
            print("WRITING SPLIT SINGLE END READS")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            recs_beep_n = 1000000
            with open(input_item['fwd_file'], 'r', 0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        total_paired_reads += 1
                        if last_read_id != None:
                            try:
                                lib_i = paired_lib_i[last_read_id]
                                total_paired_reads_by_set[lib_i] += 1
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                            except:
                                pass
                            if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                print("\t" + str(paired_cnt) +
                                      " recs processed")
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if last_read_id != None:
                        try:
                            lib_i = paired_lib_i[last_read_id]
                            total_paired_reads_by_set[lib_i] += 1
                            paired_output_reads_file_handles[lib_i].writelines(
                                rec_buf)
                            paired_cnt += 1
                        except:
                            pass
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL READS: " + str(total_paired_reads) + "\n"
            for lib_i in range(split_num):
                report += "SINGLE END READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            print("MAKING REPLICATE OBJECTS")  # DEBUG
            paired_obj_refs = []
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                        or os.path.getsize (output_fwd_paired_file_path) == 0:
                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) - len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file': output_fwd_paired_file_path,
                        'ref': input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type': input_item['type'],
                        'name': input_item['name'] + '-' + zero_pad +
                                str(lib_i + 1)
                    })

        else:
            raise ValueError("unknown ReadLibrary type:" +
                             str(input_item['type']) + " for readslibrary: " +
                             input_item['name'])

        return replicate_files

    def _fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
        '''
        counts the number of non-header, non-whitespace characters in a FASTA
        file and reports whether at least min_fasta_len were found (stops
        reading as soon as the threshold is reached).

        NOTE(review): stage_input also calls this on FASTQ downloads; there
        the '@'/'+' header and quality lines are counted as sequence — fine
        for an emptiness check, but confirm that is the only intended use.
        NOTE(review): open(..., 'r', 0) is Python 2 unbuffered mode and raises
        ValueError on Python 3 — confirm the target interpreter.
        '''
        seq_len = 0
        with open(fasta_path, 'r', 0) as fasta_handle:
            for line in fasta_handle:
                line = line.strip()
                if line.startswith('>'):
                    continue
                line = line.replace(' ', '')
                seq_len += len(line)
                if seq_len >= min_fasta_len:
                    return True
        return False