def _stage_input_file(self, cutadapt_runner, ref, reads_type): ru = ReadsUtils(self.callbackURL) if reads_type == 'KBaseFile.PairedEndLibrary' or 'KBaseAssembly.PairedEndLibrary': input_file_info = ru.download_reads({ 'read_libraries': [ref], 'interleaved': 'true' })['files'][ref] elif reads_type == 'KBaseFile.SingleEndLibrary' or 'KBaseAssembly.SingleEndLibrary': input_file_info = ru.download_reads({'read_libraries': [ref]})['files'][ref] else: raise ValueError("Can't download_reads() for object type: '" + str(reads_type) + "'") input_file_info['input_ref'] = ref file_location = input_file_info['files']['fwd'] # DEBUG #with open (file_location, 'r', 0) as fasta_file: # for line in fasta_file.readlines(): # print ("LINE: '"+line+"'\n") interleaved = False if input_file_info['files']['type'] == 'interleaved': interleaved = True cutadapt_runner.set_interleaved(interleaved) cutadapt_runner.set_input_file(file_location) return input_file_info
def prepare_single_run(self, input_info, assembly_or_genome_ref, bowtie2_index_info, ws_for_cache):
    """Given a reads ref and an assembly, set up the bowtie2 index and
    download the reads; returns the assembled input configuration dict."""
    run_config = {'bowtie2_index_info': bowtie2_index_info}
    # build (or fetch a cached) bowtie2 index when none was supplied
    if not bowtie2_index_info:
        index_builder = Bowtie2IndexBuilder(self.scratch_dir,
                                            self.workspace_url,
                                            self.callback_url,
                                            self.srv_wiz_url,
                                            self.provenance)
        run_config['bowtie2_index_info'] = index_builder.get_index(
            {'ref': assembly_or_genome_ref,
             'ws_for_cache': ws_for_cache})
    # download the reads as separate (non-interleaved) files
    lib_ref = input_info['ref']
    download_params = {'read_libraries': [lib_ref],
                       'interleaved': 'false',
                       'gzipped': None}
    downloaded = ReadsUtils(self.callback_url).download_reads(download_params)['files']
    run_config['reads_lib_type'] = self.get_type_from_obj_info(input_info['info']).split('.')[1]
    run_config['reads_files'] = downloaded[lib_ref]
    run_config['reads_lib_ref'] = lib_ref
    return run_config
def get_ea_utils_result(self, refid, input_params):
    """Download the reads object `refid` and return the concatenated
    fastq report string(s) for its file(s).

    For a separate-pair library both fwd and rev files are reported;
    interleaved and single libraries report only the fwd file.
    """
    downloaded = ReadsUtils(self.callbackURL).download_reads(
        {'read_libraries': [refid]})
    files = downloaded['files'][refid]['files']
    otype = files['otype']
    parts = []
    if otype in ('interleaved', 'paired', 'single'):
        parts.append(self.get_report_string(files['fwd']))
    if otype == 'paired':
        parts.append(self.get_report_string(files['rev']))
    return ''.join(parts)
def get_reads_RU(self, ctx, refs, console):
    """Download the given read-library refs via ReadsUtils, interleaved.

    Converts an unsupported-type ServerError into a friendlier ValueError;
    any other client error is re-raised unchanged.
    """
    reads_client = ReadsUtils(self.callbackURL, token=ctx['token'], service_ver='dev')
    typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
               'KBaseFile.PairedEndLibrary ' +
               'KBaseAssembly.SingleEndLibrary ' +
               'KBaseAssembly.PairedEndLibrary')
    download_params = {'read_libraries': refs,
                       'interleaved': 'true',
                       'gzipped': None}
    try:
        reads = reads_client.download_reads(download_params)['files']
    except ServerError as se:
        self.log(console, 'logging stacktrace from dynamic client error')
        self.log(console, se.data)
        if typeerr not in se.message:
            raise
        prefix = se.message.split('.')[0]
        raise ValueError(prefix + '. Only the types ' +
                         'KBaseAssembly.PairedEndLibrary ' +
                         'and KBaseFile.PairedEndLibrary are supported')
    self.log(console, 'Got reads data from converter:\n' + pformat(reads))
    return reads
def fetch_reads_from_reference(ref, callback_url):
    """
    Fetch a FASTQ file (or 2 for paired-end) from a reads reference.
    Returns the following structure:
    {
        "style": "paired", "single", or "interleaved",
        "file_fwd": path_to_file,
        "file_rev": path_to_file,  # only if paired end
        "object_ref": reads reference for downstream convenience.
    }
    """
    try:
        print("Fetching reads from object {}".format(ref))
        reads_client = ReadsUtils(callback_url)
        reads_dl = reads_client.download_reads({
            "read_libraries": [ref],
            "interleaved": "false"
        })
        pprint(reads_dl)
        reads_files = reads_dl['files'][ref]['files']
        ret_reads = {
            "object_ref": ref,
            "style": reads_files["type"],
            "file_fwd": reads_files["fwd"]
        }
        # a rev file is present only for separate-pair paired-end libraries
        if reads_files.get("rev", None) is not None:
            ret_reads["file_rev"] = reads_files["rev"]
        return ret_reads
    except Exception:
        # BUG FIX: was a bare `except:`, which also intercepted SystemExit and
        # KeyboardInterrupt; narrowed to Exception — still logs and re-raises.
        print("Unable to fetch a file from expected reads object {}".format(ref))
        raise
def get_input_reads(self, params, token):
    """Resolve library (and optional unpaired) object names to workspace
    refs, record them back into the libfile args, and download all the
    reads interleaved via ReadsUtils.

    Returns the 'files' dict from ReadsUtils.download_reads().
    """
    print('in get input reads')
    wsname = params[self.PARAM_IN_WS]

    def _to_ref(name):
        # a name may already be a full 'workspace/object' reference
        return name if '/' in name else wsname + '/' + name

    obj_ids = []
    for libarg in params[self.PARAM_IN_LIBFILE_ARGS]:
        lib_ref = _to_ref(libarg[self.PARAM_IN_LIBRARY])
        obj_ids.append({'ref': lib_ref})
        libarg['ref_library'] = lib_ref
        if libarg.get(self.PARAM_IN_UNPAIRED) is not None:
            unpaired_ref = _to_ref(libarg[self.PARAM_IN_UNPAIRED])
            obj_ids.append({'ref': unpaired_ref})
            libarg['ref_unpaired'] = unpaired_ref

    ws = workspaceService(self.workspaceURL, token=token)
    ws_info = ws.get_object_info_new({'objects': obj_ids})
    reads_params = []
    reftoname = {}
    for info, oid in zip(ws_info, obj_ids):
        reads_params.append(oid['ref'])
        # workspace name + object name, for human-readable messages
        reftoname[oid['ref']] = info[7] + '/' + info[1]

    readcli = ReadsUtils(self.callbackURL, token=token, service_ver='dev')
    typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
               'KBaseFile.PairedEndLibrary ' +
               'KBaseAssembly.SingleEndLibrary ' +
               'KBaseAssembly.PairedEndLibrary')
    try:
        reads = readcli.download_reads({'read_libraries': reads_params,
                                        'interleaved': 'true',
                                        'gzipped': None})['files']
    except ServerError as se:
        self.log('logging stacktrace from dynamic client error')
        self.log(se.data)
        if typeerr not in se.message:
            raise
        prefix = se.message.split('.')[0]
        raise ValueError(prefix + '. Only the types ' +
                         'KBaseAssembly.PairedEndLibrary ' +
                         'and KBaseFile.PairedEndLibrary are supported')
    self.log('Got reads data from converter:\n' + pformat(reads))
    print("READS:")
    pprint(reads)
    return reads
def download_interleaved_reads(callback_url, reads_upa):
    """Fetch a reads library as a single interleaved FASTQ via ReadsUtils
    and return its per-library file-info dict."""
    download_params = {
        'read_libraries': [reads_upa],
        'interleaved': 'true',
        'gzipped': None,
    }
    result = ReadsUtils(callback_url).download_reads(download_params)
    return result['files'][reads_upa]
def run_mash_sketch(self, ctx, params):
    """
    Generate a sketch file from a fasta/fastq file.

    Pass in **one of** input_path, assembly_ref, or reads_ref in `params`:
      input_path   - string - local file path to an input fasta/fastq
      assembly_ref - string - workspace reference to an Assembly type
      reads_ref    - string - workspace reference to a Reads type
    Optionally pass paired_ends (boolean) for paired-end reads.

    :returns: [{'sketch_path': <path of the generated '.msh' sketch file>}]
    :raises ValueError: when none of the three input keys is present.
    """
    # ctx is the context object
    # return variables are: results
    #BEGIN run_mash_sketch
    if 'reads_ref' in params:
        reads_ref = params['reads_ref']
        downloaded = ReadsUtils(self.callbackURL).download_reads({
            'read_libraries': [reads_ref],
            'interleaved': 'true'
        })
        input_path = downloaded['files'][reads_ref]['files']['fwd']
    elif 'assembly_ref' in params:
        fasta = AssemblyUtil(self.callbackURL).get_assembly_as_fasta(
            {'ref': params['assembly_ref']})
        input_path = fasta['path']
    elif 'input_path' in params:
        input_path = params['input_path']
    else:
        raise ValueError(
            'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
        )
    mash_utils = MashUtils(self.config, self.auth_token)
    sketch_path = mash_utils.mash_sketch(
        input_path, paired_ends=params.get('paired_ends'))
    results = {'sketch_path': sketch_path}
    #END run_mash_sketch

    # At some point might do deeper type checking...
    if not isinstance(results, dict):
        raise ValueError('Method run_mash_sketch return value ' +
                         'results is not type dict as required.')
    # return the results
    return [results]
def _upload_reads(self, refid, callbackURL, input_params):
    """Download the reads object `refid`, then re-upload its file(s) as a
    new reads object named by input_params; returns the new object ref.

    Upload parameters depend on the downloaded file layout (otype):
    interleaved, paired (separate rev file), or single.
    """
    reads_util = ReadsUtils(callbackURL)
    downloaded = reads_util.download_reads({'read_libraries': [refid]})
    files = downloaded['files'][refid]['files']
    fwd_file = files['fwd']
    otype = files['otype']

    def _base_upload_params(rev_file):
        # fields common to every upload variant
        return {'fwd_file': fwd_file,
                'wsname': input_params['workspace_name'],
                'name': input_params['output'],
                'rev_file': rev_file,
                'sequencing_tech': input_params['sequencing_tech'],
                'single_genome': input_params['single_genome']}

    upload_params = {}
    if otype == 'interleaved':
        upload_params = _base_upload_params('')
        upload_params['interleaved'] = 1
    elif otype == 'paired':
        upload_params = _base_upload_params(files['rev'])
    elif otype == 'single':
        upload_params = _base_upload_params('')
    return reads_util.upload_reads(upload_params)['obj_ref']
def ru_reads_download(logger, ref, tdir, token):
    """Download the reads object `ref` via ReadsUtils and move its file(s)
    into `tdir`.

    :param logger: logger for progress messages.
    :param ref: workspace reference of the reads object.
    :param tdir: target directory for the downloaded FASTQ file(s).
    :param token: auth token passed to ReadsUtils.
    :returns: the download_reads() result dict, augmented with 'fwd' (and,
        for paired libraries, 'rev') paths inside `tdir`.
    """
    check_disk_space(logger)
    logger.info("{0} will be downloaded and transferred to {1}".format(ref, tdir))
    ru = ReadsUtils(url=os.environ['SDK_CALLBACK_URL'], token=token)
    ds = ru.download_reads({"read_libraries": [ref], "interleaved": "false"})
    logger.info("{0} will be downloaded and transferred to {1}".format(ref, tdir))
    files = ds['files'][ref]['files']
    ds['fwd'] = os.path.join(tdir, os.path.basename(files['fwd']))
    os.rename(files['fwd'], ds['fwd'])
    if files['type'] == 'paired':
        if files['rev_name'] is None:
            # no original file name recorded; use a fixed name
            ds['rev'] = os.path.join(tdir, 'rev.fastq')
        else:
            ds['rev'] = os.path.join(tdir, os.path.basename(files['rev']))
        # BUG FIX: the rename previously happened only in the else branch, so
        # when rev_name was None, ds['rev'] pointed at a path the file was
        # never moved to; move the reverse file in both cases.
        os.rename(files['rev'], ds['rev'])
    logger.info("{0} will be downloaded and transferred to {1}".format(ref, tdir))
    return ds
def fetch_reads_files(self, reads_upas):
    """
    From a list of reads UPAs, uses ReadsUtils to fetch the reads as files.
    Returns them as a dictionary from reads_upa -> filename

    :raises ValueError: if reads_upas is None or empty.
    """
    if reads_upas is None:
        raise ValueError("reads_upas must be a list of UPAs")
    if len(reads_upas) == 0:
        raise ValueError("reads_upas must contain at least one UPA")
    download_params = {
        'read_libraries': reads_upas,
        'interleaved': 'true',
        'gzipped': None,
    }
    reads_info = ReadsUtils(self.callback_url).download_reads(download_params)['files']
    return {upa: info['files']['fwd'] for upa, info in reads_info.items()}
def run_idba_ud(self, ctx, params):
    """
    Run IDBA on paired end libraries.

    :param params: instance of type "idba_ud_Params":
        workspace_name - the name of the workspace from which to take input
            and store output.
        read_libraries - list of Illumina PairedEndLibrary names/refs to
            assemble (KBaseAssembly or KBaseFile types).
        output_contigset_name - the name of the output contigset.
        min_contig_length - minimum length of contigs to output, default 2000.
        @optional kval_args - k values for idba_ud (mink_arg, maxk_arg,
            step_arg); UI elements for these were removed based on feedback.
    :returns: instance of type "idba_ud_Output":
        report_name - the name of the KBaseReport.Report workspace object.
        report_ref - the workspace reference of the report.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_idba_ud
    print("=================== IN run_idba_ud")
    print("PARAMS: ")
    pprint(params)
    print("============================ END OF PARAMS: ")
    # A whole lot of this is adapted or outright copied from
    # https://github.com/msneddon/MEGAHIT
    self.log('Running run_idba_ud with params:\n' + pformat(params))
    token = ctx['token']

    # the reads should really be specified as a list of absolute ws refs
    # but the narrative doesn't do that yet
    self.process_params(params)

    # get absolute refs from ws: names without a '/' are qualified with the
    # workspace name
    wsname = params[self.PARAM_IN_WS]
    obj_ids = []
    for r in params[self.PARAM_IN_LIB]:
        obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
    ws = workspaceService(self.workspaceURL, token=token)
    ws_info = ws.get_object_info_new({'objects': obj_ids})
    reads_params = []
    reftoname = {}
    for wsi, oid in zip(ws_info, obj_ids):
        ref = oid['ref']
        reads_params.append(ref)
        # object_info tuple: index 1 is the object name, 7 the workspace name
        obj_name = wsi[1]
        reftoname[ref] = wsi[7] + '/' + obj_name

    # download all read libraries, de-interleaved, via ReadsUtils
    readcli = ReadsUtils(self.callbackURL, token=ctx['token'])
    typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
               'KBaseFile.PairedEndLibrary ' +
               'KBaseAssembly.SingleEndLibrary ' +
               'KBaseAssembly.PairedEndLibrary')
    try:
        reads = readcli.download_reads({
            'read_libraries': reads_params,
            'interleaved': 'false',
            'gzipped': None
        })['files']
    except ServerError as se:
        self.log('logging stacktrace from dynamic client error')
        self.log(se.data)
        # an unsupported-type server error is rewritten into a clearer
        # ValueError; everything else is re-raised unchanged
        if typeerr in se.message:
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')
        else:
            raise
    self.log('Got reads data from converter:\n' + pformat(reads))
    self.check_reads(reads, reftoname)

    # normalize the downloaded file info into the structure exec_idba_ud expects
    reads_data = []
    for ref in reads:
        reads_name = reftoname[ref]
        f = reads[ref]['files']
        print("REF:" + str(ref))
        print("READS REF:" + str(reads[ref]))
        seq_tech = reads[ref]["sequencing_tech"]
        if f['type'] == 'interleaved':
            reads_data.append({
                'fwd_file': f['fwd'],
                'type': 'paired',
                'seq_tech': seq_tech
            })
        elif f['type'] == 'paired':
            reads_data.append({
                'fwd_file': f['fwd'],
                'rev_file': f['rev'],
                'type': 'paired',
                'seq_tech': seq_tech
            })
        elif f['type'] == 'single':
            reads_data.append({
                'fwd_file': f['fwd'],
                'type': 'single',
                'seq_tech': seq_tech
            })
        else:
            raise ValueError('Something is very wrong with read lib' + reads_name)

    # set the output location (millisecond timestamp keeps runs distinct)
    timestamp = int(
        (datetime.utcnow() -
         datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    outdir = os.path.join(self.scratch, 'IDBA_dir' + str(timestamp))

    # run the assembler
    idba_out = self.exec_idba_ud(reads_data, params, outdir)
    self.log('IDBA output dir: ' + idba_out)

    # parse the output and save back to KBase
    output_contigs = os.path.join(idba_out, 'contig.fa')

    self.log('Uploading FASTA file to Assembly')
    assemblyUtil = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver='dev')
    if params.get('min_contig_length', 0) > 0:
        assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': output_contigs
            },
            'workspace_name': wsname,
            'assembly_name': params[self.PARAM_IN_CS_NAME],
            'min_contig_length': params['min_contig_length']
        })
        # load report from scaffolds.fasta
        # NOTE(review): the report is built from '<contigs>.filtered.fa' —
        # presumably written by save_assembly_from_fasta when
        # min_contig_length is given; confirm against AssemblyUtil.
        report_name, report_ref = self.load_report(
            output_contigs + '.filtered.fa', params, wsname)
    else:
        assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': output_contigs
            },
            'workspace_name': wsname,
            'assembly_name': params[self.PARAM_IN_CS_NAME]
        })
        # load report from scaffolds.fasta
        report_name, report_ref = self.load_report(output_contigs, params,
                                                   wsname)
    output = {'report_name': report_name, 'report_ref': report_ref}
    #END run_idba_ud

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_idba_ud return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def exec_megahit(self, ctx, params):
    """
    Actual execution of MEGAHIT.

    Accepts a ReadsSet or a ReadsLibrary as input and creates Assembly
    object(s) as output. Will eventually also create an AssemblySet object
    if the input is a ReadsSet and not running a combined assembly.

    :param params: instance of type "ExecMegaHitParams":
        workspace_name, input_reads_ref, output_contigset_name (required);
        combined_assembly_flag (required when input is a ReadsSet);
        megahit_parameter_preset, min_count, k_min, k_max, k_step, k_list,
        min_contig_len (optional, passed through to MEGAHIT).
    :returns: instance of type "ExecMegaHitOutput":
        report_text - the assembly report string.
        output_contigset_ref - list of saved Assembly object refs.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN exec_megahit
    console = []
    self.log(console, 'Running exec_megahit() with params=')
    self.log(console, "\n" + pformat(params))
    #SERVICE_VER = 'dev'  # DEBUG
    SERVICE_VER = 'release'

    ### STEP 0: init
    token = ctx['token']
    wsClient = workspaceService(self.workspaceURL, token=token)
    # NOTE(review): headers and env are built here but not used later in
    # this method — possibly leftovers from a subprocess-based variant.
    headers = {'Authorization': 'OAuth ' + token}
    env = os.environ.copy()
    env['KB_AUTH_TOKEN'] = token

    ### STEP 1: basic parameter checks + parsing
    required_params = [
        'workspace_name', 'input_reads_ref', 'output_contigset_name'
    ]
    for required_param in required_params:
        # NOTE(review): `== None` should be `is None` (PEP 8); kept as-is.
        if required_param not in params or params[required_param] == None:
            raise ValueError("Must define required param: '" +
                             required_param + "'")

    ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
    input_reads_ref = params['input_reads_ref']
    input_reads_name = None
    try:
        # named indices into the workspace object_info tuple
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        input_reads_obj_info = wsClient.get_object_info_new(
            {'objects': [{
                'ref': input_reads_ref
            }]})[0]
        input_reads_obj_type = re.sub(
            '-[0-9]+\.[0-9]+$', "",
            input_reads_obj_info[TYPE_I])  # remove trailing version
        input_reads_name = input_reads_obj_info[NAME_I]
    except Exception as e:
        raise ValueError('Unable to get reads object from workspace: (' +
                         input_reads_ref + ')' + str(e))

    accepted_input_types = [
        "KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"
    ]
    if input_reads_obj_type not in accepted_input_types:
        raise ValueError("Input reads of type '" + input_reads_obj_type +
                         "' not accepted. Must be one of " +
                         ", ".join(accepted_input_types))

    # a ReadsSet additionally requires the combined_assembly_flag param
    if input_reads_obj_type == "KBaseSets.ReadsSet":
        required_param = 'combined_assembly_flag'
        if required_param not in params or params[required_param] == None:
            raise ValueError("Must define required param: '" +
                             required_param + "'")

    ### STEP 3: get the list of library references
    if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
        readsSet_ref_list = [input_reads_ref]
        readsSet_names_list = [input_reads_name]

    elif input_reads_obj_type == "KBaseSets.ReadsSet":
        readsSet_ref_list = []
        readsSet_names_list = []
        try:
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=ctx['token'])  # for dynamic service
            #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
        except Exception as e:
            raise ValueError(
                "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '"
                + self.serviceWizardURL + "' token: '" + ctx['token'] + "'" +
                str(e))
            #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"' token: '"+ctx['token']+"'" + str(e))
        try:
            input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                'ref': input_reads_ref,
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError(
                'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                + str(input_reads_ref) + ")\n" + str(e))
        for readsLibrary_obj in input_readsSet_obj['data']['items']:
            readsSet_ref_list.append(readsLibrary_obj['ref'])
            NAME_I = 1
            readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
    else:
        raise ValueError("Input reads of type '" + input_reads_obj_type +
                         "' not accepted. Must be one of " +
                         ", ".join(accepted_input_types))

    ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine
    if input_reads_obj_type == "KBaseSets.ReadsSet" and params[
            'combined_assembly_flag'] != 0:
        self.log(
            console,
            "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES"
        )

        # make dir (millisecond timestamp keeps runs distinct)
        timestamp = int(
            (datetime.utcnow() -
             datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
        if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
            input_dir = os.path.join(self.host_scratch,
                                     'input.' + str(timestamp))
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # connect to ReadsUtils Client
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
        except:
            # NOTE(review): `e` is not bound in this bare except clause, so a
            # failure here raises NameError instead of the intended
            # ValueError — should be `except Exception as e:`.
            raise ValueError("Unable to get readsUtils_Client\n" + str(e))

        # start combined file
        read_buf_size = 65536
        write_buf_size = 65536
        combined_input_fwd_path = os.path.join(input_dir,
                                               'input_reads_fwd.fastq')
        combined_input_rev_path = os.path.join(input_dir,
                                               'input_reads_rev.fastq')
        combined_input_fwd_handle = open(combined_input_fwd_path, 'w',
                                         write_buf_size)
        combined_input_rev_handle = open(combined_input_rev_path, 'w',
                                         write_buf_size)

        # add libraries, one at a time: download each member, then stream
        # its fwd/rev files onto the combined fwd/rev files in chunks
        for this_input_reads_ref in readsSet_ref_list:
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                + str(this_input_reads_ref))
            try:
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [this_input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    this_input_reads_ref + ")\n" + str(e))
            this_input_fwd_path = readsLibrary['files'][
                this_input_reads_ref]['files']['fwd']
            this_input_rev_path = readsLibrary['files'][
                this_input_reads_ref]['files']['rev']

            # append fwd
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: "
                + str(this_input_reads_ref))
            this_input_path = this_input_fwd_path
            cat_file_handle = combined_input_fwd_handle
            with open(this_input_path, 'r',
                      read_buf_size) as this_input_handle:
                while True:
                    read_data = this_input_handle.read(read_buf_size)
                    if read_data:
                        cat_file_handle.write(read_data)
                    else:
                        break
            os.remove(
                this_input_path
            )  # create space since we no longer need the piece file

            # append rev
            this_input_path = this_input_rev_path
            cat_file_handle = combined_input_rev_handle
            with open(this_input_path, 'r',
                      read_buf_size) as this_input_handle:
                while True:
                    read_data = this_input_handle.read(read_buf_size)
                    if read_data:
                        cat_file_handle.write(read_data)
                    else:
                        break
            os.remove(
                this_input_path
            )  # create space since we no longer need the piece file

        combined_input_fwd_handle.close()
        combined_input_rev_handle.close()

    ### STEP 5: finally run MegaHit_Sets
    # NOTE(review): this is an alias, not a copy — the input paths set below
    # are written into the caller-supplied params dict.
    exec_megahit_single_library_params = params
    output_assemblyset_contigset_paths = []
    output_contigset_path = None

    # PairedEndLibrary
    if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
        self.log(
            console,
            "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: "
            + str(input_reads_ref))
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [input_reads_ref],
                'interleaved': 'false'
            })
        except Exception as e:
            raise ValueError(
                'Unable to get reads object from workspace: (' +
                input_reads_ref + ")\n" + str(e))
        input_fwd_path = readsLibrary['files'][input_reads_ref]['files'][
            'fwd']
        input_rev_path = readsLibrary['files'][input_reads_ref]['files'][
            'rev']
        exec_megahit_single_library_params[
            'input_fwd_path'] = input_fwd_path
        exec_megahit_single_library_params[
            'input_rev_path'] = input_rev_path

        # the key line
        output_contigset_path = self.exec_megahit_single_library(
            exec_megahit_single_library_params)
        output_assemblyset_contigset_paths.append(output_contigset_path)

        os.remove(input_fwd_path)  # files can be really big
        os.remove(input_rev_path)

    # ReadsSet combined (already downloaded and combined fastqs)
    elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
            'combined_assembly_flag'] != 0:
        input_fwd_path = combined_input_fwd_path
        input_rev_path = combined_input_rev_path
        exec_megahit_single_library_params[
            'input_fwd_path'] = input_fwd_path
        exec_megahit_single_library_params[
            'input_rev_path'] = input_rev_path

        # the key line
        output_contigset_path = self.exec_megahit_single_library(
            exec_megahit_single_library_params)
        output_assemblyset_contigset_paths.append(output_contigset_path)

        os.remove(input_fwd_path)  # files can be really big
        os.remove(input_rev_path)

    # ReadsSet uncombined (still have to download)
    elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
            'combined_assembly_flag'] == 0:
        # connect to ReadsUtils Client
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
        except:
            # NOTE(review): `e` is unbound here too (see STEP 4 note).
            raise ValueError("Unable to get readsUtils_Client\n" + str(e))

        # get libraries, one at a time, and run MegaHit_Sets on each
        output_assemblyset_contigset_paths = []
        for this_input_reads_ref in readsSet_ref_list:
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                + str(this_input_reads_ref))
            try:
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [this_input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    this_input_reads_ref + ")\n" + str(e))
            this_input_fwd_path = readsLibrary['files'][
                this_input_reads_ref]['files']['fwd']
            this_input_rev_path = readsLibrary['files'][
                this_input_reads_ref]['files']['rev']
            exec_megahit_single_library_params[
                'input_fwd_path'] = this_input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = this_input_rev_path

            # the key line
            this_output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(
                this_output_contigset_path)

            os.remove(this_input_fwd_path)  # files can be really big
            os.remove(this_input_rev_path)

    # just in case we've confused ourselves
    else:
        raise ValueError("error in logic")

    ### STEP 6: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
    output_contigset_refs = []
    output_contigset_names = []
    for i, this_output_contigset_path in enumerate(
            output_assemblyset_contigset_paths):
        # a single output keeps the requested name; multiple outputs are
        # prefixed with the member library name
        if len(output_assemblyset_contigset_paths) == 1:
            assembly_name = params['output_contigset_name']
        else:
            assembly_name = readsSet_names_list[i] + '-' + params[
                'output_contigset_name']
        this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': this_output_contigset_path
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_name
        })
        output_contigset_refs.append(this_output_data_ref)
        output_contigset_names.append(assembly_name)

    ### STEP 7: generate the report text
    # compute a simple contig length distribution for the report
    report = ''
    for i, this_output_contigset_path in enumerate(
            output_assemblyset_contigset_paths):
        report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[
            i] + "\n"
        report += "-------------------------------------------------------------\n"
        report += "\n"
        lengths = []
        for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
            lengths.append(len(seq_record.seq))
        report += 'ContigSet saved to: ' + params[
            'workspace_name'] + '/' + output_contigset_names[i] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        # NOTE(review): raises ZeroDivisionError if the assembly produced no
        # contigs — confirm whether that can happen here.
        report += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += ' ' + str(counts[c]) + '\t--\t' + str(
                edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

    ### STEP 8: contruct the output to send back
    output = {
        'report_text': report,
        'output_contigset_refs': output_contigset_refs
    }
    #END exec_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method exec_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_megahit(self, ctx, params):
    """
    Assemble a KBase paired-end read library with MEGAHIT, save the
    contigs as an Assembly object, and produce a QUAST report.

    :param params: dict with required keys:
        workspace_name        - workspace for input/output
        read_library_ref      - ref of the PE read library (SE not supported)
        output_contigset_name - name for the saved Assembly object
      and optional MEGAHIT tuning keys, forwarded only when truthy:
        megahit_parameter_preset, min_count, k_min, k_max, k_step,
        k_list, min_contig_length
    :returns: [{'report_name': ..., 'report_ref': ...}]
    :raises ValueError: on missing required params, a bad
        min_contig_length, or a non-zero MEGAHIT exit code
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_megahit
    print('Running run_megahit with params=')
    pprint(params)

    # STEP 1: basic parameter checks + parsing
    for required in ['workspace_name', 'read_library_ref',
                     'output_contigset_name']:
        if required not in params:
            raise ValueError(required + ' parameter is required')

    # STEP 2: get the read library as deinterleaved fastq files
    input_ref = params['read_library_ref']
    reads_params = {'read_libraries': [input_ref],
                    'interleaved': 'false',
                    'gzipped': None}
    ru = ReadsUtils(self.callbackURL)
    reads = ru.download_reads(reads_params)['files']

    print('Input reads files:')
    fwd = reads[input_ref]['files']['fwd']
    rev = reads[input_ref]['files']['rev']
    pprint('forward: ' + fwd)
    pprint('reverse: ' + rev)

    # STEP 3: run megahit
    # construct the command; we only support PE reads
    megahit_cmd = [self.MEGAHIT, '-1', fwd, '-2', rev]

    # optional tuning parameters are forwarded only when present and truthy
    if params.get('megahit_parameter_preset'):
        megahit_cmd += ['--presets', params['megahit_parameter_preset']]
    if params.get('min_count'):
        megahit_cmd += ['--min-count', str(params['min_count'])]
    if params.get('k_min'):
        megahit_cmd += ['--k-min', str(params['k_min'])]
    if params.get('k_max'):
        megahit_cmd += ['--k-max', str(params['k_max'])]
    if params.get('k_step'):
        megahit_cmd += ['--k-step', str(params['k_step'])]
    if params.get('k_list'):
        megahit_cmd += ['--k-list',
                        ','.join(str(k_val) for k_val in params['k_list'])]

    min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
    if params.get('min_contig_length'):
        if str(params['min_contig_length']).isdigit():
            min_contig_length = params['min_contig_length']
        else:
            raise ValueError('min_contig_length parameter must be a non-negative integer')
    megahit_cmd += ['--min-contig-len', str(min_contig_length)]

    # set the output location (megahit requires a fresh directory)
    timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    megahit_cmd += ['-o', output_dir]

    # run megahit
    print('running megahit:')
    print('    ' + ' '.join(megahit_cmd))
    p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
    retcode = p.wait()

    print('Return code: ' + str(retcode))
    if p.returncode != 0:
        raise ValueError('Error running MEGAHIT, return code: ' + str(retcode) + '\n')

    output_contigs = os.path.join(output_dir, 'final.contigs.fa')

    # on macs, we cannot run megahit in the shared host scratch space,
    # so we need to move the file there
    if self.mac_mode:
        shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
        output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

    # STEP 4: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL)
    output_data_ref = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': output_contigs},
        'workspace_name': params['workspace_name'],
        'assembly_name': params['output_contigset_name']
    })

    # STEP 5: generate and save the report
    # compute a simple contig length distribution for the report
    lengths = [len(seq_record.seq)
               for seq_record in SeqIO.parse(output_contigs, 'fasta')]

    report = ''
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    # Guard against an empty assembly: the average and np.histogram are
    # undefined for zero contigs and previously raised ZeroDivisionError.
    if lengths:
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

    print('Running QUAST')
    kbq = kb_quast(self.callbackURL)
    try:
        quastret = kbq.run_QUAST(
            {'files': [{'path': output_contigs,
                        'label': params['output_contigset_name']}]})
    except QUASTError as qe:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from running QUAST')
        print(str(qe))
        # TODO delete shock node
        raise

    print('Saving report')
    kbr = KBaseReport(self.callbackURL)
    try:
        report_info = kbr.create_extended_report(
            {'message': report,
             'objects_created': [{'ref': output_data_ref,
                                  'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}],
             'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
             'workspace_name': params['workspace_name']})
    except _RepError as re:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from creating report object')
        print(str(re))
        # TODO delete shock node
        raise

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref']}
    #END run_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def runFastQC(self, ctx, input_params):
    """
    Run FastQC on a reads library (or every library of a SampleSet),
    assemble the per-file HTML reports into one paged index page, and
    save a KBaseReport with the HTML and zip outputs.

    :param input_params: dict with 'input_ws' plus one of
        'input_file' / 'input_file_ref' (resolved by
        self._get_input_file_ref_from_params)
    :returns: [{'report_name': ..., 'report_ref': ...}]
    :raises ValueError: when the library object cannot be fetched
    """
    # ctx is the context object
    # return variables are: reported_output
    #BEGIN runFastQC
    token = ctx['token']
    wsClient = workspaceService(self.workspaceURL, token=token)

    # stage downloaded reads into a unique scratch subdirectory
    uuid_string = str(uuid.uuid4())
    read_file_path = self.scratch + "/" + uuid_string
    os.mkdir(read_file_path)

    input_file_ref = self._get_input_file_ref_from_params(input_params)

    library = None
    try:
        library = wsClient.get_objects2(
            {'objects': [{'ref': input_file_ref}]})['data'][0]
    except Exception as e:
        raise ValueError(
            'Unable to get read library object from workspace: (' +
            input_file_ref + ')' + str(e))

    download_read_params = {'read_libraries': [], 'interleaved': "false"}
    if ("SingleEnd" in library['info'][2] or "PairedEnd" in library['info'][2]):
        download_read_params['read_libraries'].append(
            library['info'][7] + "/" + library['info'][1])
    elif ("SampleSet" in library['info'][2]):
        # expand a SampleSet into its member libraries
        for sample_id in library['data']['sample_ids']:
            download_read_params['read_libraries'].append(
                library['info'][7] + "/" + sample_id)

    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    ret = ru.download_reads(download_read_params)

    read_file_list = list()
    for lib_key in ret['files']:
        files = ret['files'][lib_key]['files']

        fwd_name = files['fwd'].split('/')[-1]
        fwd_name = fwd_name.replace('.gz', '')
        shutil.move(files['fwd'], os.path.join(read_file_path, fwd_name))
        read_file_list.append(os.path.join(read_file_path, fwd_name))

        # Single-end downloads may carry no reverse file; use .get() so a
        # missing 'rev' key doesn't raise KeyError (None means absent too).
        if (files.get('rev') is not None):
            rev_name = files['rev'].split('/')[-1]
            rev_name = rev_name.replace('.gz', '')
            shutil.move(files['rev'], os.path.join(read_file_path, rev_name))
            read_file_list.append(os.path.join(read_file_path, rev_name))

    subprocess.check_output(["fastqc"] + read_file_list)
    report = "Command run: " + " ".join(["fastqc"] + read_file_list)

    # build a paged index.html around the per-file FastQC reports
    output_html_files = list()
    output_zip_files = list()
    first_file = ""
    html_string = ""
    html_count = 0
    with open('/kb/data/index_start.txt', 'r') as start_file:
        html_string = start_file.read()

    for file_name in os.listdir(read_file_path):
        label = ".".join(file_name.split(".")[1:])
        if (file_name.endswith(".zip")):
            output_zip_files.append({
                'path': read_file_path + "/" + file_name,
                'name': file_name,
                'label': label,
                'description': 'Zip file generated by fastqc that contains original images seen in the report'
            })
        if (file_name.endswith(".html")):
            if (first_file == ""):
                first_file = file_name
            output_html_files.append({
                'path': read_file_path + "/" + file_name,
                'name': file_name,
                'label': label,
                'description': 'HTML file generated by fastqc that contains report on quality of reads'
            })
            html_string += " <button data-button=\"page " + str(
                html_count) + "\" data-page=\"" + file_name + "\">Page " + str(
                html_count + 1) + "</button>\n"
            html_count += 1

    html_string += " </div> </div> <div id=\"body\">\n <iframe id=\"content\" style=\"width: 100%; border: none; \" src=\"" + first_file + "\"></iframe>\n </div>"
    with open('/kb/data/index_end.txt', 'r') as end_file:
        html_string += end_file.read()

    with open(read_file_path + "/index.html", 'w') as index_file:
        index_file.write(html_string)

    # index.html goes last, so its position equals html_count — that is
    # what direct_html_link_index points at below
    output_html_files.append({
        'path': read_file_path + "/index.html",
        'name': "index.html",
        'label': "index.html",
        'description': 'HTML file generated by fastqc that contains report on quality of reads'
    })

    report_params = {
        'objects_created': [],
        # 'message' : report,
        # 'direct_html' : html_string,
        'direct_html_link_index': html_count,
        'file_links': output_zip_files,
        'html_links': output_html_files,
        'workspace_name': input_params['input_ws'],
        'report_object_name': 'kb_fastqc_report_' + uuid_string
    }
    kbase_report_client = KBaseReport(self.callback_url, token=token)
    output = kbase_report_client.create_extended_report(report_params)
    reported_output = {
        'report_name': output['name'],
        'report_ref': output['ref']
    }

    # Remove temp reads directory
    shutil.rmtree(read_file_path, ignore_errors=True)
    #END runFastQC

    # At some point might do deeper type checking...
    if not isinstance(reported_output, dict):
        raise ValueError('Method runFastQC return value ' +
                         'reported_output is not type dict as required.')
    # return the results
    return [reported_output]
class DataStagingUtils(object):
    """
    Stage KBase reads objects (PairedEnd/SingleEnd libraries or ReadsSets)
    into scratch as FASTQ files for Kaiju, optionally producing randomly
    subsampled replicate libraries.
    """

    def __init__(self, config, ctx):
        """Set up scratch dir and service clients from the module config."""
        self.ctx = ctx
        self.scratch = os.path.abspath(config['scratch'])
        self.ws_url = config['workspace-url']
        self.serviceWizardURL = config['srv-wiz-url']
        self.callbackURL = config['SDK_CALLBACK_URL']
        if not os.path.exists(self.scratch):
            os.makedirs(self.scratch)

        self.SE_flag = 'SE'
        self.PE_flag = 'PE'

        SERVICE_VER = 'release'

        # readsUtils_Client
        try:
            self.readsUtils_Client = ReadsUtils(self.callbackURL,
                                                token=self.ctx['token'],
                                                service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate readsUtils_Client with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            self.setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

    def expand_input(self, input_refs):
        """
        Expand input based on an input data reference for Kaiju.

        input_refs can be a list of references to a PairedEndLibrary,
        a SingleEndLibrary, or a ReadsSet; sets are expanded into their
        member libraries.  Returns a non-redundant list of
        {'ref', 'name', 'type'} dicts where 'type' is self.PE_flag or
        self.SE_flag.
        """
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # expand any sets and build a non-redundant list of reads input objs
        ws = Workspace(self.ws_url)
        expanded_input = []
        input_ref_seen = dict()
        SE_types = ['KBaseFile.SingleEndLibrary',
                    'KBaseAssembly.SingleEndLibrary']
        PE_types = ['KBaseFile.PairedEndLibrary',
                    'KBaseAssembly.PairedEndLibrary']

        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple

        for input_ref in input_refs:
            input_info = ws.get_object_info3(
                {'objects': [{'ref': input_ref}]})['infos'][0]
            obj_name = input_info[NAME_I]
            type_name = input_info[TYPE_I].split('-')[0]

            # ReadsSet
            if type_name in ['KBaseSets.ReadsSet']:
                try:
                    input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({
                        'ref': input_ref,
                        'include_item_info': 1
                    })
                except Exception as e:
                    raise ValueError(
                        'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                        + str(input_ref) + ")\n" + str(e))

                for readsLibrary_obj in input_readsSet_obj['data']['items']:
                    this_reads_ref = readsLibrary_obj['ref']
                    if this_reads_ref in input_ref_seen:
                        continue
                    input_ref_seen[this_reads_ref] = True

                    this_reads_name = readsLibrary_obj['info'][NAME_I]
                    reads_item_type = readsLibrary_obj['info'][TYPE_I]
                    reads_item_type = re.sub(
                        r'-[0-9]+\.[0-9]+$', "",
                        reads_item_type)  # remove trailing version
                    if reads_item_type in PE_types:
                        this_reads_type = self.PE_flag
                    elif reads_item_type in SE_types:
                        this_reads_type = self.SE_flag
                    else:
                        raise ValueError("Can't handle read item type '" +
                                         reads_item_type + "' obj_name: '" +
                                         this_reads_name + " in Set: '" +
                                         str(input_ref) + "'")
                    expanded_input.append({
                        'ref': this_reads_ref,
                        'name': this_reads_name,
                        'type': this_reads_type
                    })

            # SingleEnd Library
            elif type_name in SE_types:
                this_reads_ref = input_ref
                if this_reads_ref in input_ref_seen:
                    continue
                input_ref_seen[this_reads_ref] = True
                this_reads_name = obj_name
                this_reads_type = self.SE_flag
                expanded_input.append({
                    'ref': this_reads_ref,
                    'name': this_reads_name,
                    'type': this_reads_type
                })

            # PairedEnd Library
            elif type_name in PE_types:
                this_reads_ref = input_ref
                if this_reads_ref in input_ref_seen:
                    continue
                input_ref_seen[this_reads_ref] = True
                this_reads_name = obj_name
                this_reads_type = self.PE_flag
                expanded_input.append({
                    'ref': this_reads_ref,
                    'name': this_reads_name,
                    'type': this_reads_type
                })

            else:
                raise ValueError("Illegal type in input_refs: " +
                                 str(obj_name) + " (" + str(input_ref) +
                                 ") is of type: '" + str(type_name) + "'")

        return expanded_input

    def stage_input(self,
                    input_item=None,
                    subsample_percent=10,
                    subsample_replicates=1,
                    subsample_seed=1,
                    fasta_file_extension='fastq'):
        """
        Stage one expanded input item (from expand_input) into scratch.

        Downloads the library via ReadsUtils, renames the files to
        <name>.fwd/.rev.<fasta_file_extension>, verifies they are
        non-empty, and (when subsample_percent != 100) replaces the
        original with randomly subsampled replicates.

        Returns {'replicate_input': [<item dicts with fwd_file/rev_file>]}.
        """
        # init
        staged_input = dict()
        replicate_input = []

        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'input_reads_' + suffix)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        #
        # Download reads
        #

        # Paired End Lib
        if input_item['type'] == self.PE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads({
                    'read_libraries': [input_item['ref']],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            input_rev_file_path = readsLibrary['files'][
                input_item['ref']]['files']['rev']
            fwd_filename = os.path.join(
                input_dir,
                input_item['name'] + '.fwd.' + fasta_file_extension)
            rev_filename = os.path.join(
                input_dir,
                input_item['name'] + '.rev.' + fasta_file_extension)
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            if input_rev_file_path != rev_filename:
                shutil.move(input_rev_file_path, rev_filename)
            input_item['fwd_file'] = fwd_filename
            input_item['rev_file'] = rev_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            if not os.path.isfile(rev_filename):
                raise ValueError('Error generating reads file ' + rev_filename)
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))
            if not self._fasta_seq_len_at_least(rev_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(rev_filename))

        # Single End Lib
        elif input_item['type'] == self.SE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads(
                    {'read_libraries': [input_item['ref']]})
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            input_fwd_file_path = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            fwd_filename = os.path.join(
                input_dir,
                input_item['name'] + '.fwd.' + fasta_file_extension)
            if input_fwd_file_path != fwd_filename:
                shutil.move(input_fwd_file_path, fwd_filename)
            input_item['fwd_file'] = fwd_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))

        else:
            raise ValueError("No type set for input library " +
                             str(input_item['name']) + " (" +
                             str(input_item['ref']) + ")")

        #
        # Subsample
        #
        if subsample_percent == 100:
            replicate_input = [input_item]
        else:
            replicate_input = self._randomly_subsample_reads(
                input_item,
                subsample_percent=subsample_percent,
                subsample_replicates=subsample_replicates,
                subsample_seed=subsample_seed)
            # free up disk: originals only removed after replicates exist
            os.remove(input_item['fwd_file'])
            if input_item['type'] == self.PE_flag:
                os.remove(input_item['rev_file'])

        # return input file info
        #staged_input['input_dir'] = input_dir
        #staged_input['folder_suffix'] = suffix
        staged_input['replicate_input'] = replicate_input
        return staged_input

    def _randomly_subsample_reads(self,
                                  input_item=None,
                                  subsample_percent=100,
                                  subsample_replicates=1,
                                  subsample_seed=1):
        """
        Split a staged library into `subsample_replicates` disjoint random
        subsets, each holding subsample_percent% of the (paired) reads.

        Writes <base>_fwd_paired-<i>.fastq (and _rev_paired-<i> for PE)
        next to the input files and returns a list of item dicts
        describing the replicate files.  Reads are matched fwd/rev by
        read id after stripping pair suffixes.
        """
        replicate_files = []
        split_num = subsample_replicates

        # for now can only do percentage instead of raw cnt of reads per subsample
        use_reads_num = False
        use_reads_perc = True
        reads_num = 0  # not used.  subsample_percent used instead

        # init randomizer
        random.seed(subsample_seed)

        # Paired End
        #
        if input_item['type'] == self.PE_flag:
            print("SUBSAMPLING PE library " + input_item['name'])  # DEBUG

            # file paths
            input_fwd_path = re.sub(r"\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub(r"\.FASTQ$", "", input_fwd_path)
            input_rev_path = re.sub(r"\.fastq$", "", input_item['rev_file'])
            input_rev_path = re.sub(r"\.FASTQ$", "", input_rev_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"
            output_rev_paired_file_path_base = input_rev_path + "_rev_paired"

            # set up for file io
            total_paired_reads = 0
            total_unpaired_fwd_reads = 0
            total_unpaired_rev_reads = 0
            total_paired_reads_by_set = []
            fwd_ids = dict()
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 1000000

            # read fwd file to get fwd ids
            print("GETTING IDS")  # DEBUG
            # NOTE: open(..., 'r') — the old third arg 0 (unbuffered) is
            # invalid for text mode on Python 3
            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub(r"[ \t]+.*$", "", read_id)
                        # strip pair suffixes such as /1, _R, .f etc.
                        read_id = re.sub(
                            r"[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id)
                        fwd_ids[read_id] = True

            # read reverse to determine paired
            print("DETERMINING PAIRED IDS")  # DEBUG
            with open(input_item['rev_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub(r"[ \t]+.*$", "", read_id)
                        read_id = re.sub(
                            r"[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id)
                        # membership test: a rev-only read previously
                        # raised KeyError via fwd_ids[read_id]
                        if read_id in fwd_ids:
                            paired_ids[read_id] = True
                            paired_ids_list.append(read_id)

            total_paired_reads = len(paired_ids_list)
            print("TOTAL PAIRED READS CNT: " + str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # split fwd paired
            print("WRITING FWD SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        # flush the previous record before starting a new one
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                total_paired_reads_by_set[lib_i] += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub(r"[ \t]+.*$", "", read_id)
                        read_id = re.sub(
                            r"[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id)
                        last_read_id = read_id
                        # explicit membership test instead of bare except:
                        if read_id in paired_lib_i:
                            capture_type_paired = True
                        else:
                            total_unpaired_fwd_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()
            print("\t" + str(paired_cnt) + " FWD recs processed")

            # split rev paired
            print("WRITING REV SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_rev_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['rev_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub(r"[ \t]+.*$", "", read_id)
                        read_id = re.sub(
                            r"[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id)
                        last_read_id = read_id
                        if read_id in paired_lib_i:
                            capture_type_paired = True
                        else:
                            total_unpaired_rev_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()
            print("\t" + str(paired_cnt) + " REV recs processed")

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL PAIRED READS: " + str(total_paired_reads) + "\n"
            report += "TOTAL UNPAIRED FWD READS (discarded): " + str(
                total_unpaired_fwd_reads) + "\n"
            report += "TOTAL UNPAIRED REV READS (discarded): " + str(
                total_unpaired_rev_reads) + "\n"
            report += "\n"
            for lib_i in range(split_num):
                report += "PAIRED READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            print("MAKING REPLICATE OBJECT")  # DEBUG
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                output_rev_paired_file_path = output_rev_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                        or os.path.getsize (output_fwd_paired_file_path) == 0 \
                        or not os.path.isfile (output_rev_paired_file_path) \
                        or os.path.getsize (output_rev_paired_file_path) == 0:
                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) - len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file': output_fwd_paired_file_path,
                        'rev_file': output_rev_paired_file_path,
                        'ref': input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type': input_item['type'],
                        'name': input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        # SingleEndLibrary
        #
        elif input_item['type'] == self.SE_flag:
            print("SUBSAMPLING SE library " + input_item['name'])

            # file paths
            input_fwd_path = re.sub(r"\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub(r"\.FASTQ$", "", input_fwd_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"

            # get "paired" ids
            print("DETERMINING IDS")  # DEBUG
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 100000

            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub(r"[ \t]+.*$", "", read_id)
                        read_id = re.sub(
                            r"[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id)
                        if read_id in paired_ids:
                            raise ValueError("repeat read_id: " + read_id)
                        paired_ids[read_id] = True
                        paired_ids_list.append(read_id)

            total_paired_reads = len(paired_ids_list)
            print("TOTAL READS CNT: " + str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # set up for file io
            total_paired_reads = 0
            total_paired_reads_by_set = []
            paired_buf_size = 1000000

            # split reads
            print("WRITING SPLIT SINGLE END READS")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            recs_beep_n = 1000000

            with open(input_item['fwd_file'], 'r') as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        total_paired_reads += 1
                        if last_read_id != None:
                            # only reads selected into a sublibrary are written
                            if last_read_id in paired_lib_i:
                                lib_i = paired_lib_i[last_read_id]
                                total_paired_reads_by_set[lib_i] += 1
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                            if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                print("\t" + str(paired_cnt) +
                                      " recs processed")
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub(r"[ \t]+.*$", "", read_id)
                        read_id = re.sub(
                            r"[\/\.\_\-\:\;][012lrLRfrFR53]\'*$", "", read_id)
                        last_read_id = read_id
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if last_read_id != None:
                        if last_read_id in paired_lib_i:
                            lib_i = paired_lib_i[last_read_id]
                            total_paired_reads_by_set[lib_i] += 1
                            paired_output_reads_file_handles[lib_i].writelines(
                                rec_buf)
                            paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL READS: " + str(total_paired_reads) + "\n"
            for lib_i in range(split_num):
                report += "SINGLE END READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            print("MAKING REPLICATE OBJECTS")  # DEBUG
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                        or os.path.getsize (output_fwd_paired_file_path) == 0:
                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) - len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file': output_fwd_paired_file_path,
                        'ref': input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type': input_item['type'],
                        'name': input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        else:
            raise ValueError("unknown ReadLibrary type:" +
                             str(input_item['type']) + " for readslibrary: " +
                             input_item['name'])

        return replicate_files

    def _fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
        '''
        counts the number of non-header, non-whitespace characters in a FASTA file
        '''
        seq_len = 0
        # open(..., 'r'): a 0 buffering arg is invalid for text mode on py3
        with open(fasta_path, 'r') as fasta_handle:
            for line in fasta_handle:
                line = line.strip()
                if line.startswith('>'):
                    continue
                line = line.replace(' ', '')
                seq_len += len(line)
                if seq_len >= min_fasta_len:
                    return True
        return False
def fastqutils_stats(self, ctx, params): """ :param params: instance of type "FastqUtilsStatsParams" -> structure: parameter "workspace_name" of type "workspace_name" (A string representing a workspace name.), parameter "read_library_ref" of type "read_library_ref" (A string representing a ContigSet id.) :returns: instance of type "FastqUtilsStatsResult" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN fastqutils_stats print('Running fastqutils_stats with params=') print(pformat(params)) if 'workspace_name' not in params: raise ValueError('workspace_name parameter is required') if 'read_library_ref' not in params: raise ValueError('read_library_ref parameter is required') # Get the read library as deinterleaved fastq files input_ref = params['read_library_ref'] reads_params = {'read_libraries': [input_ref], 'interleaved': 'false', 'gzipped': None } ru = ReadsUtils(self.callbackURL, token=ctx['token']) reads = ru.download_reads(reads_params)['files'] files = [reads[input_ref]['files']['fwd']] if reads[input_ref]['files']['rev']: files.append(reads[input_ref]['files']['rev']) print('running on files:') for f in files: print(f) # construct the command stats_cmd = [self.FASTQUTILS, 'stats'] report = '' for f in files: cmd = stats_cmd cmd.append(f) report += '============== ' + f + ' ==============\n' print('running: ' + ' '.join(cmd)) p = subprocess.Popen(cmd, cwd=self.scratch, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=False) while True: line = p.stdout.readline() if not line: break report += line print(line.replace('\n', '')) p.stdout.close() p.wait() report += "\n\n" print('return code: ' + str(p.returncode)) if p.returncode != 0: raise ValueError('Error running ' + self.FASTQUTILS + ', return code: ' + str(p.returncode)) reportObj = { 'objects_created': [], 'text_message': report } report = KBaseReport(self.callbackURL) report_info = 
report.create({'report': reportObj, 'workspace_name': params['workspace_name']}) returnVal = {'report_name': report_info['name'], 'report_ref': report_info['ref']} #END fastqutils_stats # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method fastqutils_stats return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
    def run_SPAdes(self, ctx, params):
        """
        Run SPAdes on paired end libraries
        :param params: instance of type "SPAdesParams" (Input parameters for
           running SPAdes. workspace_name - the name of the workspace from
           which to take input and store output. output_contigset_name - the
           name of the output contigset list<paired_end_lib> read_libraries -
           Illumina PairedEndLibrary files to assemble. dna_source -
           (optional) the source of the DNA used for sequencing
           'single_cell': DNA amplified from a single cell via MDA anything
           else: Standard DNA sample from multiple cells. Default value is
           None. min_contig_length - (optional) integer to filter out contigs
           with length < min_contig_length from the SPAdes output. Default
           value is 0 implying no filter.) -> structure: parameter
           "workspace_name" of String, parameter "output_contigset_name" of
           String, parameter "read_libraries" of list of type
           "paired_end_lib" (The workspace object name of a PairedEndLibrary
           file, whether of the KBaseAssembly or KBaseFile type.), parameter
           "dna_source" of String, parameter "min_contig_length" of Long
        :returns: instance of type "SPAdesOutput" (Output parameters for
           SPAdes run. report_name - the name of the KBaseReport.Report
           workspace object. report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_SPAdes
        # A whole lot of this is adapted or outright copied from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_SPAdes with params:\n' + pformat(params))
        token = ctx['token']

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)

        # get absolute refs from ws: names without a '/' are qualified with
        # the target workspace name
        wsname = params[self.PARAM_IN_WS]
        obj_ids = []
        for r in params[self.PARAM_IN_LIB]:
            obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        ws = workspaceService(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})
        reads_params = []
        # map each ref to "workspace/object_name" for error messages
        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({'read_libraries': reads_params,
                                            'interleaved': 'false',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            # rewrite the generic type error from ReadsUtils into a
            # user-facing message; re-raise anything else untouched
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log('Got reads data from converter:\n' + pformat(reads))

        phred_type = self.check_reads(params, reads, reftoname)

        # normalize each downloaded library into the dict shape exec_spades
        # expects ('fwd_file' / optional 'rev_file' / 'type' / 'seq_tech')
        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            # print ("REF:" + str(ref))
            # print ("READS REF:" + str(reads[ref]))
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({'fwd_file': f['fwd'],
                                   'type': 'paired',
                                   'seq_tech': seq_tech})
            elif f['type'] == 'paired':
                reads_data.append({'fwd_file': f['fwd'],
                                   'rev_file': f['rev'],
                                   'type': 'paired',
                                   'seq_tech': seq_tech})
            elif f['type'] == 'single':
                reads_data.append({'fwd_file': f['fwd'],
                                   'type': 'single',
                                   'seq_tech': seq_tech})
            else:
                raise ValueError('Something is very wrong with read lib' +
                                 reads_name)

        spades_out = self.exec_spades(params[self.PARAM_IN_DNA_SOURCE],
                                      reads_data,
                                      phred_type)
        self.log('SPAdes output dir: ' + spades_out)

        # parse the output and save back to KBase
        output_contigs = os.path.join(spades_out, 'scaffolds.fasta')

        self.log('Uploading FASTA file to Assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver='release')
        # with a positive min_contig_length AssemblyUtil filters the contigs
        # and writes '<output_contigs>.filtered.fa', which the report reads
        if params.get('min_contig_length', 0) > 0:
            assemblyUtil.save_assembly_from_fasta(
                {'file': {'path': output_contigs},
                 'workspace_name': wsname,
                 'assembly_name': params[self.PARAM_IN_CS_NAME],
                 'min_contig_length': params['min_contig_length']})
            # load report from scaffolds.fasta.filtered.fa
            report_name, report_ref = self.load_report(
                output_contigs + '.filtered.fa', params, wsname)
        else:
            assemblyUtil.save_assembly_from_fasta(
                {'file': {'path': output_contigs},
                 'workspace_name': wsname,
                 'assembly_name': params[self.PARAM_IN_CS_NAME]})
            # load report from scaffolds.fasta
            report_name, report_ref = self.load_report(
                output_contigs, params, wsname)

        output = {'report_name': report_name,
                  'report_ref': report_ref}
        #END run_SPAdes

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_SPAdes return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
def execReadLibraryPRINSEQ(self, ctx, input_params): """ :param input_params: instance of type "inputPRINSEQ" (execPRINSEQ and execReadLibraryPRINSEQ input input_reads_ref : may be KBaseFile.PairedEndLibrary or KBaseFile.SingleEndLibrary output_ws : workspace to write to output_reads_name : obj_name to create lc_method : Low complexity method - value must be "dust" or "entropy" lc_entropy_threshold : Low complexity threshold - Value must be an integer between 0 and 100. Note a higher lc_entropy_threshold in entropy is more stringent. lc_dust_threshold : Low complexity threshold - Value must be an integer between 0 and 100. Note a lower lc_entropy_threshold is less stringent with dust) -> structure: parameter "input_reads_ref" of type "data_obj_ref", parameter "output_ws" of type "workspace_name" (Common Types), parameter "output_reads_name" of type "data_obj_name", parameter "lc_method" of String, parameter "lc_entropy_threshold" of Long, parameter "lc_dust_threshold" of Long :returns: instance of type "outputReadLibraryExecPRINSEQ" -> structure: parameter "output_filtered_ref" of type "data_obj_ref", parameter "output_unpaired_fwd_ref" of type "data_obj_ref", parameter "output_unpaired_rev_ref" of type "data_obj_ref", parameter "report" of String """ # ctx is the context object # return variables are: output #BEGIN execReadLibraryPRINSEQ console = [] # self.log(console, 'Running execTrimmomatic with parameters: ') # self.log(console, "\n"+pformat(input_params)) report = '' returnVal = dict() # retVal['output_filtered_ref'] = None # retVal['output_unpaired_fwd_ref'] = None # retVal['output_unpaired_rev_ref'] = None token = ctx['token'] wsClient = workspaceService(self.ws_url, token=token) env = os.environ.copy() env['KB_AUTH_TOKEN'] = token # param checks required_params = ['input_reads_ref', 'output_ws', 'lc_method'] # output reads_name is optional. 
If not set will use old_objects name for required_param in required_params: if required_param not in input_params or input_params[ required_param] is None: raise ValueError("Must define required param: '" + required_param + "'") if (input_params['lc_method'] != 'dust') and (input_params['lc_method'] != 'entropy'): raise ValueError( "lc_method (low complexity method) must be 'dust' or 'entropy', " + "it is currently set to : " + input_params['lc_method']) if not ('lc_entropy_threshold' in input_params or 'lc_dust_threshold' in input_params): raise ValueError( ("A low complexity threshold needs to be " + "entered for {}".format(input_params['lc_method']))) elif input_params['lc_method'] == 'dust': if 'lc_dust_threshold' not in input_params: raise ValueError( ("A low complexity threshold needs to be " + "entered for {}".format(input_params['lc_method']))) else: lc_threshold = input_params['lc_dust_threshold'] else: if 'lc_entropy_threshold' not in input_params: raise ValueError( ("A low complexity threshold needs to be " + "entered for {}".format(input_params['lc_method']))) else: lc_threshold = input_params['lc_entropy_threshold'] if (lc_threshold < 0.0) or (lc_threshold > 100.0): raise ValueError(( "The threshold for {} must be between 0 and 100, it is currently " + "set to : {}").format(input_params['lc_method'], lc_threshold)) reportObj = {'objects_created': [], 'text_message': ''} # load provenance provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] # add additional info to provenance here, in this case the input data object reference provenance[0]['input_ws_objects'] = [ str(input_params['input_reads_ref']) ] # GET THE READS OBJECT # Determine whether read library or read set is input object # try: # object_info tuple [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) input_reads_obj_info = wsClient.get_object_info_new( {'objects': [{ 'ref': 
input_params['input_reads_ref'] }]})[0] input_reads_obj_type = input_reads_obj_info[TYPE_I] # input_reads_obj_version = input_reads_obj_info[VERSION_I] # this is object version, not type version except Exception as e: raise ValueError( 'Unable to get read library object from workspace: (' + str(input_params['input_reads_ref']) + ')' + str(e)) # self.log (console, "B4 TYPE: '" + # str(input_reads_obj_type) + # "' VERSION: '" + str(input_reads_obj_version)+"'") # remove trailing version input_reads_obj_type = re.sub('-[0-9]+\.[0-9]+$', "", input_reads_obj_type) # self.log (console, "AF TYPE: '"+str(input_reads_obj_type)+"' VERSION: '" + # str(input_reads_obj_version)+"'") # maybe add below later "KBaseSets.ReadsSet", acceptable_types = [ "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary", "KBaseFile.SingleEndLibrary" ] if input_reads_obj_type not in acceptable_types: raise ValueError("Input reads of type: '" + input_reads_obj_type + "'. Must be one of " + ", ".join(acceptable_types)) if input_reads_obj_type in [ "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary" ]: read_type = 'PE' elif input_reads_obj_type in [ "KBaseFile.SingleEndLibrary", "KBaseAssembly.SingleEndLibrary" ]: read_type = 'SE' # Instatiate ReadsUtils try: readsUtils_Client = ReadsUtils(url=self.callback_url, token=ctx['token']) # SDK local self._log(None, 'Starting Read File(s) Download') readsLibrary = readsUtils_Client.download_reads({ 'read_libraries': [input_params['input_reads_ref']], 'interleaved': 'false' }) self._log(None, 'Completed Read File(s) Downloading') except Exception as e: raise ValueError( ('Unable to get read library object from workspace: ({})\n' ).format(str(input_params['input_reads_ref']), str(e))) # get WS metadata to get obj_name ws = workspaceService(self.ws_url) try: info = ws.get_object_info_new( {'objects': [{ 'ref': input_params['input_reads_ref'] }]})[0] except workspaceService as wse: self._log(console, 
'Logging workspace exception') self._log(str(wse)) raise #determine new object base name new_object_name = info[1] if ('output_reads_name' in input_params and input_params['output_reads_name'] != '' and input_params['output_reads_name'] is not None): new_object_name = input_params['output_reads_name'] # MAKE A DIRECTORY TO PUT THE READ FILE(S) # create the output directory and move the file there # PUT FILES INTO THE DIRECTORY # Sanitize the file names tempdir = tempfile.mkdtemp(dir=self.scratch) export_dir = os.path.join(tempdir, info[1]) os.makedirs(export_dir) if read_type == 'PE': # IF PAIRED END, potentially 6 files created # one of each for the two directions(good(paired), good_singletons, bad) # Take the good paired and (re)upload new reads object. # We throwout the bad reads input_files_info = self._setup_pe_files(readsLibrary, export_dir, input_params) # RUN PRINSEQ with user options (lc_method and lc_threshold) cmd = ( "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} " "-fastq2 {} -out_format 3 -lc_method {} " "-lc_threshold {}").format( input_files_info["fastq_file_path"], input_files_info["fastq2_file_path"], input_params['lc_method'], lc_threshold) print "Command to be run : " + cmd args = shlex.split(cmd) perl_script = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = perl_script.communicate() found_results = False file_names_dict = dict() for element in output: if "Input and filter stats:" in element: found_results = True element_parts = element.split("Input and filter stats:") # PRINSEQ OUTPUT report = "Input and filter stats:{}".format( element_parts[1]) reportObj['text_message'] = report read_files_list = os.listdir(export_dir) # proc = subprocess.Popen(['ls', '-l', export_dir], stdout=subprocess.PIPE) # proc_output = proc.stdout.read() # print "PROC OUTPUT : " + proc_output for read_filename in read_files_list: file_direction = None print "Read File : {}".format(read_filename) # determine if 
forward(fastq) or reverse(fastq2) file if input_files_info["fastq_filename"] in read_filename: file_direction = "fwd" elif input_files_info[ "fastq2_filename"] in read_filename: file_direction = "rev" if file_direction is not None: # determine good singleton or good part of a pair. print "TEST: {}_prinseq_good_".format( input_files_info["fastq_filename"]) if ("{}_prinseq_good_singletons".format( input_files_info["fastq_filename"]) in read_filename or "{}_prinseq_good_singletons".format( input_files_info["fastq2_filename"]) in read_filename): # Unpaired singletons that need to be # saved as a new single end reads object file_names_dict["{}_good_singletons".format(file_direction)] = \ os.path.join(export_dir, read_filename) elif ("{}_prinseq_good_".format( input_files_info["fastq_filename"]) in read_filename or "{}_prinseq_good_".format( input_files_info["fastq2_filename"]) in read_filename): file_names_dict["{}_good_pair".format(file_direction)] = \ os.path.join(export_dir, read_filename) if (('fwd_good_pair' in file_names_dict) and ('rev_good_pair' in file_names_dict)): self._log(None, 'Saving new Paired End Reads') returnVal['filtered_paired_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': new_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': file_names_dict['fwd_good_pair'], 'rev_file': file_names_dict['rev_good_pair'] } )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['filtered_paired_end_ref'], 'description': 'Filtered Paired End Reads', 'object_name': new_object_name }) print "REFERENCE : " + str( returnVal['filtered_paired_end_ref']) else: reportObj['text_message'] += \ "\n\nNo good matching pairs passed low complexity filtering.\n" + \ "Consider loosening the threshold value.\n" if 'fwd_good_singletons' in file_names_dict: self._log(None, 'Saving new Forward Unpaired Reads') fwd_object_name = "{}_fwd_singletons".format( new_object_name) 
returnVal['output_filtered_fwd_unpaired_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': fwd_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': file_names_dict['fwd_good_singletons']} )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['output_filtered_fwd_unpaired_end_ref'], 'description': 'Filtered Forward Unpaired End Reads', 'object_name': fwd_object_name }) print "REFERENCE : " + \ str(returnVal['output_filtered_fwd_unpaired_end_ref']) if 'rev_good_singletons' in file_names_dict: self._log(None, 'Saving new Reverse Unpaired Reads') rev_object_name = "{}_rev_singletons".format( new_object_name) returnVal['output_filtered_rev_unpaired_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': rev_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': file_names_dict['rev_good_singletons']} )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['output_filtered_rev_unpaired_end_ref'], 'description': 'Filtered Reverse Unpaired End Reads', 'object_name': rev_object_name }) print "REFERENCE : " + \ str(returnVal['output_filtered_rev_unpaired_end_ref']) if len(reportObj['objects_created']) > 0: reportObj['text_message'] += "\nOBJECTS CREATED :\n" for obj in reportObj['objects_created']: reportObj['text_message'] += "{} : {}\n".format( obj['object_name'], obj['description']) else: reportObj['text_message'] += \ "\nFiltering filtered out all reads. 
No objects made.\n" if not found_results: raise Exception('Unable to execute PRINSEQ, Error: {}'.format( str(output))) print "FILES DICT : {}".format(str(file_names_dict)) print "REPORT OBJECT :" print str(reportObj) elif read_type == 'SE': # Download reads Libs to FASTQ files # IF SINGLE END INPUT 2 files created (good and bad) # Take good and (re)upload new reads object input_fwd_file_path = \ readsLibrary['files'][input_params['input_reads_ref']]['files']['fwd'] fastq_filename = self._sanitize_file_name( os.path.basename(input_fwd_file_path)) fastq_file_path = os.path.join(export_dir, fastq_filename) shutil.move(input_fwd_file_path, fastq_file_path) # RUN PRINSEQ with user options (lc_method and lc_threshold) cmd = ( "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} " "-out_format 3 -lc_method {} " "-lc_threshold {}").format(fastq_file_path, input_params['lc_method'], lc_threshold) print "Command to be run : " + cmd args = shlex.split(cmd) print "ARGS: " + str(args) perl_script = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = perl_script.communicate() print "OUTPUT: " + str(output) found_results = False found_se_filtered_file = False file_names_dict = dict() for element in output: if "Input and filter stats:" in element: found_results = True element_parts = element.split("Input and filter stats:") # PRINSEQ OUTPUT report = "Input and filter stats:{}".format( element_parts[1]) reportObj['text_message'] = report read_files_list = os.listdir(export_dir) for read_filename in read_files_list: print "Early Read File : {}".format(read_filename) for read_filename in read_files_list: print "Read File : {}".format(read_filename) if ("{}_prinseq_good_".format(fastq_filename) in read_filename): #Found Good file. 
Save the Reads objects self._log(None, 'Saving Filtered Single End Reads') returnVal['output_filtered_single_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': new_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': os.path.join(export_dir, read_filename)} )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['output_filtered_single_end_ref'], 'description': 'Filtered Single End Reads' }) print "REFERENCE : " + str( returnVal['output_filtered_single_end_ref']) found_se_filtered_file = True break if not found_se_filtered_file: reportObj['text_message'] += \ "\n\nNone of the reads passed low complexity filtering.\n" + \ "Consider loosening the threshold value.\n" if not found_results: raise Exception('Unable to execute PRINSEQ, Error: {}'.format( str(output))) print "FILES DICT : {}".format(str(file_names_dict)) print "REPORT OBJECT :" print str(reportObj) # save report object # report = KBaseReport(self.callback_url, token=ctx['token']) #report = KBaseReport(self.callback_url, token=ctx['token'], service_ver=SERVICE_VER) report_info = report.create({ 'report': reportObj, 'workspace_name': input_params['output_ws'] }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END execReadLibraryPRINSEQ # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method execReadLibraryPRINSEQ return value ' + 'output is not type dict as required.') # return the results return [output]
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta '--min-count 2 --k-list 21,41,61,81,99'
           (generic metagenomes, default) meta-sensitive '--min-count 2
           --k-list 21,31,41,51,61,71,81,91,99' (more sensitive but slower)
           meta-large '--min-count 2 --k-list 27,37,47,57,67,77,87' (large &
           complex metagenomes, like soil) bulk '--min-count 3 --k-list
           31,51,71,91,99 --no-mercy' (experimental, standard bulk sequencing
           with >= 30x depth) single-cell '--min-count 3 --k-list
           21,33,55,77,99,121 --merge_level 20,0.96' (experimental, single
           cell data) min_count - minimum multiplicity for filtering
           (k_min+1)-mers, default 2 min_k - minimum kmer size (<= 127), must
           be odd number, default 21 max_k - maximum kmer size (<= 127), must
           be odd number, default 99 k_step - increment of kmer size of each
           iteration (<= 28), must be even number, default 10 k_list - list
           of kmer size (all must be odd, in the range 15-127, increment <=
           28); override `--k-min', `--k-max' and `--k-step'
           min_contig_length - minimum length of contigs to output, default
           is 2000 @optional megahit_parameter_preset @optional min_count
           @optional k_min @optional k_max @optional k_step @optional k_list
           @optional min_contig_length) -> structure: parameter
           "workspace_name" of String, parameter "read_library_ref" of
           String, parameter "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if 'megahit_parameter_preset' in params:
            if params['megahit_parameter_preset']:
                megahit_cmd.append('--presets')
                megahit_cmd.append(params['megahit_parameter_preset'])

        # individual optional tuning parameters; falsy values are skipped
        if 'min_count' in params:
            if params['min_count']:
                megahit_cmd.append('--min-count')
                megahit_cmd.append(str(params['min_count']))
        if 'k_min' in params:
            if params['k_min']:
                megahit_cmd.append('--k-min')
                megahit_cmd.append(str(params['k_min']))
        if 'k_max' in params:
            if params['k_max']:
                megahit_cmd.append('--k-max')
                megahit_cmd.append(str(params['k_max']))
        if 'k_step' in params:
            if params['k_step']:
                megahit_cmd.append('--k-step')
                megahit_cmd.append(str(params['k_step']))
        if 'k_list' in params:
            if params['k_list']:
                k_list = []
                for k_val in params['k_list']:
                    k_list.append(str(k_val))
                megahit_cmd.append('--k-list')
                megahit_cmd.append(','.join(k_list))

        # min_contig_length must parse as a non-negative integer
        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if 'min_contig_length' in params:
            if params['min_contig_length']:
                if str(params['min_contig_length']).isdigit():
                    min_contig_length = params['min_contig_length']
                else:
                    raise ValueError(
                        'min_contig_length parameter must be a non-negative integer'
                    )

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # set the number of cpus
        megahit_cmd.append('--num-cpu-threads')
        megahit_cmd.append(str(multiprocessing.cpu_count() - 1))

        # set the output location (unique per-run dir via ms timestamp)
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print(' ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs,
                        os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch,
                                          'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': output_contigs
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })

        # STEP 5: generate and save the report
        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params[
            'workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        # NOTE(review): raises ZeroDivisionError if MEGAHIT produced no
        # contigs (lengths empty); np.histogram below would also fail.
        report += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += ' ' + str(counts[c]) + '\t--\t' + str(
                edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({
                'files': [{
                    'path': output_contigs,
                    'label': params['output_contigset_name']
                }]
            })
        except QUASTError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report({
                'message':
                report,
                'objects_created': [{
                    'ref': output_data_ref,
                    'description': 'Assembled contigs'
                }],
                'direct_html_link_index':
                0,
                'html_links': [{
                    'shock_id': quastret['shock_id'],
                    'name': 'report.html',
                    'label': 'QUAST report'
                }],
                'report_object_name':
                'kb_megahit_report_' + str(uuid.uuid4()),
                'workspace_name':
                params['workspace_name']
            })
        except _RepError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: contruct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
def runFastQC(self, ctx, input_params): """ :param input_params: instance of type "FastQCParams" -> structure: parameter "input_ws" of String, parameter "input_file" of String, parameter "input_file_ref" of String :returns: instance of type "FastQCOutput" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: reported_output #BEGIN runFastQC token = ctx['token'] wsClient = workspaceService(self.workspaceURL, token=token) uuid_string = str(uuid.uuid4()) read_file_path = self.scratch+"/"+uuid_string os.mkdir(read_file_path) input_file_ref = self._get_input_file_ref_from_params(input_params) library=None try: library = wsClient.get_objects2({'objects': [{'ref': input_file_ref}]})['data'][0] except Exception as e: raise ValueError('Unable to get read library object from workspace: (' + input_file_ref + ')' + str(e)) download_read_params = {'read_libraries': [], 'interleaved':"false"} if("SingleEnd" in library['info'][2] or "PairedEnd" in library['info'][2]): download_read_params['read_libraries'].append(library['info'][7]+"/"+library['info'][1]) elif("SampleSet" in library['info'][2]): for sample_id in library['data']['sample_ids']: if("/" in sample_id): download_read_params['read_libraries'].append(sample_id) else: if(sample_id.isdigit()): download_read_params['read_libraries'].append(library['info'][6]+"/"+sample_id) else: download_read_params['read_libraries'].append(library['info'][7]+"/"+sample_id) ru = ReadsUtils(os.environ['SDK_CALLBACK_URL']) ret = ru.download_reads(download_read_params) read_file_list=list() for file in ret['files']: files = ret['files'][file]['files'] fwd_name=files['fwd'].split('/')[-1] fwd_name=fwd_name.replace('.gz','') shutil.move(files['fwd'],os.path.join(read_file_path, fwd_name)) read_file_list.append(os.path.join(read_file_path, fwd_name)) if(files['rev'] is not None): rev_name=files['rev'].split('/')[-1] rev_name=rev_name.replace('.gz','') 
shutil.move(files['rev'],os.path.join(read_file_path, rev_name)) read_file_list.append(os.path.join(read_file_path, rev_name)) subprocess.check_output(["fastqc"]+read_file_list) # report = "Command run: "+" ".join(["fastqc"]+read_file_list) output = self.create_report(token, input_params['input_ws'], uuid_string, read_file_path) reported_output = {'report_name': output['name'], 'report_ref': output['ref']} # Remove temp reads directory shutil.rmtree(read_file_path, ignore_errors=True) #END runFastQC # At some point might do deeper type checking... if not isinstance(reported_output, dict): raise ValueError('Method runFastQC return value ' + 'reported_output is not type dict as required.') # return the results return [reported_output]
def run_MiniASM(self, ctx, params):
    """
    Run MiniASM on paired end libraries
    :param params: instance of type "MiniASM_Params" -> structure:
       parameter "workspace_name" of String, parameter "read_libraries" of
       list of type "paired_end_lib" (The workspace object name of a
       PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile
       type.), parameter "output_contigset_name" of String, parameter
       "min_contig" of Long, parameter "opt_args" of type "opt_args_type"
       (Input parameters for running MiniASM. string workspace_name - the
       name of the workspace from which to take input and store output.
       list<paired_end_lib> read_libraries - Illumina PairedEndLibrary
       files to assemble. string output_contigset_name - the name of the
       output contigset) -> structure: parameter "min_span" of Long,
       parameter "min_coverage" of Long, parameter "min_overlap" of Long,
       parameter "extra_params" of list of String
    :returns: instance of type "MiniASM_Output" (Output parameters for
       MiniASM run. string report_name - the name of the
       KBaseReport.Report workspace object. string report_ref - the
       workspace reference of the report.) -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_MiniASM
    print("=================== IN run_MiniASM")
    # A whole lot of this is adapted or outright copied from
    # https://github.com/msneddon/MEGAHIT
    self.log('Running run_MiniASM with params:\n' + pformat(params))
    token = ctx['token']

    # the reads should really be specified as a list of absolute ws refs
    # but the narrative doesn't do that yet
    self.process_params(params)

    # get absolute refs from ws: any bare object name is qualified with
    # the target workspace name.
    wsname = params[self.PARAM_IN_WS]
    print("Workspace name: " + wsname)
    obj_ids = []
    for r in params[self.PARAM_IN_LIB]:
        obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
    ws = workspaceService(self.workspaceURL, token=token)
    ws_info = ws.get_object_info_new({'objects': obj_ids})
    reads_params = []
    reftoname = {}
    # Map each ref to a human-readable "workspace/object" name for error
    # messages. wsi[1]/wsi[7] are presumably object name and workspace
    # name in the object-info tuple — verify against the workspace API.
    for wsi, oid in zip(ws_info, obj_ids):
        ref = oid['ref']
        reads_params.append(ref)
        obj_name = wsi[1]
        reftoname[ref] = wsi[7] + '/' + obj_name

    readcli = ReadsUtils(self.callbackURL)

    # Marker string used to recognize "unsupported type" server errors so
    # they can be rethrown as a friendlier ValueError.
    typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
               'KBaseFile.PairedEndLibrary ' +
               'KBaseAssembly.SingleEndLibrary ' +
               'KBaseAssembly.PairedEndLibrary')
    try:
        reads = readcli.download_reads({'read_libraries': reads_params,
                                        'interleaved': 'false',
                                        'gzipped': None
                                        })['files']
    except ServerError as se:
        self.log('logging stacktrace from dynamic client error')
        self.log(se.data)
        if typeerr in se.message:
            prefix = se.message.split('.')[0]
            # NOTE(review): this message claims only PairedEndLibrary is
            # supported, yet the loop below also handles 'single' reads —
            # message and behavior disagree; confirm intent before changing.
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')
        else:
            raise

    self.log('Got reads data from converter:\n' + pformat(reads))

    # Translate the ReadsUtils file records into the structure
    # exec_MiniASM expects; interleaved files are treated as paired with
    # only a fwd file.
    reads_data = []
    for ref in reads:
        reads_name = reftoname[ref]
        f = reads[ref]['files']
        print ("REF:" + str(ref))
        print ("READS REF:" + str(reads[ref]))
        seq_tech = reads[ref]["sequencing_tech"]
        if f['type'] == 'interleaved':
            reads_data.append({'fwd_file': f['fwd'], 'type':'paired',
                               'seq_tech': seq_tech})
        elif f['type'] == 'paired':
            reads_data.append({'fwd_file': f['fwd'], 'rev_file': f['rev'],
                               'type':'paired', 'seq_tech': seq_tech})
        elif f['type'] == 'single':
            reads_data.append({'fwd_file': f['fwd'], 'type':'single',
                               'seq_tech': seq_tech})
        else:
            raise ValueError('Something is very wrong with read lib' + reads_name)
    print("READS_DATA: ")
    pprint(reads_data)
    print("============================ END OF READS_DATA: ")

    # set the output location: millisecond epoch timestamp keeps repeated
    # runs in separate scratch directories.
    timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    outdir = os.path.join(self.scratch, 'MiniASM_dir-' + str(timestamp))
    miniasm_outfile = self.exec_MiniASM(reads_data, params, outdir)
    self.log('MiniASM output dir: ' + miniasm_outfile)

    # parse the output and save back to KBase
    output_contigs = miniasm_outfile

    # Optional minimum-contig-length filter; 0 means no filtering.
    min_contig_len = 0
    if self.PARAM_IN_MIN_CONTIG in params and params[self.PARAM_IN_MIN_CONTIG] is not None:
        if (int(params[self.PARAM_IN_MIN_CONTIG])) > 0:
            min_contig_len = int(params[self.PARAM_IN_MIN_CONTIG])

    self.log('Uploading FASTA file to Assembly')
    assemblyUtil = AssemblyUtil(self.callbackURL)
    assemblyUtil.save_assembly_from_fasta(
        {'file': {'path': output_contigs},
         'workspace_name': wsname,
         'assembly_name': params[self.PARAM_IN_CS_NAME],
         'min_contig_length': min_contig_len
         })

    # Build the KBaseReport object summarizing the assembly.
    report_name, report_ref = self.load_report(output_contigs, params, wsname)
    output = {'report_name': report_name,
              'report_ref': report_ref
              }
    #END run_MiniASM

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_MiniASM return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]