def _get_ws_info(self, obj_ref):
    """
    Look up the workspace object-info record for a single object reference.

    :param obj_ref: workspace reference of the object to look up
    :returns: the first element of the list returned by
        ``Workspace.get_object_info_new`` for that reference
    :raises WorkspaceError: re-raised unchanged after being logged
    """
    workspace_client = Workspace(self.ws_url)
    lookup = {'objects': [{'ref': obj_ref}]}
    try:
        return workspace_client.get_object_info_new(lookup)[0]
    except WorkspaceError as ws_err:
        # Log the failure for operators, then let the caller deal with it.
        self.__LOGGER.error('Logging workspace exception')
        self.__LOGGER.error(str(ws_err))
        raise
def export_genome_features_protein_to_fasta(self, ctx, params): """ :param params: instance of type "ExportParams" (input and output structure functions for standard downloaders) -> structure: parameter "input_ref" of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_genome_features_protein_to_fasta print('export_genome_features_protein_to_fasta -- paramaters = ') # validate parameters if 'input_ref' not in params: raise ValueError( 'Cannot run export_genome_features_protein_to_fasta - no "input_ref" field defined.' ) # get WS metadata to get ws_name and obj_name ws = Workspace(url=self.cfg.workspaceURL) info = ws.get_object_info_new({ 'objects': [{ 'ref': params['input_ref'] }], 'includeMetadata': 0, 'ignoreErrors': 0 })[0] genome_to_protein_fasta_params = {'genome_ref': params['input_ref']} # export to file (building from KBase Genome Object) result = self.genome_to_genbank( ctx, genome_to_protein_fasta_params)[0]['genbank_file'] #END export_genome_features_protein_to_fasta # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError( 'Method export_genome_features_protein_to_fasta return value ' + 'output is not type dict as required.') # return the results return [output]
class SPAdesUtils:
    """
    Define the SPAdesUtils functions

    Helper class that validates run parameters, downloads KBase reads,
    builds the SPAdes YAML data set file, runs ``spades.py`` and saves /
    reports the resulting assembly.
    """
    SPADES_VERSION = '3.13.0'
    SPADES_BIN = '/opt/SPAdes-' + SPADES_VERSION + '-Linux/bin'
    DISABLE_SPADES_OUTPUT = False  # should be False in production

    # Basic options
    PARAM_IN_SINGLE_CELL = 'single_cell'  # --sc
    PARAM_IN_METAGENOME = 'metagenomic'  # --meta
    PARAM_IN_PLASMID = 'plasmid'  # --plasmid
    PARAM_IN_RNA = 'rna'  # --rna
    PARAM_IN_IONTORRENT = 'iontorrent'  # --iontorrent

    # Pipeline options
    PARAM_IN_ONLY_ERROR_CORR = 'only-error-correction'  # --only-error-correction
    PARAM_IN_ONLY_ASSEMBLER = 'only-assembler'  # --only-assembler
    PARAM_IN_CAREFUL = 'careful'  # --careful
    PARAM_IN_CONTINUE = 'continue'  # --continue
    PARAM_IN_DISABLE_GZIP = 'disable-gzip-output'  # --disable-gzip-output

    # Input parameters
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_CS_NAME = 'output_contigset_name'
    PARAM_IN_READS = 'reads_libraries'
    PARAM_IN_LONG_READS = 'long_reads_libraries'
    PARAM_IN_KMER_SIZES = 'kmer_sizes'
    PARAM_IN_SKIP_ERR_CORRECT = 'skip_error_correction'
    PARAM_IN_MIN_CONTIG_LENGTH = 'min_contig_length'
    PARAM_IN_DNA_SOURCE = 'dna_source'
    PARAM_IN_PIPELINE_OPTION = 'pipeline_options'

    ASSEMBLE_RESULTS_DIR = 'assemble_results'
    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    THREADS_PER_CORE = 3
    MAX_THREADS = 64  # per email thread with Anton Korobeynikov
    MAX_THREADS_META = 128  # Increase threads for metagenomic assemblies
    MEMORY_OFFSET_GB = 1  # 1GB
    MIN_MEMORY_GB = 5
    MAX_MEMORY_GB_SPADES = 500
    MAX_MEMORY_GB_META_SPADES = 1000
    GB = 1000000000

    # dna_source value -> spades.py flag
    _DNA_SOURCE_FLAGS = {
        PARAM_IN_SINGLE_CELL: '--sc',
        PARAM_IN_METAGENOME: '--meta',
        PARAM_IN_PLASMID: '--plasmid',
        PARAM_IN_RNA: '--rna',
        PARAM_IN_IONTORRENT: '--iontorrent',
    }

    # pipeline option value -> spades.py flag
    _PIPELINE_FLAGS = {
        PARAM_IN_CAREFUL: '--careful',
        PARAM_IN_ONLY_ERROR_CORR: '--only-error-correction',
        PARAM_IN_ONLY_ASSEMBLER: '--only-assembler',
        PARAM_IN_CONTINUE: '--continue',
        PARAM_IN_DISABLE_GZIP: '--disable-gzip-output',
    }

    # pipeline option values that cannot be combined with --meta
    _META_INCOMPATIBLE_OPTS = (PARAM_IN_CAREFUL, 'mismatch-correction', 'cov-cutoff')

    # private method definition
    def __init__(self, prj_dir, config):
        """
        Set up service clients and working-directory settings.

        :param prj_dir: project directory used for the YAML file, temporary
            files and result archives
        :param config: dict holding 'workspace-url', 'SDK_CALLBACK_URL' and
            'KB_AUTH_TOKEN', and optionally 'shock-url' and
            'handle-service-url'
        """
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token, service_ver='release')
        self.au = AssemblyUtil(self.callback_url, token=self.token, service_ver='release')
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        # Fall back to the class constant when the environment variable is
        # not set instead of failing with a KeyError.
        self.spades_version = 'SPAdes-' + os.environ.get('SPADES_VERSION',
                                                         self.SPADES_VERSION)

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding
        reads info as non-interleaved fastq files and returns a list of reads data in the
        following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': the ref exactly as given in reads_refs,
                'reads_name': '<workspace name>/<object name>',
                'rev_file': path_to_fastq_file,  # only if paired end
        }

        Empty refs are skipped. A ref without a '/' is treated as an object
        name inside `wsname`.

        :raises ValueError: when the reads download reports an unsupported type
        """
        # (ref as supplied by the caller, workspace-qualified ref)
        resolved = []
        for r in reads_refs:
            if r:
                resolved.append((r, r if '/' in r else (wsname + '/' + r)))
        if not resolved:
            return []

        obj_ids = [{'ref': full_ref} for _, full_ref in resolved]
        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})

        reads_params = []
        reftoname = {}
        for wsi, (_, full_ref) in zip(ws_info, resolved):
            reads_params.append(full_ref)
            reftoname[full_ref] = wsi[7] + '/' + wsi[1]

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false'
            })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        reads_data = []
        # Look results up by the qualified ref that was actually requested
        # (the raw ref may be a bare object name), but report the caller's
        # own ref so callers can match entries back to their inputs.
        for given_ref, full_ref in resolved:
            f = reads[full_ref]['files']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': given_ref,
                'type': f['type'],
                'seq_tech': reads[full_ref]['sequencing_tech'],
                'reads_name': reftoname[full_ref]
            }
            if f.get('rev', None):
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report

        :param out_dir: directory holding the SPAdes results to archive
        :returns: list with one file-link dict (path, name, label, description)
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        _mkdir_p(output_directory)
        spades_output = os.path.join(output_directory, 'spades_output.zip')
        self._zip_folder(out_dir, spades_output)

        output_files.append({'path': spades_output,
                             'name': os.path.basename(spades_output),
                             'label': os.path.basename(spades_output),
                             'description': 'Output file(s) generated by {}'.format(
                                 self.spades_version)})

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included in the
        archive). Empty subfolders are not added to the archive.

        Every file is stored under its path relative to the parent of
        `folder_path`, so nested directories keep their full hierarchy and
        files with the same name in different subfolders do not collide.
        """
        folder_path = os.path.abspath(folder_path)
        archive_root = os.path.dirname(folder_path)
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, _folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    ziph.write(absolute_path,
                               os.path.relpath(absolute_path, archive_root))
        print("{} created successfully.".format(output_path))

    def _load_stats(self, input_file_name):
        """
        _load_stats: read a FASTA file and return a dict mapping each contig
        id (header text up to the first space) to its sequence length, with
        whitespace excluded.

        :raises Exception: if the path is not a file or holds no FASTA header
        """
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(input_file_name))

        fasta_dict = dict()
        contig_id = None
        sequence_len = 0
        first_header_found = False
        # Pattern for replacing white space
        pattern = re.compile(r'\s+')
        with open(input_file_name, 'r') as input_file_handle:
            for current_line in input_file_handle:
                if current_line.startswith('>'):
                    # found a header line: wrap up the previous sequence
                    if first_header_found:
                        fasta_dict[contig_id] = sequence_len
                    first_header_found = True
                    # Reset on every header so that text preceding the first
                    # header is not counted into the first contig.
                    sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    contig_id = fasta_header.split(' ', 1)[0]
                else:
                    sequence_len += len(pattern.sub('', current_line))

        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        fasta_dict[contig_id] = sequence_len

        return fasta_dict

    def _parse_single_reads(self, reads_type, reads_list):
        """
        _parse_single_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.

        Returns an empty dict when `reads_list` is empty or not a list.
        """
        if not reads_list or not isinstance(reads_list, list):
            return {}

        single_reads_fqs = [rds['fwd_file'] for rds in reads_list]
        return {
            "type": reads_type,
            "single reads": single_reads_fqs
        }

    def _parse_pair_reads(self, reads_type, reads_list):
        """
        _parse_pair_reads: given the reads_type and a list of reads, return an object
        defining the type, the orientation (taken from the first entry) and the lists
        of fastq files.

        Returns an empty dict when `reads_list` is empty or not a list.
        """
        if not reads_list or not isinstance(reads_list, list):
            return {}

        # NOTE(review): forward files are emitted as "right reads" and reverse
        # files as "left reads"; the SPAdes manual calls forward reads "left".
        # Kept as-is here -- confirm whether this mapping is intentional.
        right_reads_fqs = [rds['fwd_file'] for rds in reads_list]
        left_reads_fqs = [rds['rev_file'] for rds in reads_list
                          if rds.get('rev_file', None)]

        ret_obj = {
            "right reads": right_reads_fqs,
            "orientation": reads_list[0]['orientation'],
            "type": reads_type
        }
        if left_reads_fqs:
            ret_obj["left reads"] = left_reads_fqs

        return ret_obj
    # end of private methods

    # public method definitions
    def check_spades_params(self, params):
        """
        check_spades_params: checks params passed to run_HybridSPAdes method and set default
        values

        The `params` dict is updated in place and also returned:
          * kmer_sizes becomes a comma-separated string (default "21,33,55")
          * dna_source is set to None unless it is a supported value
          * 'basic_options' is set to the spades.py output / dna-source flags
          * pipeline_options defaults to ['careful']; options that cannot be
            used in metagenomic mode are dropped when dna_source is metagenomic
          * create_report defaults to 0

        :raises ValueError: when a mandatory parameter is missing or invalid
        """
        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_WS))
        if self.INVALID_WS_NAME_RE.search(params[self.PARAM_IN_WS]):
            raise ValueError('Invalid workspace name: {}.'.format(params[self.PARAM_IN_WS]))

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if params.get(self.PARAM_IN_READS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_READS))
        if not isinstance(params[self.PARAM_IN_READS], list):
            raise ValueError('Input reads {} must be a list.'.format(self.PARAM_IN_READS))
        if len(params[self.PARAM_IN_READS]) == 0:
            raise ValueError('Input parameter {} should have at least one reads.'.format(
                self.PARAM_IN_READS))

        if self.PARAM_IN_MIN_CONTIG_LENGTH in params:
            if not isinstance(params[self.PARAM_IN_MIN_CONTIG_LENGTH], int):
                raise ValueError('{} must be of type int.'.format(self.PARAM_IN_MIN_CONTIG_LENGTH))

        kmer_sizes = params.get(self.PARAM_IN_KMER_SIZES, None) or [21, 33, 55]
        if isinstance(kmer_sizes, str):
            # Already a comma-separated string (e.g. params validated twice);
            # joining it again would split it into single characters.
            kmer_sstr = kmer_sizes
        else:
            kmer_sstr = ",".join(str(num) for num in kmer_sizes)
        params[self.PARAM_IN_KMER_SIZES] = kmer_sstr
        print("KMER_SIZES: " + kmer_sstr)

        if params.get(self.PARAM_IN_SKIP_ERR_CORRECT, None):
            print("SKIP ERR CORRECTION: " + str(params[self.PARAM_IN_SKIP_ERR_CORRECT]))

        # check for basic option parameters
        dna_src = params.get(self.PARAM_IN_DNA_SOURCE, None)
        dna_flag = self._DNA_SOURCE_FLAGS.get(dna_src) if isinstance(dna_src, str) else None
        if dna_flag is None:
            params[self.PARAM_IN_DNA_SOURCE] = None

        # a list of basic options
        params['basic_options'] = ['-o', self.ASSEMBLE_RESULTS_DIR]
        if dna_flag is not None:
            params['basic_options'].append(dna_flag)

        # processing pipeline option parameters
        known_opts = tuple(self._PIPELINE_FLAGS)
        pipe_opts = params.get(self.PARAM_IN_PIPELINE_OPTION, None)
        if not pipe_opts or not any(elem in known_opts for elem in pipe_opts):
            pipe_opts = [self.PARAM_IN_CAREFUL]

        if '--meta' in params['basic_options']:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode!
            # Filter each option independently: chaining list.remove() calls
            # in one try block stops at the first option that is absent.
            pipe_opts = [opt for opt in pipe_opts
                         if opt not in self._META_INCOMPATIBLE_OPTS]
        params[self.PARAM_IN_PIPELINE_OPTION] = pipe_opts

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params

    def generate_report(self, fa_file_name, params, out_dir, wsname):
        """
        Generating and saving report

        :param fa_file_name: name of the assembled FASTA file inside `out_dir`
        :param params: validated run parameters (output name and workspace are read)
        :param out_dir: directory holding the SPAdes results
        :param wsname: workspace name used in the report text and assembly ref
        :returns: tuple (report name, report ref)
        """
        log('Generating and saving report')

        fa_file_with_path = os.path.join(out_dir, fa_file_name)
        fasta_stats = self._load_stats(fa_file_with_path)
        lengths = list(fasta_stats.values())

        assembly_ref = wsname + '/' + params[self.PARAM_IN_CS_NAME]

        report_lines = [
            'SPAdes results saved to: ' + wsname + '/' + out_dir + '\n',
            'Assembly saved to: ' + assembly_ref + '\n',
            'Assembled into ' + str(len(lengths)) + ' contigs.\n',
            'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n',
        ]

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_lines.append('Contig Length Distribution (# of contigs -- min to max ' +
                            'basepairs):\n')
        for c in range(bins):
            report_lines.append(' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' +
                                str(edges[c + 1]) + ' bp\n')
        report_text = ''.join(report_lines)

        print('Running QUAST')
        quastret = self.kbq.run_QUAST(
            {'files': [{'path': fa_file_with_path, 'label': params[self.PARAM_IN_CS_NAME]}]})

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report(
            {'message': report_text,
             'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'file_links': output_files,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}
                            ],
             'report_object_name': 'kb_spades_report_' + str(uuid.uuid4()),
             'workspace_name': params[self.PARAM_IN_WS]})

        return report_output['name'], report_output['ref']

    def get_hybrid_reads_info(self, input_params):
        """
        get_hybrid_reads_info--from a list of ReadsParams structures fetches the corresponding
        reads info with the ReadsParams[lib_ref]
        returns an empty tuple (when no reads libraries are given) or a tuple of nine reads
        data lists (single, paired-end, mate-pairs, pacbio-ccs, pacbio-clr, nanopore, sanger,
        trusted-contigs, untrusted-contigs); each is a list of the following structure:
        {
                'fwd_file': path_to_fastq_file,
                'orientation': (default value is "fr" (forward-reverse) for paired-end libraries
                                "rf" (reverse-forward) for mate-pair libraries), None for others
                'lib_type': ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio",
                              "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"),
                'type': reads_type,  # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file  # only if paired end
        }
        OR:
        {
                'fwd_file': path_to_fastq_file,
                'long_reads_type': ("pacbio-ccs", "pacbio-clr", "nanopore", "sanger",
                                    "trusted-contigs", "untrusted-contigs"),
                'type': reads_type,  # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience
        }
        """
        # Work on a copy so the caller's parameters are never modified.
        rds_params = copy.deepcopy(input_params)
        if rds_params.get(self.PARAM_IN_READS, None) is None:
            return ()  # an empty tuple

        wsname = rds_params[self.PARAM_IN_WS]

        sgl_rds_data = []  # single
        pe_rds_data = []   # paired-end
        mp_rds_data = []   # mate-pairs
        pb_ccs_data = []   # pacbio-ccs
        pb_clr_data = []   # pacbio-clr
        np_rds_data = []   # nanopore
        sgr_rds_data = []  # sanger
        tr_ctg_data = []   # trusted-contigs
        ut_ctg_data = []   # untrusted-contigs

        # lib_type -> (target list, default orientation, honour user orientation?)
        short_groups = {
            'single': (sgl_rds_data, None, False),
            'paired-end': (pe_rds_data, 'fr', True),
            'mate-pairs': (mp_rds_data, 'rf', True),
        }
        # long_reads_type -> target list
        long_groups = {
            'pacbio-ccs': pb_ccs_data,
            'pacbio-clr': pb_clr_data,
            'nanopore': np_rds_data,
            'sanger': sgr_rds_data,
            'trusted-contigs': tr_ctg_data,
            'untrusted-contigs': ut_ctg_data,
        }

        # a list of Illumina or IonTorrent paired-end/high-quality mate-pairs/unpaired reads
        rds_libs = rds_params[self.PARAM_IN_READS]
        rds_refs = [lib['lib_ref'] for lib in rds_libs if lib.get('lib_ref', None)]
        kb_rds_data = self._get_kbreads_info(wsname, rds_refs)

        for rds_lib in rds_libs:
            if 'lib_ref' not in rds_lib:
                continue
            for kb_d in kb_rds_data:
                if rds_lib['lib_ref'] != kb_d['reads_ref']:
                    continue
                lib_type = rds_lib['lib_type']
                if lib_type not in short_groups:
                    continue
                target, default_orient, use_given = short_groups[lib_type]
                orientation = default_orient
                if use_given and rds_lib.get('orientation', None) is not None:
                    orientation = rds_lib['orientation']
                kb_d['orientation'] = orientation
                kb_d['lib_type'] = lib_type
                target.append(kb_d)

        # a list of PacBio (CCS or CLR), Oxford Nanopore Sanger reads
        # and/or additional contigs
        if rds_params.get(self.PARAM_IN_LONG_READS, None):
            long_rds_libs = rds_params[self.PARAM_IN_LONG_READS]
            long_rds_refs = [lib['long_reads_ref'] for lib in long_rds_libs
                             if lib.get('long_reads_ref', None)]
            kb_lrds_data = self._get_kbreads_info(wsname, long_rds_refs)

            for lrds_lib in long_rds_libs:
                if 'long_reads_ref' not in lrds_lib:
                    continue
                for kb_ld in kb_lrds_data:
                    if lrds_lib['long_reads_ref'] != kb_ld['reads_ref']:
                        continue
                    long_type = lrds_lib['long_reads_type']
                    if long_type in long_groups:
                        kb_ld['long_reads_type'] = long_type
                        long_groups[long_type].append(kb_ld)

        return (sgl_rds_data, pe_rds_data, mp_rds_data, pb_ccs_data, pb_clr_data, np_rds_data,
                sgr_rds_data, tr_ctg_data, ut_ctg_data)

    def construct_yaml_dataset_file(self, sgl_libs=None, pe_libs=None, mp_libs=None,
                                    pb_ccs=None, pb_clr=None, np_libs=None,
                                    sgr_libs=None, tr_ctgs=None, ut_ctgs=None):
        """
        construct_yaml_dataset_file: Specifying input data with YAML data set file (advanced)
        An alternative way to specify an input data set for SPAdes is to create a YAML
        data set file.
        By using a YAML file you can provide an unlimited number of paired-end, mate-pair
        and unpaired libraries. Basically, YAML data set file is a text file, in which input
        libraries are provided as a comma-separated list in square brackets. Each library is
        provided in braces as a comma-separated list of attributes.

        The following attributes are available:

            - orientation ("fr", "rf", "ff")
            - type ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio", "nanopore",
                "sanger", "trusted-contigs", "untrusted-contigs")
            - interlaced reads (comma-separated list of files with interlaced reads)
            - left reads (comma-separated list of files with left reads)
            - right reads (comma-separated list of files with right reads)
            - single reads (comma-separated list of files with single reads or unpaired reads
                from paired library)
            - merged reads (comma-separated list of files with merged reads)

        To properly specify a library you should provide its type and at least one file
        with reads.
        For ONT, PacBio, Sanger and contig libraries you can provide only single reads.
        Orientation is an optional attribute. Its default value is "fr" (forward-reverse)
        for paired-end libraries and "rf" (reverse-forward) for mate-pair libraries.

        The value for each attribute is given after a colon. Comma-separated lists of files
        should be given in square brackets.
        For each file you should provide its full path in double quotes. Make sure that files
        with right reads are given in the same order as corresponding files with left reads.

        For example, if you have one paired-end library split into two pairs of files:
            lib_pe1_left_1.fastq
            lib_pe1_right_1.fastq
            lib_pe1_left_2.fastq
            lib_pe1_right_2.fastq

        one mate-pair library:
            lib_mp1_left.fastq
            lib_mp1_right.fastq

        and PacBio CCS and CLR reads:
            pacbio_ccs.fastq
            pacbio_clr.fastq

        YAML file should look like this:
        ------------------------------------------------
        [
            {
                orientation: "fr",
                type: "paired-end",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_right_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_right_2.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_left_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_left_2.fastq"
                ]
            },
            {
                orientation: "rf",
                type: "mate-pairs",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_right.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_left.fastq"
                ]
            },
            {
                type: "single",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_ccs.fastq"
                ]
            },
            {
                type: "pacbio",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_clr.fastq"
                ]
            }
        ]
        ------------------------------------------------

        Once you have created a YAML file save it with .yaml extension (e.g. as my_data_set.yaml)
        and run SPAdes using the --dataset option:
        e.g., <SPAdes_bin_dir>/spades.py --dataset <your YAML file> -o spades_output

        :returns: path of the written 'input_data_set.yaml' file, or '' when
            there is no input data or the file could not be written
        """
        # STEP 1: get the working folder housing the .yaml file and the SPAdes results
        if not os.path.exists(self.proj_dir):
            os.makedirs(self.proj_dir)
        yaml_file_path = os.path.join(self.proj_dir, 'input_data_set.yaml')

        # STEP 2: construct and save the 'input_data_set.yaml' file
        # (SPAdes library type, libraries, parser), in the order the entries
        # are written; pacbio-ccs reads are treated as type 'single'.
        library_specs = [
            ('paired-end', pe_libs, self._parse_pair_reads),
            ('mate-pairs', mp_libs, self._parse_pair_reads),
            ('single', sgl_libs, self._parse_single_reads),
            ('single', pb_ccs, self._parse_single_reads),
            ('pacbio', pb_clr, self._parse_single_reads),
            ('nanopore', np_libs, self._parse_single_reads),
            ('sanger', sgr_libs, self._parse_single_reads),
            ('trusted-contigs', tr_ctgs, self._parse_single_reads),
            ('untrusted-contigs', ut_ctgs, self._parse_single_reads),
        ]

        # generate the object array
        input_data_set = []
        for lib_type, libs, parse in library_specs:
            if not libs:
                continue
            parsed = parse(lib_type, libs)
            if parsed:
                input_data_set.append(parsed)

        if not input_data_set:
            print('Empty input data set!!')
            return ''

        pprint(input_data_set)
        try:
            # JSON is written on purpose: the nested list/dict structure is
            # dumped as-is into the .yaml file.
            with open(yaml_file_path, 'w') as yaml_file:
                json.dump(input_data_set, yaml_file)
        except IOError as ioerr:
            log('Creation of the {} file raised error:\n'.format(yaml_file_path))
            pprint(ioerr)
            return ''
        else:
            return yaml_file_path

    @staticmethod
    def _strip_meta_incompatible(cmd):
        """Return a copy of `cmd` without the flags SPAdes rejects in --meta mode."""
        cleaned = []
        skip_value = False
        for arg in cmd:
            if skip_value:
                # value that belonged to a dropped --cov-cutoff
                skip_value = False
                continue
            if arg in ('--careful', '--mismatch-correction'):
                continue
            if arg == '--cov-cutoff':
                skip_value = True
                continue
            cleaned.append(arg)
        return cleaned

    def run_assemble(self, yaml_file, kmer_sizes, dna_source=None,
                     basic_opts=None, pipeline_opts=['careful']):
        """
        run_assemble: run the SPAdes assemble with given input parameters/options

        :param yaml_file: path of the YAML data set file; spades.py is run
            from the directory containing it
        :param kmer_sizes: comma-separated k-mer sizes (or None to let SPAdes choose)
        :param dna_source: dna_source value; metagenomic raises the memory/thread caps
        :param basic_opts: list of basic spades.py options (default: '-o' output dir)
        :param pipeline_opts: list of pipeline option names (never modified here)
        :returns: 1 if the YAML file does not exist, otherwise the spades.py
            exit code (always 0, because failures raise)
        :raises ValueError: if too little memory is available or spades.py fails
        """
        exit_code = 1
        if not os.path.isfile(yaml_file):
            log("The input data set yaml file DOES NOT exist at {}\n".format(yaml_file))
            return exit_code

        log("The input data set yaml file exists at {}\n".format(yaml_file))
        yf_dir, yf_nm = os.path.split(yaml_file)

        available_bytes = psutil.virtual_memory().available
        # spades.py expects a whole number of gigabytes for --memory, so
        # truncate instead of passing a float such as '12.34'.
        mem = int(available_bytes / self.GB - self.MEMORY_OFFSET_GB)
        if mem < self.MIN_MEMORY_GB:
            raise ValueError(
                'Only ' + str(available_bytes) +
                ' bytes of memory are available. The SPAdes wrapper will' +
                ' not run without at least ' +
                str(self.MIN_MEMORY_GB + self.MEMORY_OFFSET_GB) +
                ' gigabytes available')

        if dna_source and dna_source == self.PARAM_IN_METAGENOME:
            max_mem = self.MAX_MEMORY_GB_META_SPADES
            max_threads = self.MAX_THREADS_META
        else:
            max_mem = self.MAX_MEMORY_GB_SPADES
            max_threads = self.MAX_THREADS

        # cpu_count() can return None when the count cannot be determined
        threads = min(max_threads, (psutil.cpu_count() or 1) * self.THREADS_PER_CORE)
        mem = min(mem, max_mem)

        tmpdir = os.path.join(self.proj_dir, 'spades_tmp_dir')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        a_cmd = [os.path.join(self.SPADES_BIN, 'spades.py')]
        a_cmd += ['--threads', str(threads), '--memory', str(mem)]
        a_cmd += ['--tmp-dir', tmpdir]
        a_cmd += ['--dataset', yaml_file]

        if kmer_sizes is not None:
            if not isinstance(kmer_sizes, str):
                kmer_sizes = ",".join(str(k) for k in kmer_sizes)
            # Option and value are separate argv entries (no shell is used).
            a_cmd += ['-k', kmer_sizes]

        if basic_opts is None:
            basic_opts = ['-o', self.ASSEMBLE_RESULTS_DIR]
        if isinstance(basic_opts, list):
            a_cmd += basic_opts

        if pipeline_opts and isinstance(pipeline_opts, list):
            for p_opt in pipeline_opts:
                flag = self._PIPELINE_FLAGS.get(p_opt)
                if flag is not None:
                    a_cmd.append(flag)

        # Last check of command options before the call
        if '--meta' in a_cmd:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode!
            a_cmd = self._strip_meta_incompatible(a_cmd)

        log("**************The HybridSPAdes assembling command is:\n{}".format(' '.join(a_cmd)))

        assemble_out_dir = os.path.join(self.proj_dir, self.ASSEMBLE_RESULTS_DIR)
        if not os.path.exists(assemble_out_dir):
            os.makedirs(assemble_out_dir)

        # An empty directory part means "current directory" for Popen.
        p = subprocess.Popen(a_cmd, cwd=yf_dir or None, shell=False)
        exit_code = p.wait()
        log('Return code: ' + str(exit_code))

        if exit_code != 0:
            raise ValueError('Error running spades.py, return code: ' + str(exit_code) + '\n')

        return exit_code

    def save_assembly(self, fa_file_path, wsname, a_name, min_ctg_length=0):
        """
        save_assembly: save the assembly to KBase workspace

        :param fa_file_path: path of the assembled FASTA file
        :param wsname: target workspace name
        :param a_name: name of the assembly object to create
        :param min_ctg_length: when > 0, passed on as 'min_contig_length'

        Only logs a message (and saves nothing) when the FASTA file is missing.
        """
        if not os.path.isfile(fa_file_path):
            log("The resulting sequence file {} is not found.".format(fa_file_path))
            return

        log('Uploading FASTA file to Assembly...')
        save_params = {'file': {'path': fa_file_path},
                       'workspace_name': wsname,
                       'assembly_name': a_name}
        if min_ctg_length > 0:
            save_params['min_contig_length'] = min_ctg_length
        self.au.save_assembly_from_fasta(save_params)
class NarrativeManager:
    """
    Server-side helper for listing workspace data (with sets and data
    palettes) and for creating and copying Narrative objects.
    """

    # Keys / values used inside a cell's metadata block
    KB_CELL = 'kb-cell'
    KB_TYPE = 'type'
    KB_APP_CELL = 'kb_app'
    KB_FUNCTION_CELL = 'function_input'
    KB_OUTPUT_CELL = 'function_output'
    KB_ERROR_CELL = 'kb_error'
    KB_CODE_CELL = 'kb_code'
    KB_STATE = 'widget_state'

    # When True, the listing code prints timing information
    DEBUG = False

    DATA_PALETTES_TYPES = DataPaletteTypes(False)

    def __init__(self, config, ctx, set_api_client, data_palette_client):
        """
        :param config: mapping providing 'narrative-method-store',
            'workspace-url', 'intro-markdown-file' and 'kbase-endpoint'
        :param ctx: mapping providing 'token' and 'user_id'
        :param set_api_client: client exposing call_method (used for "list_sets")
        :param data_palette_client: client exposing call_method (used for "list_data")
        """
        self.narrativeMethodStoreURL = config['narrative-method-store']
        self.set_api_cache = set_api_client  # DynamicServiceCache type
        self.data_palette_client = data_palette_client  # DynamicServiceCache type
        self.token = ctx["token"]
        self.user_id = ctx["user_id"]
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.intro_md_file = config['intro-markdown-file']
        # We switch DPs on only for internal Continuous Integration environment for now:
        if config['kbase-endpoint'].startswith("https://ci.kbase.us/") or \
                'USE_DP' in os.environ:
            self.DATA_PALETTES_TYPES = DataPaletteTypes(True)

    def list_objects_with_sets(self, ws_id=None, ws_name=None, workspaces=None,
                               types=None, include_metadata=0,
                               include_data_palettes=0):
        """
        List objects of one or more workspaces, expanding sets.

        If `workspaces` is empty, a single workspace is derived from
        `ws_name` (preferred) or `ws_id`.

        :raises ValueError: if none of ws_id, ws_name, workspaces is given
        :returns: see _list_objects_with_sets
        """
        if not workspaces:
            if not ws_id and not ws_name:
                raise ValueError(
                    "One and only one of 'ws_id', 'ws_name', 'workspaces' " +
                    "parameters should be set")
            workspaces = [self._get_workspace_name_or_id(ws_id, ws_name)]
        return self._list_objects_with_sets(workspaces, types, include_metadata,
                                            include_data_palettes)

    def _list_objects_with_sets(self, workspaces, types, include_metadata,
                                include_data_palettes):
        """
        Collect sets (with their items), plain workspace objects and,
        optionally, data palette entries for `workspaces`.

        :param types: optional iterable of type names; only objects passing
            _check_info_type are returned
        :returns: {'data': [...]} where each item has 'object_info' and may
            have 'set_items' and 'dp_info'; 'data_palette_refs' is added when
            include_data_palettes == 1
        """
        type_map = None
        if types is not None:
            type_map = {key: True for key in types}

        # ref -> data item already emitted, so an object is listed only once
        processed_refs = {}
        data = []

        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: processing sets")
        t1 = time.time()
        set_ret = self.set_api_cache.call_method(
            "list_sets",
            [{
                'workspaces': workspaces,
                'include_set_item_info': 1,
                'include_metadata': include_metadata
            }],
            self.token)
        sets = set_ret['sets']
        for set_info in sets:
            # Process
            target_set_items = []
            for set_item in set_info['items']:
                target_set_items.append(set_item['info'])
            if self._check_info_type(set_info['info'], type_map):
                data_item = {
                    'object_info': set_info['info'],
                    'set_items': {
                        'set_items_info': target_set_items
                    }
                }
                data.append(data_item)
                processed_refs[set_info['ref']] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t1) + ")")

        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: loading ws_info")
        t2 = time.time()
        ws_info_list = []
        # for ws in workspaces:
        if len(workspaces) == 1:
            ws = workspaces[0]
            ws_id = None
            ws_name = None
            # an all-digit identifier is treated as a workspace id
            if str(ws).isdigit():
                ws_id = int(ws)
            else:
                ws_name = str(ws)
            ws_info_list.append(
                self.ws.get_workspace_info({
                    "id": ws_id,
                    "workspace": ws_name
                }))
        else:
            ws_map = {key: True for key in workspaces}
            for ws_info in self.ws.list_workspace_info({'perm': 'r'}):
                if ws_info[1] in ws_map or str(ws_info[0]) in ws_map:
                    ws_info_list.append(ws_info)
        if self.DEBUG:
            print("    (time=" + str(time.time() - t2) + ")")

        if self.DEBUG:
            print(
                "NarrativeManager._list_objects_with_sets: loading workspace objects"
            )
        t3 = time.time()
        for info in WorkspaceListObjectsIterator(
                self.ws,
                ws_info_list=ws_info_list,
                list_objects_params={'includeMetadata': include_metadata}):
            item_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            if item_ref not in processed_refs and self._check_info_type(
                    info, type_map):
                data_item = {'object_info': info}
                data.append(data_item)
                processed_refs[item_ref] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t3) + ")")

        return_data = {"data": data}

        if include_data_palettes == 1:
            if self.DEBUG:
                print(
                    "NarrativeManager._list_objects_with_sets: processing DataPalettes"
                )
            t5 = time.time()
            dp_ret = self.data_palette_client.call_method(
                "list_data",
                [{
                    'workspaces': workspaces,
                    'include_metadata': include_metadata
                }],
                self.token)
            for item in dp_ret['data']:
                ref = item['ref']
                if self._check_info_type(item['info'], type_map):
                    data_item = None
                    if ref in processed_refs:
                        data_item = processed_refs[ref]
                    else:
                        data_item = {'object_info': item['info']}
                        processed_refs[ref] = data_item
                        data.append(data_item)
                    dp_info = {}
                    if 'dp_ref' in item:
                        dp_info['ref'] = item['dp_ref']
                    if 'dp_refs' in item:
                        dp_info['refs'] = item['dp_refs']
                    data_item['dp_info'] = dp_info
            return_data["data_palette_refs"] = dp_ret['data_palette_refs']
            if self.DEBUG:
                print("    (time=" + str(time.time() - t5) + ")")

        return return_data

    def _check_info_type(self, info, type_map):
        """
        Return True when no type filter is set; otherwise return whether
        info[2], cut at its first '-', is a key of `type_map`.
        """
        if type_map is None:
            return True
        obj_type = info[2].split('-')[0]
        return type_map.get(obj_type, False)

    def copy_narrative(self, newName, workspaceRef, workspaceId):
        """
        Clone the workspace of an existing narrative into a new workspace and
        save a copy of the narrative object there under the name `newName`.

        :param newName: nice name of the new narrative
        :param workspaceRef: reference of the narrative object to copy
        :param workspaceId: id of the source workspace; when falsy it is
            taken from the narrative object's info
        :returns: {'newWsId': ..., 'newNarId': ...}
        :raises Exception: any failure after cloning is re-raised once the
            new workspace has been deleted
        """
        time_ms = int(round(time.time() * 1000))
        newWsName = self.user_id + ':narrative_' + str(time_ms)
        # add the 'narrative' field to newWsMeta later.
        newWsMeta = {"narrative_nice_name": newName, "searchtags": "narrative"}

        # start with getting the existing narrative object.
        currentNarrative = self.ws.get_objects([{'ref': workspaceRef}])[0]
        if not workspaceId:
            workspaceId = currentNarrative['info'][6]
        # Let's prepare exceptions for clone the workspace.
        # 1) currentNarrative object:
        excluded_list = [{'objid': currentNarrative['info'][0]}]
        # 2) let's exclude objects of types under DataPalette handling:

        ## DP CODE
        # data_palette_type = "DataPalette.DataPalette"
        # excluded_types = [data_palette_type]
        # excluded_types.extend(self.DATA_PALETTES_TYPES.keys())
        # add_to_palette_list = []
        # dp_detected = False
        ## END DP CODE
        # for obj_type in excluded_types:
        #     list_objects_params = {'type': obj_type}
        ## DP CODE
        #     if obj_type == data_palette_type:
        #         list_objects_params['showHidden'] = 1
        ## END DP CODE
        #     for info in WorkspaceListObjectsIterator(self.ws,
        #                                              ws_id=workspaceId,
        #                                              list_objects_params=list_objects_params):
        ## DP CODE
        #         if obj_type == data_palette_type:
        #             dp_detected = True
        #         else:
        #             add_to_palette_list.append({
        #                 'ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        #             })
        ## END DP CODE
        #         excluded_list.append({'objid': info[0]})
        # clone the workspace EXCEPT for currentNarrative object
        newWsId = self.ws.clone_workspace({
            'wsi': {
                'id': workspaceId
            },
            'workspace': newWsName,
            'meta': newWsMeta,
            'exclude': excluded_list
        })[0]
        try:
            ## DP CODE
            # if dp_detected:
            #     self.data_palette_client.call_method(
            #         "copy_palette",
            #         [{'from_workspace': str(workspaceId), 'to_workspace': str(newWsId)}],
            #         self.token
            #     )
            # if len(add_to_palette_list) > 0:
            #     # There are objects in source workspace that have type under DataPalette handling
            #     # but these objects are physically stored in source workspace rather that saved
            #     # in DataPalette object. So they weren't copied by "dps.copy_palette".
            #     self.data_palette_client.call_method(
            #         "add_to_palette",
            #         [{'workspace': str(newWsId), 'new_refs': add_to_palette_list}],
            #         self.token
            #     )
            ## END DP CODE

            # update the ref inside the narrative object and the new workspace metadata.
            newNarMetadata = currentNarrative['info'][10]
            newNarMetadata['name'] = newName
            newNarMetadata['ws_name'] = newWsName
            newNarMetadata['job_info'] = json.dumps({
                'queue_time': 0,
                'running': 0,
                'completed': 0,
                'run_time': 0,
                'error': 0
            })

            is_temporary = newNarMetadata.get('is_temporary', 'false')
            if 'is_temporary' not in newNarMetadata:
                if newNarMetadata['name'] == 'Untitled' or newNarMetadata[
                        'name'] is None:
                    is_temporary = 'true'
                newNarMetadata['is_temporary'] = is_temporary

            currentNarrative['data']['metadata']['name'] = newName
            currentNarrative['data']['metadata']['ws_name'] = newWsName
            currentNarrative['data']['metadata']['job_ids'] = {
                'apps': [],
                'methods': [],
                'job_usage': {
                    'queue_time': 0,
                    'run_time': 0
                }
            }
            # save the shiny new Narrative so it's at version 1
            newNarInfo = self.ws.save_objects({
                'id': newWsId,
                'objects': [{
                    'type': currentNarrative['info'][2],
                    'data': currentNarrative['data'],
                    'provenance': currentNarrative['provenance'],
                    'name': currentNarrative['info'][1],
                    'meta': newNarMetadata
                }]
            })
            # now, just update the workspace metadata to point
            # to the new narrative object

            if 'worksheets' in currentNarrative['data']:  # handle legacy.
                num_cells = len(
                    currentNarrative['data']['worksheets'][0]['cells'])
            else:
                num_cells = len(currentNarrative['data']['cells'])
            newNarId = newNarInfo[0][0]
            self.ws.alter_workspace_metadata({
                'wsi': {
                    'id': newWsId
                },
                'new': {
                    'narrative': str(newNarId),
                    'is_temporary': is_temporary,
                    'cell_count': str(num_cells)
                }
            })
            return {'newWsId': newWsId, 'newNarId': newNarId}
        except Exception:
            # let's delete copy of workspace so it's out of the way - it's broken
            self.ws.delete_workspace({'id': newWsId})
            raise

    def create_new_narrative(self, app, method, appparam, appData, markdown,
                             copydata, importData, includeIntroCell, title):
        """
        Create a new narrative in a new workspace.

        :param app: app id for a single initial app cell (exclusive with method)
        :param method: method id for a single initial method cell
        :param appparam: ';'-separated items, each 'step,name,value'; used
            only when `appData` is empty
        :param appData: list of [step, name, value] cell parameters
        :param markdown: markdown text for a single initial markdown cell
        :param copydata: ';'-separated object refs; used only when
            `importData` is empty
        :param importData: list of object refs to copy into the new workspace
        :param includeIntroCell: 1 to prepend the introductory markdown cell
        :param title: narrative title
        :returns: result of _create_temp_narrative
        :raises ValueError: if both app and method are given, or an
            `appparam` item has fewer than three comma-separated fields
        """
        if app and method:
            raise ValueError(
                "Must provide no more than one of the app or method params")

        if not importData and copydata:
            importData = copydata.split(';')

        if not appData and appparam:
            appData = []
            for tmp_item in appparam.split(';'):
                tmp_tuple = tmp_item.split(',')
                if len(tmp_tuple) < 3:
                    # fail with context instead of a bare IndexError below
                    raise ValueError(
                        "Malformed appparam item '" + tmp_item +
                        "': expected three comma-separated fields")
                step_pos = None
                if tmp_tuple[0]:
                    try:
                        step_pos = int(tmp_tuple[0])
                    except ValueError:
                        # a non-numeric step position is kept as None
                        pass
                appData.append([step_pos, tmp_tuple[1], tmp_tuple[2]])

        cells = None
        if app:
            cells = [{"app": app}]
        elif method:
            cells = [{"method": method}]
        elif markdown:
            cells = [{"markdown": markdown}]
        return self._create_temp_narrative(cells, appData, importData,
                                           includeIntroCell, title)

    def _get_intro_markdown(self):
        """
        Creates and returns a cell with the introductory text included.
        """
        # Load introductory markdown text
        with open(self.intro_md_file) as intro_file:
            intro_md = intro_file.read()
        return intro_md

    def _create_temp_narrative(self, cells, parameters, importData,
                               includeIntroCell, title):
        """
        Create a workspace, save a new narrative object into it and complete
        the workspace metadata.

        :returns: {'workspaceInfo': ..., 'narrativeInfo': ...}
        """
        # Migration to python of JavaScript class from https://github.com/kbase/kbase-ui/blob/4d31151d13de0278765a69b2b09f3bcf0e832409/src/client/modules/plugins/narrativemanager/modules/narrativeManager.js#L414
        narr_id = int(round(time.time() * 1000))
        workspaceName = self.user_id + ':narrative_' + str(narr_id)
        narrativeName = "Narrative." + str(narr_id)

        ws = self.ws
        ws_info = ws.create_workspace({
            'workspace': workspaceName,
            'description': ''
        })
        [narrativeObject, metadataExternal
         ] = self._fetchNarrativeObjects(workspaceName, cells, parameters,
                                         includeIntroCell, title)
        # only a real, non-default title makes the narrative permanent
        is_temporary = 'true'
        if title is not None and title != 'Untitled':
            is_temporary = 'false'

        metadataExternal['is_temporary'] = is_temporary
        objectInfo = ws.save_objects({
            'workspace': workspaceName,
            'objects': [{
                'type': 'KBaseNarrative.Narrative',
                'data': narrativeObject,
                'name': narrativeName,
                'meta': metadataExternal,
                'provenance': [{
                    'script': 'NarrativeManager.py',
                    'description': 'Created new ' +
                                   'Workspace/Narrative bundle.'
                }],
                'hidden': 0
            }]
        })[0]
        objectInfo = ServiceUtils.object_info_to_object(objectInfo)
        ws_info = self._completeNewNarrative(ws_info[0], objectInfo['id'],
                                             importData, is_temporary, title,
                                             len(narrativeObject['cells']))
        return {
            'workspaceInfo': ServiceUtils.workspace_info_to_object(ws_info),
            'narrativeInfo': objectInfo
        }

    def _fetchNarrativeObjects(self, workspaceName, cells, parameters,
                               includeIntroCell, title):
        """
        Build the narrative document and its workspace metadata.

        App/method specs referenced by `cells` are fetched from the Narrative
        Method Store.

        :returns: [narrativeObject, metadataExternal]; metadataExternal holds
            the narrative metadata with every non-string value JSON-encoded
        """
        if not cells:
            cells = []
        if not title:
            title = 'Untitled'

        # fetchSpecs
        appSpecIds = []
        methodSpecIds = []
        specMapping = {'apps': {}, 'methods': {}}
        for cell in cells:
            if 'app' in cell:
                appSpecIds.append(cell['app'])
            elif 'method' in cell:
                methodSpecIds.append(cell['method'])
        nms = NarrativeMethodStore(self.narrativeMethodStoreURL,
                                   token=self.token)
        if len(appSpecIds) > 0:
            appSpecs = nms.get_app_spec({'ids': appSpecIds})
            for spec in appSpecs:
                spec_id = spec['info']['id']
                specMapping['apps'][spec_id] = spec
        if len(methodSpecIds) > 0:
            methodSpecs = nms.get_method_spec({'ids': methodSpecIds})
            for spec in methodSpecs:
                spec_id = spec['info']['id']
                specMapping['methods'][spec_id] = spec
        # end of fetchSpecs

        metadata = {
            'job_ids': {
                'methods': [],
                'apps': [],
                'job_usage': {
                    'queue_time': 0,
                    'run_time': 0
                }
            },
            'format': 'ipynb',
            'creator': self.user_id,
            'ws_name': workspaceName,
            'name': title,
            'type': 'KBaseNarrative.Narrative',
            'description': '',
            'data_dependencies': []
        }
        cellData = self._gatherCellData(cells, specMapping, parameters,
                                        includeIntroCell)
        narrativeObject = {
            'nbformat_minor': 0,
            'cells': cellData,
            'metadata': metadata,
            'nbformat': 4
        }
        metadataExternal = {}
        for key in metadata:
            value = metadata[key]
            if isinstance(value, str):
                metadataExternal[key] = value
            else:
                metadataExternal[key] = json.dumps(value)
        return [narrativeObject, metadataExternal]

    def _gatherCellData(self, cells, specMapping, parameters,
                        includeIntroCell):
        """
        Turn the cell descriptors into notebook cells, optionally preceded by
        the introductory markdown cell.

        :raises ValueError: for a descriptor without 'app', 'method' or
            'markdown'
        """
        cell_data = []
        if includeIntroCell == 1:
            cell_data.append({
                'cell_type': 'markdown',
                'source': self._get_intro_markdown(),
                'metadata': {}
            })
        for cell_pos, cell in enumerate(cells):
            if 'app' in cell:
                cell_data.append(
                    self._buildAppCell(len(cell_data),
                                       specMapping['apps'][cell['app']],
                                       parameters))
            elif 'method' in cell:
                cell_data.append(
                    self._buildMethodCell(
                        len(cell_data),
                        specMapping['methods'][cell['method']], parameters))
            elif 'markdown' in cell:
                cell_data.append({
                    'cell_type': 'markdown',
                    'source': cell['markdown'],
                    'metadata': {}
                })
            else:
                raise ValueError("cannot add cell #" + str(cell_pos) +
                                 ", unrecognized cell content")
        return cell_data

    def _buildAppCell(self, pos, spec, params):
        """
        Build a markdown cell that embeds the app widget for `spec`.

        `params` items are read as [step, name, value] and grouped per step
        into the widget state. Note that _safeJSONStringify escapes the
        strings of `spec` in place.
        """
        cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4())
        cell = {
            "cell_type": "markdown",
            "source": "<div id='" + cellId + "'></div>" + "\n<script>" +
                      "$('#" + cellId + "').kbaseNarrativeAppCell({'appSpec' : '" +
                      self._safeJSONStringify(spec) + "', 'cellId' : '" + cellId +
                      "'});" + "</script>",
            "metadata": {}
        }
        cellInfo = {}
        widgetState = []
        cellInfo[self.KB_TYPE] = self.KB_APP_CELL
        cellInfo['app'] = spec
        if params:
            steps = {}
            for param in params:
                stepid = 'step_' + str(param[0])
                if stepid not in steps:
                    steps[stepid] = {'inputState': {}}
                steps[stepid]['inputState'][param[1]] = param[2]
            state = {'state': {'step': steps}}
            widgetState.append(state)
        cellInfo[self.KB_STATE] = widgetState
        cell['metadata'][self.KB_CELL] = cellInfo
        return cell

    def _buildMethodCell(self, pos, spec, params):
        """
        Build a markdown cell that embeds the method widget for `spec`.

        `params` items are read as [step, name, value]; name/value pairs form
        the widget state. Note that _safeJSONStringify escapes the strings of
        `spec` in place.
        """
        cellId = "kb-cell-" + str(pos) + "-" + str(uuid.uuid4())
        cell = {
            "cell_type": "markdown",
            "source": "<div id='" + cellId + "'></div>" + "\n<script>" +
                      "$('#" + cellId + "').kbaseNarrativeMethodCell({'method' : '" +
                      self._safeJSONStringify(spec) + "'});" + "</script>",
            "metadata": {}
        }
        cellInfo = {"method": spec, "widget": spec["widgets"]["input"]}
        cellInfo[self.KB_TYPE] = self.KB_FUNCTION_CELL
        widgetState = []
        if params:
            wparams = {}
            for param in params:
                wparams[param[1]] = param[2]
            widgetState.append({"state": wparams})
        cellInfo[self.KB_STATE] = widgetState
        cell["metadata"][self.KB_CELL] = cellInfo
        return cell

    def _completeNewNarrative(self, workspaceId, objectId, importData,
                              is_temporary, title, num_cells):
        """
        'Completes' the new narrative by updating workspace metadata with the
        required fields and copying in data from the importData list of
        references.

        :returns: workspace info of `workspaceId` after the update
        """
        new_meta = {
            'narrative': str(objectId),
            'is_temporary': is_temporary,
            'searchtags': 'narrative',
            'cell_count': str(num_cells)
        }
        if is_temporary == 'false' and title is not None:
            new_meta['narrative_nice_name'] = title

        self.ws.alter_workspace_metadata({
            'wsi': {
                'id': workspaceId
            },
            'new': new_meta
        })
        # copy_to_narrative:
        if importData:
            objectsToCopy = [{'ref': x} for x in importData]
            infoList = self.ws.get_object_info_new({
                'objects': objectsToCopy,
                'includeMetadata': 0
            })
            for item in infoList:
                objectInfo = ServiceUtils.object_info_to_object(item)
                self.copy_object(objectInfo['ref'], workspaceId, None, None,
                                 objectInfo)

        return self.ws.get_workspace_info({'id': workspaceId})

    def _safeJSONStringify(self, obj):
        """
        Serialize `obj` to JSON after entity-escaping the quotes in its
        string values (`obj` is modified in place).
        """
        return json.dumps(self._safeJSONStringifyPrepare(obj))

    def _safeJSONStringifyPrepare(self, obj):
        """
        Recursively replace ' and " inside string values with HTML entities.

        The JSON produced from the result is embedded in a single-quoted
        JavaScript string inside HTML (see _buildAppCell), so raw quotes
        would break the cell source. Lists and dicts are updated in place;
        dict keys are left untouched.
        """
        if isinstance(obj, str):
            # The entities are assembled from two pieces so that tooling
            # which decodes HTML entities cannot collapse these literals.
            apos_entity = '&' + 'apos;'
            quot_entity = '&' + 'quot;'
            return obj.replace("'", apos_entity).replace('"', quot_entity)
        elif isinstance(obj, list):
            for pos in range(len(obj)):
                obj[pos] = self._safeJSONStringifyPrepare(obj[pos])
        elif isinstance(obj, dict):
            obj_keys = list(obj.keys())
            for key in obj_keys:
                obj[key] = self._safeJSONStringifyPrepare(obj[key])
        else:
            pass  # it's boolean/int/float/None
        return obj

    def _get_workspace_name_or_id(self, ws_id, ws_name):
        """Return `ws_name` when set, otherwise `ws_id` as a string."""
        ret = ws_name
        if not ret:
            ret = str(ws_id)
        return ret

    def copy_object(self, ref, target_ws_id, target_ws_name, target_name,
                    src_info):
        """
        Copies an object from one workspace to another.

        :param ref: reference of the object to copy
        :param target_ws_id: id of the target workspace (or None)
        :param target_ws_name: name of the target workspace (or None)
        :param target_name: name of the copy; defaults to the source name
        :param src_info: object info of the source as a dict; looked up from
            the workspace when falsy
        :returns: {'info': <object info of the copy>}
        :raises ValueError: if neither target workspace id nor name is given
        """
        if not target_ws_id and not target_ws_name:
            raise ValueError("Neither target workspace id nor name is defined")
        if not src_info:
            src_info_tuple = self.ws.get_object_info_new({
                'objects': [{
                    'ref': ref
                }],
                'includeMetadata': 0
            })[0]
            src_info = ServiceUtils.object_info_to_object(src_info_tuple)
        if not target_name:
            target_name = src_info['name']
        obj_info_tuple = self.ws.copy_object({
            'from': {
                'ref': ref
            },
            'to': {
                'wsid': target_ws_id,
                'workspace': target_ws_name,
                'name': target_name
            }
        })
        obj_info = ServiceUtils.object_info_to_object(obj_info_tuple)
        return {'info': obj_info}

    def list_available_types(self, workspaces):
        """
        Count the objects of `workspaces` per type name (info[2] cut at its
        first '-').

        :returns: {'type_stat': {type_name: count}}
        """
        data = self.list_objects_with_sets(workspaces=workspaces)['data']
        type_stat = {}
        for item in data:
            info = item['object_info']
            obj_type = info[2].split('-')[0]
            type_stat[obj_type] = type_stat.get(obj_type, 0) + 1
        return {'type_stat': type_stat}
def stage_input(self, input_ref, fasta_file_extension):
    '''
    Stage input based on an input data reference for CheckM

    input_ref can be a reference to an Assembly/ContigSet, AssemblySet,
    BinnedContigs, Genome or GenomeSet

    This method creates a directory in the scratch area with the set of Fasta files, names
    will have the fasta_file_extension parameter tacked on, and then concatenates them
    into one summary fasta file.

        ex:

        staged_input = stage_input('124/15/1', 'fna')

        staged_input
        {"input_dir": '...', "folder_suffix": '...', "all_seq_fasta": '...'}

    Raises ValueError if a client cannot be created, an object cannot be fetched,
    a fasta file is missing or empty, or the object type is not supported.
    '''
    # config
    #SERVICE_VER = 'dev'
    SERVICE_VER = 'release'
    [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I,
     CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
    ws = Workspace(self.ws_url)

    # 1) generate a folder in scratch to hold the input
    suffix = str(int(time.time() * 1000))
    input_dir = os.path.join(self.scratch, 'bins_' + suffix)
    all_seq_fasta = os.path.join(self.scratch,
                                 'all_sequences_' + suffix + '.' + fasta_file_extension)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)

    # 2) based on type, download the files
    obj_name = self.get_data_obj_name(input_ref)
    type_name = self.get_data_obj_type(input_ref)

    # auClient
    try:
        auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'],
                                service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate auClient with callbackURL: ' +
                         self.callbackURL + ' ERROR: ' + str(e))

    # setAPI_Client
    try:
        #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=self.ctx['token'])  # for dynamic service
    except Exception as e:
        raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                         self.serviceWizardURL + ' ERROR: ' + str(e))

    # mguClient
    try:
        mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
    except Exception as e:
        raise ValueError('Unable to instantiate mguClient with callbackURL: ' +
                         self.callbackURL + ' ERROR: ' + str(e))

    min_fasta_len = 1

    def download_assembly_fasta(assembly_ref, base_name):
        # Write one assembly into input_dir as <base_name>.<extension> and
        # make sure the file exists and is not empty.
        filename = os.path.join(input_dir, base_name + '.' + fasta_file_extension)
        auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
        if not os.path.isfile(filename):
            raise ValueError('Error generating fasta file from an Assembly or ContigSet '
                             'with AssemblyUtil')
        # make sure fasta file isn't empty
        if not self.fasta_seq_len_at_least(filename, min_fasta_len):
            raise ValueError('Assembly or ContigSet is empty in filename: ' + str(filename))

    # Standard Single Assembly
    #
    if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
        download_assembly_fasta(input_ref, obj_name)

    # AssemblySet
    #
    elif type_name == 'KBaseSets.AssemblySet':

        # read assemblySet
        try:
            assemblySet_obj = setAPI_Client.get_assembly_set_v1({'ref': input_ref,
                                                                 'include_item_info': 1})
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + input_ref + ')' + str(e))

        # resolve every member's name before downloading anything
        assembly_refs = []
        assembly_names = []
        for assembly_item in assemblySet_obj['data']['items']:
            this_assembly_ref = assembly_item['ref']
            # assembly obj info
            try:
                this_assembly_info = ws.get_object_info_new(
                    {'objects': [{'ref': this_assembly_ref}]})[0]
                this_assembly_name = this_assembly_info[NAME_I]
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 this_assembly_ref + '): ' + str(e))
            assembly_refs.append(this_assembly_ref)
            assembly_names.append(this_assembly_name)

        # create file data (name for file is what's reported in results)
        for assembly_ref, this_name in zip(assembly_refs, assembly_names):
            download_assembly_fasta(assembly_ref, this_name)

    # Binned Contigs
    #
    elif type_name == 'KBaseMetagenomes.BinnedContigs':

        # download the bins as fasta and set the input folder name
        bin_file_dir = mguClient.binned_contigs_to_file(
            {'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
        os.rename(bin_file_dir, input_dir)
        # make sure fasta file isn't empty
        self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        for (dirpath, dirnames, filenames) in os.walk(input_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(input_dir, fasta_file)
                if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                    raise ValueError('Binned Assembly is empty for fasta_path: ' +
                                     str(fasta_path))
            break  # only the files directly inside input_dir are checked

    # Genome and GenomeSet
    #
    elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
        genome_obj_names = []
        genome_assembly_refs = []

        if type_name == 'KBaseGenomes.Genome':
            genomeSet_refs = [input_ref]
        else:  # get genomeSet_refs from GenomeSet object
            genomeSet_refs = []
            try:
                genomeSet_object = ws.get_objects2(
                    {'objects': [{'ref': input_ref}]})['data'][0]['data']
            except Exception as e:
                raise ValueError('Unable to fetch ' + str(input_ref) +
                                 ' object from workspace: ' + str(e))
                #to get the full stack trace: traceback.format_exc()

            # iterate through genomeSet members
            for genome_id in genomeSet_object['elements'].keys():
                element = genomeSet_object['elements'][genome_id]
                if 'ref' not in element or element['ref'] is None or element['ref'] == '':
                    raise ValueError('genome_ref not found for genome_id: ' + str(genome_id) +
                                     ' in genomeSet: ' + str(input_ref))
                genomeSet_refs.append(element['ref'])

        # genome obj data
        for this_input_ref in genomeSet_refs:
            try:
                objects = ws.get_objects2({'objects': [{'ref': this_input_ref}]})['data']
                genome_obj = objects[0]['data']
                genome_obj_name = objects[0]['info'][NAME_I]
                genome_sci_name = genome_obj['scientific_name']
            except Exception as e:
                # keep the cause visible instead of swallowing it
                raise ValueError("unable to fetch genome: " + this_input_ref + ": " + str(e))

            # Get genome_assembly_ref (assembly_ref is preferred over contigset_ref)
            genome_label = ("Genome " + genome_obj_name + " (ref:" + this_input_ref + ") " +
                            genome_sci_name)
            if genome_obj.get('assembly_ref') is not None:
                ref_field = 'assembly_ref'
            elif genome_obj.get('contigset_ref') is not None:
                ref_field = 'contigset_ref'
            else:
                raise ValueError(genome_label + " MISSING BOTH contigset_ref AND assembly_ref." +
                                 "  Cannot process.  Exiting.")
            print(genome_label + " USING " + ref_field + ": " + str(genome_obj[ref_field]))
            genome_obj_names.append(genome_obj_name)
            genome_assembly_refs.append(genome_obj[ref_field])

        # create file data (name for file is what's reported in results)
        for assembly_ref, this_name in zip(genome_assembly_refs, genome_obj_names):
            download_assembly_fasta(assembly_ref, this_name)

    # Unknown type slipped through
    #
    else:
        raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)

    # create summary fasta file with all bins
    self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

    return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
class masurca_utils: """ masurca_utils: defining a system of utils for running masurca """ MaSuRCA_VERSION = 'MaSuRCA-3.2.9' MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca' PARAM_IN_WS = 'workspace_name' PARAM_IN_THREADN = 'num_threads' PARAM_IN_READS_LIBS = 'reads_libraries' PARAM_IN_JUMP_LIBS = 'jump_libraries' PARAM_IN_JF_SIZE = 'jf_size' PARAM_IN_CS_NAME = 'output_contigset_name' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') def __init__(self, prj_dir, config): self.workspace_url = config['workspace-url'] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] if 'shock-url' in config: self.shock_url = config['shock-url'] if 'handle-service-url' in config: self.handle_url = config['handle-service-url'] self.ws_client = Workspace(self.workspace_url, token=self.token) self.ru = ReadsUtils(self.callback_url, token=self.token) self.au = AssemblyUtil(self.callback_url, token=self.token) self.kbr = KBaseReport(self.callback_url) self.kbq = kb_quast(self.callback_url) self.proj_dir = prj_dir self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir) def _has_long_reads(self, params): """ _has_long_reads: check if a long reads input exists in the parameters """ return (params.get('pacbio_reads', None) or params.get('nanopore_reads', None) or params.get('other_frg_file', None)) def _get_data_portion(self, pe_reads_data, jp_reads_data=None, pacbio_reads_file='', nanopore_reads_file='', other_frg_file=''): """ _get_data_portion: build the 'DATA...END' portion for the config.txt file """ data_str = '' if pe_reads_data: # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1))) for pe in pe_reads_data: if data_str != '': data_str += '\n' data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \ str(pe['pe_stdev']) + ' ' + pe['fwd_file'] if pe.get('rev_file', None): data_str += ' ' + pe['rev_file'] if jp_reads_data: # log('JUMP reads data 
details:\n{}'.format(json.dumps(jp_reads_data, indent=1))) for jp in jp_reads_data: if data_str != '': data_str += '\n' data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \ str(jp['jp_stdev']) + ' ' + jp['fwd_file'] if jp.get('rev_file', None): data_str += ' ' + jp['rev_file'] # Adding the pacbio_reads # Note that pcbio reads must be in a single fasta file! # For example: # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta' # ***if you have both types of reads supply them both as NANOPORE type*** if pacbio_reads_file != '': if data_str != '': data_str += '\n' if nanopore_reads_file != '': data_str += 'NANOPORE=' + pacbio_reads_file else: data_str += 'PACBIO=' + pacbio_reads_file # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file! # For example: # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta' if nanopore_reads_file != '': if data_str != '': data_str += '\n' data_str += 'NANOPORE= ' + nanopore_reads_file # Adding the other_frg_file inputs if any # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into # Celera Assembler compatible .frg file # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg if other_frg_file != '': if data_str != '': data_str += '\n' data_str += 'OTHER=' + other_frg_file return data_str def _get_parameters_portion(self, params): """ build the 'PARAMETERS...END' portion for the config.txt file """ # set the default parameters as suggested in the example configuration file param_str = ( "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE" + "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0") if (params.get('graph_kmer_size', None) and type(params['graph_kmer_size']) == int): if param_str != '': param_str += '\n' param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size']) else: if param_str != '': param_str += '\n' param_str += 
'GRAPH_KMER_SIZE=auto' if params.get('use_linking_mates', None): if param_str != '': param_str += '\n' if params['use_linking_mates'] == 1 and not self._has_long_reads( params): param_str += 'USE_LINKING_MATES=1' else: param_str += 'USE_LINKING_MATES=0' if params.get('limit_jump_coverage', None): if param_str != '': param_str += '\n' param_str += 'LIMIT_JUMP_COVERAGE = ' + str( params['limit_jump_coverage']) if params.get('cgwErrorRate', None): if param_str != '': param_str += '\n' param_str += 'CA_PARAMETERS = cgwErrorRate=' + str( params['cgwErrorRate']) if params.get(self.PARAM_IN_THREADN, None): if param_str != '': param_str += '\n' param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN]) if params.get('jf_size', None): if param_str != '': param_str += '\n' param_str += 'JF_SIZE=' + str(params['jf_size']) if params.get('kmer_count_threshold', None): if param_str != '': param_str += '\n' param_str += 'KMER_COUNT_THRESHOLD=' + str( params['kmer_count_threshold']) if params.get('do_homopolymer_trim', None): if param_str != '': param_str += '\n' if params['do_homopolymer_trim'] == 1: param_str += 'DO_HOMOPOLYMER_TRIM=1' else: param_str += 'DO_HOMOPOLYMER_TRIM=0' if params.get('close_gaps', None): if param_str != '': param_str += '\n' if params['close_gaps'] == 1: param_str += 'CLOSE_GAPS=1' else: param_str += 'CLOSE_GAPS=0' if params.get('soap_assembly', None): if param_str != '': param_str += '\n' if params['soap_assembly'] == 1: param_str += 'SOAP_ASSEMBLY=1' else: param_str += 'SOAP_ASSEMBLY=0' return param_str def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt): """ replace a section of text of orig_txt between lines begin-patn and end-patn with repl_text examples of parameters: begin_patn1 = "DATA\n" begin_patn2 = "PARAMETERS\n" end_patn1 = "END\nPARAMETERS\n" end_patn2 = "END\n" repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' + ' /kb/module/work/testReads/small.reverse.fq\n') repl_txt2 = 
('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' + 'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n DO_HOMOPOLYMER_TRIM=0\n') """ if repl_txt != '': # create regular expression pattern repl = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL) repl_txt = begin_patn + repl_txt + '\n' + end_patn # replace the text between begin_patn and end_patn with repl_txt txt_replaced = repl.sub(repl_txt, orig_txt) # pprint(txt_replaced) return txt_replaced else: return orig_txt def _unique_prefix_check(self, pfix, refs): prefix_lookup = {} for ref in refs: pre = ref[pfix][0:2] if pre not in prefix_lookup: prefix_lookup[pre] = 1 else: raise ValueError('The first two characters in \'' + ref[pfix] + '\' has been used.') def _get_pereads_info(self, input_params): """ _get_pereads_info--from a list of paired_readsParams structures fetches the corresponding reads info with the paired_readsParams[pe_id] returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'pe_prefix': the two-letter prefix for the reads library, 'pe_mean': the average reads length for the reads library, 'pe_stdev': the standard deviation for the reads library, 'type': reads_type, #('interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, #only if paired end } """ rds_params = copy.deepcopy(input_params) wsname = rds_params[self.PARAM_IN_WS] rds_refs = [] rds_data = [] # reads_libraries grouped params if rds_params.get(self.PARAM_IN_READS_LIBS, None): pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS] for pe_lib in pe_reads_libs: if pe_lib.get('pe_id', None): rds_refs.append(pe_lib['pe_id']) rds_data = self._get_kbreads_info(wsname, rds_refs) for pe_lib in pe_reads_libs: i = 0 for rds in rds_data: i += 1 if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds[ 
'reads_ref']: if pe_lib.get('pe_prefix', None): rds['pe_prefix'] = pe_lib['pe_prefix'][0] else: rds['pe_prefix'] = 'p' rds['pe_prefix'] += str(i) pe_lib['pe_prefix'] = rds['pe_prefix'] if pe_lib.get('pe_mean', None) is None: pe_lib['pe_mean'] = 500 rds['pe_mean'] = pe_lib['pe_mean'] if pe_lib.get('pe_stdev', None) is None: pe_lib['pe_stdev'] = 50 rds['pe_stdev'] = pe_lib['pe_stdev'] self._unique_prefix_check('pe_prefix', pe_reads_libs) else: raise ValueError("Parameter {} is required.".format( self.PARAM_IN_READS_LIBS)) return rds_data def _get_jpreads_info(self, input_params): """ _get_jpreads_info--from a list of jump_readsParams structures fetches the corresponding reads info with the paired_readsParams[pe_id] returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'jp_prefix': the two-letter prefix for the reads library, 'jp_mean': the average reads length for the reads library, 'jp_stdev': the standard deviation for the reads library, 'type': reads_type, #('interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, #only if paired end } """ rds_params = copy.deepcopy(input_params) wsname = rds_params[self.PARAM_IN_WS] rds_refs = [] rds_data = [] # jump_libraries grouped params if rds_params.get(self.PARAM_IN_JUMP_LIBS, None): jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS] for jp_lib in jp_reads_libs: if jp_lib.get('jp_id', None): rds_refs.append(jp_lib['jp_id']) rds_data = self._get_kbreads_info(wsname, rds_refs) for jp_lib in jp_reads_libs: i = 0 for rds in rds_data: i += 1 if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds[ 'reads_ref']: if jp_lib.get('jp_prefix', None): rds['jp_prefix'] = jp_lib['jp_prefix'][0] else: rds['jp_prefix'] = 's' rds['jp_prefix'] += str(i) jp_lib['jp_prefix'] = rds['jp_prefix'] if jp_lib.get('jp_mean', None) is None: 
jp_lib['jp_mean'] = 3600 rds['jp_mean'] = jp_lib['jp_mean'] if jp_lib.get('jp_stdev', None) is None: jp_lib['jp_stdev'] = 200 rds['jp_stdev'] = jp_lib['jp_stdev'] self._unique_prefix_check('jp_prefix', jp_reads_libs) return rds_data def _get_kbreads_info(self, wsname, reads_refs): """ _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding reads info with as deinterleaved fastq files and returns a list of reads data in the following structure: reads_data = { 'fwd_file': path_to_fastq_file, 'type': reads_type, #('interleaved', 'paired', or 'single' 'seq_tech': sequencing_tech, 'reads_ref': KBase object ref for downstream convenience, 'reads_name': KBase object name for downstream convenience, 'rev_file': path_to_fastq_file, #only if paired end } """ obj_ids = [] for r in reads_refs: if r: obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)}) if not obj_ids: return [] ws_info = self.ws_client.get_object_info_new({'objects': obj_ids}) reads_params = [] reftoname = {} for wsi, oid in zip(ws_info, obj_ids): ref = oid['ref'] reads_params.append(ref) obj_name = wsi[1] reftoname[ref] = wsi[7] + '/' + obj_name typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' + 'KBaseFile.PairedEndLibrary ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary') try: reads = self.ru.download_reads({ 'read_libraries': reads_params, 'interleaved': 'false' })['files'] except ServerError as se: log('logging stacktrace from dynamic client error') log(se.data) if typeerr in se.message: prefix = se.message.split('.')[0] raise ValueError( prefix + '. 
Only the types ' + 'KBaseAssembly.SingleEndLibrary ' + 'KBaseAssembly.PairedEndLibrary ' + 'KBaseFile.SingleEndLibrary ' + 'and KBaseFile.PairedEndLibrary are supported') else: raise # log('Downloaded reads data from KBase:\n' + pformat(reads)) reads_data = [] for ref in reads_refs: reads_name = reftoname[ref] f = reads[ref]['files'] seq_tech = reads[ref]['sequencing_tech'] rds_info = { 'fwd_file': f['fwd'], 'reads_ref': ref, 'type': f['type'], 'seq_tech': seq_tech, 'reads_name': reads_name } if f.get('rev', None) is not None: rds_info['rev_file'] = f['rev'] reads_data.append(rds_info) return reads_data def _generate_output_file_list(self, out_dir): """ _generate_output_file_list: zip result files and generate file_links for report """ log('start packing result files') output_files = list() output_directory = os.path.join(self.proj_dir, str(uuid.uuid4())) mkdir_p(output_directory) masurca_output = os.path.join(output_directory, 'masurca_output.zip') self._zip_folder(out_dir, masurca_output) output_files.append({ 'path': masurca_output, 'name': os.path.basename(masurca_output), 'label': os.path.basename(masurca_output), 'description': 'Output file(s) generated by MaSuRCA' }) return output_files def _zip_folder(self, folder_path, output_path): """ _zip_folder: Zip the contents of an entire folder (with that folder included in the archive). Empty subfolders could be included in the archive as well if the commented portion is used. 
""" with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as ziph: for root, folders, files in os.walk(folder_path): for f in files: absolute_path = os.path.join(root, f) relative_path = os.path.join(os.path.basename(root), f) # print "Adding {} to archive.".format(absolute_path) ziph.write(absolute_path, relative_path) print("{} created successfully.".format(output_path)) # with zipfile.ZipFile(output_path, "r") as f: # print 'Checking the zipped file......\n' # for info in f.infolist(): # print info.filename, info.date_time, info.file_size, info.compress_size def _load_stats(self, input_file_name): log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly') log('Building Object.') if not os.path.isfile(input_file_name): raise Exception('The input file name {0} is not a file!'.format( input_file_name)) with open(input_file_name, 'r') as input_file_handle: contig_id = None sequence_len = 0 fasta_dict = dict() first_header_found = False # Pattern for replacing white space pattern = re.compile(r'\s+') for current_line in input_file_handle: if current_line[0] == '>': # found a header line # Wrap up previous fasta sequence if not first_header_found: first_header_found = True else: fasta_dict[contig_id] = sequence_len sequence_len = 0 fasta_header = current_line.replace('>', '').strip() try: contig_id = fasta_header.strip().split(' ', 1)[0] except (IndexError, KeyError, ValueError): contig_id = fasta_header.strip() else: sequence_len += len(re.sub(pattern, '', current_line)) # wrap up last fasta sequence if not first_header_found: raise Exception("There are no contigs in this file") else: fasta_dict[contig_id] = sequence_len return fasta_dict def _check_reference(self, ref): """ Tests the given ref string to make sure it conforms to the expected object reference format. Returns True if it passes, False otherwise. 
""" obj_ref_regex = re.compile( "^(?P<wsid>\d+)\/(?P<objid>\d+)(\/(?P<ver>\d+))?$") ref_path = ref.strip().split(";") for step in ref_path: if not obj_ref_regex.match(step): return False return True def _check_ref_type(self, ref, allowed_types): """ Validates the object type of ref against the list of allowed types. If it passes, this returns True, otherwise False. Really, all this does is verify that at least one of the strings in allowed_types is a substring of the ref object type name. Ex1: ref = "KBaseGenomes.Genome-4.0" allowed_types = ["assembly", "KBaseFile.Assembly"] returns False Ex2: ref = "KBaseGenomes.Genome-4.0" allowed_types = ["assembly", "genome"] returns True """ obj_type = self._get_object_type(ref).lower() for t in allowed_types: if t.lower() in obj_type: return True return False def _get_object_type(self, ref): """ Fetches and returns the typed object name of ref from the given workspace url. If that object doesn't exist, or there's another Workspace error, this raises a RuntimeError exception. """ info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]}) obj_info = info.get('infos', [[]])[0] if len(obj_info) == 0: raise RuntimeError( "An error occurred while fetching type info from the Workspace. " "No information returned for reference {}".format(ref)) return obj_info[2] def _get_fasta_from_assembly(self, assembly_ref): """ From an assembly or contigset, this uses a data file to build a FASTA file and return the path to it. 
""" allowed_types = [ 'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet' ] if not self._check_ref_type(assembly_ref, allowed_types): raise ValueError( "The reference {} cannot be used to fetch a FASTA file".format( assembly_ref)) au = AssemblyUtil(self.callback_url) return au.get_assembly_as_fasta({'ref': assembly_ref}) def generate_report(self, contig_file_name, params, out_dir, wsname): """ generate_report: reporting results """ log('Generating and saving report') contig_file_with_path = os.path.join(out_dir, contig_file_name) fasta_stats = self._load_stats(contig_file_with_path) lengths = [fasta_stats[contig_id] for contig_id in fasta_stats] assembly_ref = params[self.PARAM_IN_WS] + '/' + params[ self.PARAM_IN_CS_NAME] report_text = '' report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n' report_text += 'Assembly saved to: ' + assembly_ref + '\n' report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n' report_text += 'Avg Length: ' + str( sum(lengths) / float(len(lengths))) + ' bp.\n' # compute a simple contig length distribution bins = 10 counts, edges = np.histogram(lengths, bins) report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n' for c in range(bins): report_text += (' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n') print('Running QUAST') quastret = self.kbq.run_QUAST({ 'files': [{ 'path': contig_file_with_path, 'label': params[self.PARAM_IN_CS_NAME] }] }) output_files = self._generate_output_file_list(out_dir) print('Saving report') report_output = self.kbr.create_extended_report({ 'message': report_text, 'objects_created': [{ 'ref': assembly_ref, 'description': 'Assembled contigs' }], 'direct_html_link_index': 0, 'file_links': output_files, 'html_links': [{ 'shock_id': quastret['shock_id'], 'name': 'report.html', 'label': 'QUAST report' }], 'report_object_name': 'kb_masurca_report_' + str(uuid.uuid4()), 
'workspace_name': params[self.PARAM_IN_WS] }) report_name = report_output['name'] report_ref = report_output['ref'] return report_name, report_ref def validate_params(self, params): """ validate_params: checks params passed to run_masurca_app method and set default values """ # log('Start validating run_masurca_app parameters:\n{}'.format( # json.dumps(params, indent=1))) # check for mandatory parameters if params.get(self.PARAM_IN_WS, None) is None: raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory') if self.PARAM_IN_THREADN not in params: raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory') if params.get(self.PARAM_IN_JF_SIZE, None) is None: raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory') if params.get(self.PARAM_IN_READS_LIBS, None) is None: raise ValueError(self.PARAM_IN_READS_LIBS + ' parameter is mandatory') if type(params[self.PARAM_IN_READS_LIBS]) != list: raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list') if params.get(self.PARAM_IN_CS_NAME, None) is None: raise ValueError('Parameter {} is mandatory!'.format( self.PARAM_IN_CS_NAME)) if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]): raise ValueError('Invalid workspace object name: {}.'.format( params[self.PARAM_IN_CS_NAME])) if 'dna_source' in params: dna_src = params.get('dna_source') if dna_src == 'bacteria': params['limit_jump_coverage'] = 60 params['cgwErrorRate'] = 0.25 else: params['limit_jump_coverage'] = 300 params['cgwErrorRate'] = 0.15 if params.get('create_report', None) is None: params['create_report'] = 0 return params def construct_masurca_assembler_cfg(self, params): # STEP 1: get the working folder housing the config.txt file and the masurca results wsname = params[self.PARAM_IN_WS] config_file_path = os.path.join(self.proj_dir, 'config.txt') # STEP 2.1: retrieve the reads data from input parameter pe_reads_data = self._get_pereads_info(params) jp_reads_data = [] if params.get(self.PARAM_IN_JUMP_LIBS, None): 
jp_reads_data = self._get_jpreads_info(params) if 'jp_mean' not in params or type(params['jp_mean']) != int: params['jp_mean'] = 3600 if 'jp_stdev' not in params or type(params['jp_stdev']) != int: params['jp_stdev'] = 200 # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa; assbl_types = [ 'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet' ] reads_types = [ 'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary', 'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary' ] pb_reads_file = '' if params.get('pacbio_reads', None): pb_ref = params['pacbio_reads'] if self._check_ref_type(pb_ref, assbl_types): pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get( 'path', '') else: if self._check_ref_type(pb_ref, reads_types): pb_rd = self._get_kbreads_info(wsname, [pb_ref]) pb_reads_file = pb_rd[0]['fwd_file'] if pb_rd[0].get('rev_file', None): pb_reads_file += ' ' + pb_rd[0]['rev_file'] # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied # as NANOPORE=reads.fa np_reads_file = '' if params.get('nanopore_reads', None): np_ref = params['nanopore_reads'] if self._check_ref_type(np_ref, assbl_types): np_reads_file = (self._get_fasta_from_assembly(np_ref)).get( 'path', '') else: if self._check_ref_type(np_ref, reads_types): np_rd = self._get_kbreads_info(wsname, [np_ref]) np_reads_file = np_rd[0]['fwd_file'] if np_rd[0].get('rev_file', None): np_reads_file += ' ' + np_rd[0]['rev_file'] # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first # converted into Celera Assembler compatible .frg files # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg other_frg = '' if params.get('other_frg_file', None): other_frg = params['other_frg_file'] # STEP 3: construct and save the config.txt file for running masurca try: # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file data_str = 
self._get_data_portion(pe_reads_data, jp_reads_data, pb_reads_file, np_reads_file, other_frg) if data_str == '': # no reads libraries are specified, no further actions return '' config_template = '' with codecs.open(os.path.join(os.path.dirname(__file__), 'config_template.txt'), mode='r', encoding='utf-8') as config_template_file: config_template = config_template_file.read() begin_patn1 = "DATA\n" end_patn1 = "END\nPARAMETERS\n" config_with_data = self._replaceSectionText( config_template, begin_patn1, end_patn1, data_str) # log("\n***After DATA section replacement:\n{}\nSaved at {}".format( # config_with_data.encode('utf-8').decode('utf-8'), config_file_path)) with codecs.open(config_file_path, mode='w', encoding='utf-8') as config_file: config_file.write(config_with_data) # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above param_str = self._get_parameters_portion(params) if param_str == '': # no parameters are specified, no further actions return '' previous_config = '' with codecs.open(config_file_path, mode='r', encoding='utf-8') as previous_config_file: previous_config = previous_config_file.read() begin_patn2 = "PARAMETERS\n" end_patn2 = "END\n" final_config = self._replaceSectionText(previous_config, begin_patn2, end_patn2, param_str) log("\n***Configuration file content:\n{}\nSaved at {}".format( final_config.encode('utf-8').decode('utf-8'), config_file_path)) with codecs.open(config_file_path, mode='w', encoding='utf-8') as config_file: config_file.write(final_config) except IOError as ioerr: log('Creation of the config.txt file raised error:\n') pprint(ioerr) return '' else: return config_file_path def generate_assemble_script(self, config_file): if os.path.isfile(config_file): f_dir, f_nm = os.path.split(config_file) m_cmd = [self.MaSuRCA_BIN] m_cmd.append(config_file) try: self.prog_runner.run(m_cmd, f_dir) assemble_file = os.path.join(f_dir, 'assemble.sh') log('Created the assemble.sh file at {}.\n'.format( 
assemble_file)) return assemble_file except ValueError as ve: log('Error generating assemble.sh file: \n{}'.format(ve)) raise ValueError('Failed to generate assemble.sh file!') else: log("The config file {} is not found.\n".format(config_file)) log('NO assemble.sh file created.\n') return '' def run_assemble(self, asmbl_file): exit_code = 1 if os.path.isfile(asmbl_file): log("The assemble.sh file exists at {}\n".format(asmbl_file)) f_dir, f_nm = os.path.split(asmbl_file) a_cmd = ['/bin/bash'] a_cmd.append(asmbl_file) log("The working directory is {}\n".format(f_dir)) log("The assembling command is {}\n".format(' '.join(a_cmd))) try: exit_code = self.prog_runner.run(a_cmd, f_dir) except ValueError as ve: log('Error running assemble: \n{}'.format(ve)) else: log("The assemble.sh file {} is not found.".format(asmbl_file)) return exit_code def save_assembly(self, contig_fa, wsname, a_name): if os.path.isfile(contig_fa): log('Uploading FASTA file to Assembly...') self.au.save_assembly_from_fasta({ 'file': { 'path': contig_fa }, 'workspace_name': wsname, 'assembly_name': a_name }) else: log("The contig file {} is not found.".format(contig_fa))
def download_long(self, console, warnings, token, wsname, lib,
                  min_long_read_length):
    """
    Download one long-reads object and return the local path to its file.

    The object named by `lib` is looked up in the workspace. ContigSet and
    Assembly objects are fetched as FASTA through AssemblyUtil and unpacked
    with DataFileUtil; anything else is downloaded through ReadsUtils and
    its forward file is checked for reads shorter than
    `min_long_read_length`.

    :param console: first argument handed to self.log for progress messages
    :param warnings: first argument handed to self.log for warning messages
    :param token: auth token used to build the service clients
    :param wsname: workspace name, prepended to `lib` when `lib` has no '/'
    :param lib: object name or reference of the long-reads library
    :param min_long_read_length: threshold passed to self.filter_short_fastq
    :returns: local path of the downloaded long-reads file
    :raises ValueError: on any failure; the underlying error text is
        appended to the message
    """
    try:
        # object info
        try:
            wsClient = Workspace(self.workspaceURL, token=token)
        except Exception as e:
            raise ValueError("unable to instantiate wsClient. " + str(e))

        # positions of the fields inside a workspace object_info tuple
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)

        obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
        lib_obj_info = wsClient.get_object_info_new(
            {'objects': [obj_id]})[0]

        # remove trailing version (e.g. "-2.1") from the type name
        lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                              lib_obj_info[TYPE_I])
        lib_ref = '/'.join(
            str(lib_obj_info[i]) for i in (WSID_I, OBJID_I, VERSION_I))

        if lib_obj_type in ('KBaseGenomes.ContigSet',
                            'KBaseGenomeAnnotations.Assembly'):
            # download using assembly util / data file util
            self.log(console,
                     "Getting long reads (from contigs object).\n")
            auClient = AssemblyUtil(url=self.callbackURL, token=token)
            dfuClient = DataFileUtil(url=self.callbackURL, token=token)
            # The same name must be used for assignment and use; the
            # previous contigFile/contig_file mismatch raised NameError.
            contig_file = auClient.get_assembly_as_fasta({
                'ref': lib_ref
            }).get('path')
            long_reads_path = dfuClient.unpack_file(
                {'file_path': contig_file})['file_path']
            self.log(
                warnings,
                "Warning:  Long reads are in FASTA format, so short read check was not performed."
            )
        else:
            ruClient = ReadsUtils(url=self.callbackURL, token=token)
            self.log(console,
                     "Getting long reads (from reads library object).\n")
            result = ruClient.download_reads({
                'read_libraries': [lib_ref],
                'interleaved': 'false'
            })
            long_reads_path = result['files'][lib_ref]['files']['fwd']
            [n_reads, n_reads_short] = self.filter_short_fastq(
                console, long_reads_path, min_long_read_length)
            if n_reads_short > 0:
                self.log(
                    warnings,
                    "Warning:  Of " + str(n_reads) + " long reads, " +
                    str(n_reads_short) + " are shorter than " +
                    str(min_long_read_length) +
                    "; consider using the filtlong app to filter out shorter reads."
                )
    except Exception as e:
        raise ValueError('Unable to download long reads\n' + str(e))
    return long_reads_path
def download_short_unpaired(self, console, token, wsname,
                            short_unpaired_libraries):
    """
    Download short unpaired reads and combine them into one FASTQ file.

    Each entry of `short_unpaired_libraries` is resolved to an absolute
    reference. KBaseSets.ReadsSet objects are expanded into their member
    libraries through SetAPI; every other object is used as is. All
    libraries are downloaded through ReadsUtils and their forward files
    are appended (gunzipped when the name ends in '.gz') to a new file
    under self.scratch. Each downloaded forward file is removed after it
    has been appended.

    :param console: first argument handed to self.log for progress messages
    :param token: auth token used to build the service clients
    :param wsname: workspace name, prepended to names that have no '/'
    :param short_unpaired_libraries: iterable of object names/references
    :returns: path of the combined FASTQ file
    :raises ValueError: on any failure; the underlying error text is
        appended to the message
    """
    try:
        self.log(console, "Getting short unpaired reads.\n")
        ruClient = ReadsUtils(url=self.callbackURL, token=token)

        # first, unpack any ReadsSets into the actual SingleEndLibrary
        # references
        reads_refs = []

        # object info
        try:
            wsClient = Workspace(self.workspaceURL, token=token)
        except Exception as e:
            raise ValueError("unable to instantiate wsClient. " + str(e))

        # positions of the fields inside a workspace object_info tuple
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)

        for lib in short_unpaired_libraries:
            try:
                obj_id = {
                    'ref': lib if '/' in lib else (wsname + '/' + lib)
                }
                lib_obj_info = wsClient.get_object_info_new(
                    {'objects': [obj_id]})[0]
                # remove trailing version
                lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                      lib_obj_info[TYPE_I])
                lib_ref = '/'.join(
                    str(lib_obj_info[i])
                    for i in (WSID_I, OBJID_I, VERSION_I))

                if lib_obj_type == 'KBaseSets.ReadsSet':
                    # unpack it
                    try:
                        setAPIClient = SetAPI(url=self.serviceWizardURL,
                                              token=token)
                        self.log(console, 'getting reads set ' + lib_ref)
                        readsSet = setAPIClient.get_reads_set_v1({
                            'ref': lib_ref,
                            'include_item_info': 1
                        })
                    except Exception as e:
                        raise ValueError(
                            'SetAPI FAILURE: Unable to get read library set object: ('
                            + lib_ref + ')\n' + str(e))
                    for readsLibrary in readsSet['data']['items']:
                        reads_refs.append(readsLibrary['ref'])
                else:
                    # use other reads objects "as is"
                    reads_refs.append(lib_ref)
            except Exception as e:
                raise ValueError('Unable to get read library object: (' +
                                 str(lib) + ')' + str(e))

        result = ruClient.download_reads({
            'read_libraries': reads_refs,
            'interleaved': 'false'
        })

        # combine outputs
        short_unpaired_path = os.path.join(
            self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq")

        self.log(console, "Combining short unpaired reads.\n")

        for reads_ref in reads_refs:
            files = result['files'][reads_ref]['files']
            if 'fwd' not in files:
                raise ValueError('File ' + reads_ref +
                                 ' missing forward reads file')

            path = files['fwd']
            if path.endswith('.gz'):
                argv = ['gzip', '-dc', path]
            else:
                argv = ['cat', path]
            # Shell-style rendering kept only for the log and error text.
            cmd = ' '.join(argv) + ' >> ' + short_unpaired_path
            self.log(console, "command: " + cmd)

            # Run without a shell so odd characters in a path cannot be
            # interpreted, and append straight to the combined file.
            # communicate() drains stderr, avoiding the pipe-buffer
            # deadlock that wait() has when a PIPE is attached.
            with open(short_unpaired_path, 'ab') as combined:
                cmdProcess = subprocess.Popen(argv,
                                              stdout=combined,
                                              stderr=subprocess.PIPE)
                cmdProcess.communicate()

            if cmdProcess.returncode != 0:
                raise ValueError('Error running ' + cmd)

            os.remove(path)

    except Exception as e:
        raise ValueError('Unable to download short unpaired reads\n' +
                         str(e))
    return short_unpaired_path
def run_SPAdes(self, ctx, params):
    """
    Run SPAdes on paired end libraries
    :param params: instance of type "SPAdesParams" (Input parameters for
       running SPAdes. workspace_name - the name of the workspace from
       which to take input and store output. output_contigset_name - the
       name of the output contigset read_libraries - a list of Illumina
       PairedEndLibrary files in FASTQ or BAM format. dna_source -
       (optional) the source of the DNA used for sequencing
       'single_cell': DNA amplified from a single cell via MDA anything
       else: Standard DNA sample from multiple cells. Default value is
       None. min_contig_length - (optional) integer to filter out contigs
       with length < min_contig_length from the SPAdes output. Default
       value is 0 implying no filter. kmer_sizes - (optional) K-mer
       sizes, Default values: 33, 55, 77, 99, 127 (all values must be
       odd, less than 128 and listed in ascending order) In the absence
       of these values, K values are automatically selected.
       skip_error_correction - (optional) Assembly only (No error
       correction). By default this is disabled.) -> structure: parameter
       "workspace_name" of String, parameter "output_contigset_name" of
       String, parameter "read_libraries" of list of type
       "paired_end_lib" (The workspace object name of a PairedEndLibrary
       file, whether of the KBaseAssembly or KBaseFile type.), parameter
       "dna_source" of String, parameter "min_contig_length" of Long,
       parameter "kmer_sizes" of list of Long, parameter
       "skip_error_correction" of type "bool" (A boolean. 0 = false,
       anything else = true.)
    :returns: instance of type "SPAdesOutput" (Output parameters for
       SPAdes run. report_name - the name of the KBaseReport.Report
       workspace object. report_ref - the workspace reference of the
       report.) -> structure: parameter "report_name" of String,
       parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_SPAdes

    # A whole lot of this is adapted or outright copied from
    # https://github.com/msneddon/MEGAHIT
    self.log('Running run_SPAdes with params:\n' + pformat(params))

    token = ctx['token']

    # the reads should really be specified as a list of absolute ws refs
    # but the narrative doesn't do that yet
    self.process_params(params)

    # get absolute refs from ws
    wsname = params[self.PARAM_IN_WS]
    obj_ids = [{'ref': r if '/' in r else (wsname + '/' + r)}
               for r in params[self.PARAM_IN_LIB]]
    ws = Workspace(self.workspaceURL, token=token)
    ws_info = ws.get_object_info_new({'objects': obj_ids})

    # Map each requested ref to "<workspace name>/<object name>" for
    # use in messages.
    reads_params = []
    reftoname = {}
    for wsi, oid in zip(ws_info, obj_ids):
        ref = oid['ref']
        reads_params.append(ref)
        reftoname[ref] = wsi[7] + '/' + wsi[1]

    readcli = ReadsUtils(self.callbackURL, token=token)

    # Marker text used to recognise an unsupported-type error coming
    # back from the reads download call.
    typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
               'KBaseFile.PairedEndLibrary ' +
               'KBaseAssembly.SingleEndLibrary ' +
               'KBaseAssembly.PairedEndLibrary')
    try:
        reads = readcli.download_reads({'read_libraries': reads_params,
                                        'interleaved': 'false',
                                        'gzipped': None
                                        })['files']
    except ServerError as se:
        self.log('logging stacktrace from dynamic client error')
        self.log(se.data)
        if typeerr in se.message:
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')
        else:
            raise

    self.log('Got reads data from converter:\n' + pformat(reads))

    phred_type = self.check_reads(params, reads, reftoname)

    reads_data = []
    for ref in reads:
        reads_name = reftoname[ref]
        f = reads[ref]['files']
        seq_tech = reads[ref]["sequencing_tech"]
        if f['type'] == 'interleaved':
            reads_data.append({'fwd_file': f['fwd'], 'type': 'paired',
                               'seq_tech': seq_tech})
        elif f['type'] == 'paired':
            reads_data.append({'fwd_file': f['fwd'], 'rev_file': f['rev'],
                               'type': 'paired', 'seq_tech': seq_tech})
        elif f['type'] == 'single':
            reads_data.append({'fwd_file': f['fwd'], 'type': 'single',
                               'seq_tech': seq_tech})
        else:
            raise ValueError('Something is very wrong with read lib ' +
                             reads_name)

    # A missing, None or empty k-mer list leaves the selection to SPAdes.
    kmer_sizes = None
    requested_kmers = params.get(self.PARAM_IN_KMER_SIZES)
    if requested_kmers is not None and len(requested_kmers) > 0:
        kmer_sizes = ",".join(str(num) for num in requested_kmers)

    skip_error_correction = 0
    if params.get(self.PARAM_IN_SKIP_ERR_CORRECT) == 1:
        skip_error_correction = 1

    # dna_source is documented as optional, so read it without assuming
    # the key is present.
    spades_out = self.exec_spades(params.get(self.PARAM_IN_DNA_SOURCE),
                                  reads_data,
                                  phred_type,
                                  kmer_sizes,
                                  skip_error_correction)

    self.log('SPAdes output dir: ' + spades_out)

    # parse the output and save back to KBase
    output_contigs = os.path.join(spades_out, 'scaffolds.fasta')

    self.log('Uploading FASTA file to Assembly')

    assemblyUtil = AssemblyUtil(self.callbackURL, token=token,
                                service_ver='release')

    # An explicit None must behave like "no filter"; comparing None > 0
    # raises TypeError on Python 3.
    min_contig_length = params.get('min_contig_length') or 0

    save_params = {'file': {'path': output_contigs},
                   'workspace_name': wsname,
                   'assembly_name': params[self.PARAM_IN_CS_NAME]}
    report_source = output_contigs
    if min_contig_length > 0:
        save_params['min_contig_length'] = min_contig_length
        # load report from scaffolds.fasta.filtered.fa
        report_source = output_contigs + '.filtered.fa'

    assemblyUtil.save_assembly_from_fasta(save_params)
    report_name, report_ref = self.load_report(
        report_source, params, wsname)

    output = {'report_name': report_name,
              'report_ref': report_ref
              }
    #END run_SPAdes

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_SPAdes return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def export_genome_as_genbank(self, ctx, params):
    """
    :param params: instance of type "ExportParams" (input and output
       structure functions for standard downloaders) -> structure:
       parameter "input_ref" of String
    :returns: instance of type "ExportOutput" -> structure: parameter
       "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_genome_as_genbank
    print('export_genome_as_genbank -- paramaters = ')
    # actually show the parameters the banner above announces
    pprint(params)

    # validate parameters
    if 'input_ref' not in params:
        raise ValueError(
            'Cannot run export_genome_as_genbank- no "input_ref" field defined.'
        )

    # get WS metadata to get ws_name and obj_name
    ws = Workspace(url=self.cfg.workspaceURL)
    info = ws.get_object_info_new({
        'objects': [{
            'ref': params['input_ref']
        }],
        'includeMetadata': 0,
        'ignoreErrors': 0
    })[0]

    genome_to_genbank_params = {'genome_ref': params['input_ref']}

    # export to file (building from KBase Genome Object)
    result = self.genome_to_genbank(
        ctx, genome_to_genbank_params)[0]['genbank_file']

    # create the output directory and move the file there; the directory
    # is named after the object (info[1]) and may be left over from an
    # earlier export, so only create it when it is missing
    export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
    if not os.path.isdir(export_package_dir):
        os.makedirs(export_package_dir)
    shutil.move(
        result['file_path'],
        os.path.join(export_package_dir,
                     os.path.basename(result['file_path'])))

    # export original uploaded GenBank file if it existed.
    exporter = GenomeToGenbank(self.cfg)
    original_result_full = exporter.export_original_genbank(
        ctx, genome_to_genbank_params)
    if original_result_full is not None:
        original_result = original_result_full['genbank_file']
        shutil.move(
            original_result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(original_result['file_path'])))

    # Make warning file about genes only.
    warning_filename = "README.txt"
    with open(os.path.join(export_package_dir, warning_filename),
              'w') as temp_file:
        temp_file.write(
            'This directory includes the KBase-derived GenBank file and also '
            + '(if you originally uploaded the genome from an annotated ' +
            'GenBank file) the original GenBank input.')

    # package it up and be done
    dfUtil = DataFileUtil(self.cfg.callbackURL)
    package_details = dfUtil.package_for_download({
        'file_path': export_package_dir,
        'ws_refs': [params['input_ref']]
    })

    output = {'shock_id': package_details['shock_id']}

    print('export complete -- result = ')
    pprint(output)
    #END export_genome_as_genbank

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_genome_as_genbank return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]