def setUpClass(cls):
    """Build shared class-level fixtures: service config, an authenticated
    context, a fresh test workspace, and two test Assembly objects.

    Fixes: dropped the unused locals `ret` (create_workspace result) and
    `ws_obj_name`, which were assigned but never read.
    """
    token = environ.get('KB_AUTH_TOKEN', None)
    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('SetAPI'):
        cls.cfg[nameval[0]] = nameval[1]
    authServiceUrl = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    auth_client = _KBaseAuth(authServiceUrl)
    user_id = auth_client.get_user(token)
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'SetAPI',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []}],
                    'authenticated': 1})
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = SetAPI(cls.cfg)
    # setup data at the class level for now (so that the code is run
    # once for all tests, not before each test case. Not sure how to
    # do that outside this function..)
    # Timestamp suffix keeps workspace names unique across test runs.
    suffix = int(time.time() * 1000)
    wsName = "test_SetAPI_" + str(suffix)
    cls.wsClient.create_workspace({'workspace': wsName})
    cls.wsName = wsName
    # copy test file to scratch area
    fna_filename = "seq.fna"
    fna_path = os.path.join(cls.cfg['scratch'], fna_filename)
    shutil.copy(os.path.join("data", fna_filename), fna_path)
    ru = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    # Save the same FASTA twice under two names so tests have two refs.
    cls.assembly1ref = ru.save_assembly_from_fasta(
        {'file': {'path': fna_path},
         'workspace_name': wsName,
         'assembly_name': 'assembly_obj_1'})
    cls.assembly2ref = ru.save_assembly_from_fasta(
        {'file': {'path': fna_path},
         'workspace_name': wsName,
         'assembly_name': 'assembly_obj_2'})
def test_filter_contigs_by_length_01(self):
    """Upload two test assemblies and run run_filter_contigs_by_length on them.

    Fixes: removed the unused locals `method` and `base_output_name`
    (computed but never used — params hard-codes 'test_filtered') and the
    redundant trailing `pass`.
    """
    print("\n\nRUNNING: test_filter_contigs_by_length_01()")
    print("===========================================\n\n")

    # upload test data
    try:
        auClient = AssemblyUtil(self.callback_url,
                                token=self.getContext()['token'])
    except Exception as e:
        raise ValueError(
            'Unable to instantiate auClient with callbackURL: ' +
            self.callback_url + ' ERROR: ' + str(e))
    ass_file_1 = 'assembly_1.fa'
    ass_file_2 = 'assembly_2.fa'
    ass_path_1 = os.path.join(self.scratch, ass_file_1)
    ass_path_2 = os.path.join(self.scratch, ass_file_2)
    shutil.copy(os.path.join("data", ass_file_1), ass_path_1)
    shutil.copy(os.path.join("data", ass_file_2), ass_path_2)
    ass_ref_1 = auClient.save_assembly_from_fasta({
        'file': {'path': ass_path_1},
        'workspace_name': self.getWsName(),
        'assembly_name': 'assembly_1'
    })
    ass_ref_2 = auClient.save_assembly_from_fasta({
        'file': {'path': ass_path_2},
        'workspace_name': self.getWsName(),
        'assembly_name': 'assembly_2'
    })

    # run method
    input_refs = [ass_ref_1, ass_ref_2]
    params = {
        'workspace_name': self.getWsName(),
        'input_assembly_refs': input_refs,
        'min_contig_length': 1000,
        'output_name': 'test_filtered'
    }
    result = self.getImpl().run_filter_contigs_by_length(
        self.getContext(), params)
    print('RESULT:')
    pprint(result)
def save_assembly(self, wsname, output_contigs, token, name, console):
    """Upload a FASTA file as an Assembly object and return its reference.

    Fix: the reference returned by save_assembly_from_fasta was previously
    discarded, so callers could not locate the saved object. Returning it is
    backward-compatible (the function previously returned None, which no
    caller could have used).

    :param wsname: target workspace name
    :param output_contigs: path to the FASTA file to upload
    :param token: auth token passed through to AssemblyUtil
    :param name: name for the new Assembly object
    :param console: log accumulator passed to self.log
    :returns: the new Assembly's workspace reference
    """
    self.log(console, 'Uploading FASTA file to Assembly')
    assemblyUtil = AssemblyUtil(self.callbackURL, token=token,
                                service_ver='dev')
    return assemblyUtil.save_assembly_from_fasta({
        'file': {'path': output_contigs},
        'workspace_name': wsname,
        'assembly_name': name
    })
def getBogusAssembly(self):
    """Generate a synthetic FASTA with 30001 identical contigs, upload it as
    an Assembly, cache the resulting reference on self, and return it."""
    # Create a fake assembly with lots of contigs
    fake_fasta_name = "bogus.fna"  # "AP009048.fna"
    fake_fasta_path = os.path.join("/kb/module/work/tmp", fake_fasta_name)
    with open(fake_fasta_path, "w") as out:
        for contig_idx in range(1, 30002):
            out.write("> contig_%d\n" % contig_idx)
            out.write(
                "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n"
            )
    uploader = AssemblyUtil(os.environ["SDK_CALLBACK_URL"],
                            token=self.getContext()["token"])
    uploaded_ref = uploader.save_assembly_from_fasta({
        "file": {"path": fake_fasta_path},
        "workspace_name": self.getWsName(),
        "assembly_name": "Assembly.2"
    })
    self.assembly_ref = uploaded_ref
    print("Uploaded bogus assembly " + str(uploaded_ref))
    return uploaded_ref
def load_test_genome_direct(self, filename, assembly_filename, obj_name):
    """Save an Assembly from FASTA, splice its ref into a genome JSON file,
    save that genome to the workspace, and return the genome reference."""
    assembly_ref = AssemblyUtil(
        os.environ['SDK_CALLBACK_URL']).save_assembly_from_fasta({
            'workspace_name': self.getWsName(),
            'assembly_name': obj_name + '.assembly',
            'file': {'path': assembly_filename}
        })
    pprint('created test assembly: ' + assembly_ref)

    with open(filename, 'r') as genome_file:
        genome_data = json.loads(genome_file.read())
    genome_data['assembly_ref'] = assembly_ref

    # save to ws
    saved = self.ws.save_objects({
        'workspace': self.getWsName(),
        'objects': [{
            'type': 'KBaseGenomes.Genome',
            'data': genome_data,
            'name': obj_name + '.genome'
        }]
    })
    obj_info = saved[0]
    # ws_id/obj_id/version reference of the saved genome
    ref = str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])
    print('created test genome: ' + ref + ' from file ' + filename)
    return ref
def test_annotate_contigs(self): assembly_file_name = "small.fna" #"AP009048.fna" assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name) assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name) shutil.copy(assembly_test_file, assembly_temp_file) assembly_name = 'Assembly.1' au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token']) assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 'workspace_name': self.getWsName(), 'assembly_name': assembly_name}) genome_name = "Genome.1" result = self.getImpl().annotate_contigs(self.getContext(), {'assembly_ref': assembly_ref, 'output_workspace': self.getWsName(), 'output_genome_name': genome_name, 'evalue': None, 'fast': 0, 'gcode': None, 'genus': '', 'kingdom': 'Bacteria', 'metagenome': 0, 'mincontiglen': 1, 'norrna': 0, 'notrna': 0, 'rawproduct': 0, 'rfam': 1, 'scientific_name': 'Super : diper - name;' })[0] rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data'] self.assertTrue('text_message' in rep) print("Report:\n" + str(rep['text_message']))
def test_annotate_contigs(self):
    """Annotate an assembly addressed through a genome ref path
    ("genome_ref;assembly_ref"), then verify that every annotated feature's
    stored DNA sequence matches the sequence extracted from the assembly."""
    assembly_file_name = "small.fna"  # "AP009048.fna"
    assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name)
    assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
    shutil.copy(assembly_test_file, assembly_temp_file)
    assembly_name = 'Assembly.1'
    au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file},
                                                'workspace_name': self.getWsName(),
                                                'assembly_name': assembly_name})
    # Add a genome to the WS to test ref_paths
    genome_name = "Genome.1"
    # Minimal genome skeleton: empty feature lists, zeroed stats, pointing
    # at the assembly saved above.
    genome = {'id': 'Unknown', 'features': [],
              'scientific_name': "",
              'domain': "", 'genetic_code': 0,
              'assembly_ref': assembly_ref,
              'cdss': [], 'mrnas': [],
              'source': 'Magic!',
              'gc_content': 0, 'dna_size': 0,
              'reference_annotation': 0}
    prov = self.getContext().provenance()
    ga = GenomeAnnotationAPI(os.environ['SDK_CALLBACK_URL'])
    info = ga.save_one_genome_v1({'workspace': self.getWsName(),
                                  'name': genome_name,
                                  'data': genome,
                                  'provenance': prov})['info']
    # ws_id/obj_id/version reference of the saved genome
    genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    # Pass "genome_ref;assembly_ref" so annotate_contigs must resolve the
    # assembly through the genome (the ref-path case under test).
    result = self.getImpl().annotate_contigs(self.getContext(),
                                             {'assembly_ref': "{};{}".format(genome_ref, assembly_ref),
                                              'output_workspace': self.getWsName(),
                                              'output_genome_name': genome_name,
                                              'evalue': None,
                                              'fast': 0,
                                              'gcode': 0,
                                              'genus': 'genus',
                                              'kingdom': 'Bacteria',
                                              'metagenome': 0,
                                              'mincontiglen': 1,
                                              'norrna': 0,
                                              'notrna': 0,
                                              'rawproduct': 0,
                                              'rfam': 1,
                                              'scientific_name': 'Super : diper - name;'
                                              })[0]
    rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data']
    self.assertTrue('text_message' in rep)
    print("Report:\n" + str(rep['text_message']))
    # Re-fetch the annotated genome by name and cross-check feature DNA
    # against sequences pulled straight from the assembly.
    genome_ref = self.getWsName() + "/" + genome_name
    genome = self.getWsClient().get_objects([{'ref': genome_ref}])[0]['data']
    features_to_work = {}
    for feature in genome['features']:
        features_to_work[feature['id']] = feature['location']
    aseq = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'],
                               token=self.getContext()['token'])
    dna_sequences = aseq.get_dna_sequences({'requested_features': features_to_work,
                                            'assembly_ref': genome['assembly_ref']})['dna_sequences']
    # Count mismatches instead of asserting per-feature so the failure
    # message reports how many features disagreed.
    bad_dnas = 0
    for feature in genome['features']:
        if feature['dna_sequence'] != dna_sequences[feature['id']]:
            bad_dnas += 1
    self.assertEqual(bad_dnas, 0)
def load_fasta_file(self, filename, obj_name, contents):
    """Write `contents` to `filename` and save the file as an Assembly
    object named `obj_name`; return the new Assembly's reference.

    Fix: the file handle was opened/closed manually; a `with` block now
    guarantees it is closed even if write() raises.
    """
    with open(filename, 'w') as f:
        f.write(contents)
    assemblyUtil = AssemblyUtil(self.callback_url)
    assembly_ref = assemblyUtil.save_assembly_from_fasta(
        {'file': {'path': filename},
         'workspace_name': self.getWsName(),
         'assembly_name': obj_name
         })
    return assembly_ref
def load_fasta_file(self, path, name):
    """Upload the FASTA at `path` as an Assembly named `name` and return
    the resulting workspace reference."""
    upload_params = {
        'file': {'path': path},
        'workspace_name': self.getWsName(),
        'assembly_name': name
    }
    return AssemblyUtil(self.callback_url).save_assembly_from_fasta(upload_params)
def loadAssembly(self):
    """Upload data/test.fna once per test class; memoize the Assembly ref
    on the class so repeat calls skip the upload."""
    cls = self.__class__
    if hasattr(cls, 'assembly_ref'):
        return cls.assembly_ref
    staged_fasta = os.path.join(self.scratch, 'test.fna')
    shutil.copy(os.path.join('data', 'test.fna'), staged_fasta)
    cls.assembly_ref = AssemblyUtil(self.callback_url).save_assembly_from_fasta(
        {'file': {'path': staged_fasta},
         'workspace_name': self.getWsName(),
         'assembly_name': 'test_assembly'
         })
    return cls.assembly_ref
def get_fasta_file(self, filename, obj_name):
    """Save the FASTA file at `filename` as an Assembly object named
    `obj_name` and return its workspace reference."""
    save_params = {
        'file': {'path': filename},
        'workspace_name': self.getWsName(),
        'assembly_name': obj_name
    }
    return AssemblyUtil(self.callback_url).save_assembly_from_fasta(save_params)
def loadFasta2Assembly(self, filename):
    """Copy `filename` from ../testReads into scratch and save it as an
    Assembly named after the file's stem; return the Assembly ref."""
    stem = os.path.splitext(filename)[0]
    staged_path = os.path.join(self.scratch, filename)
    shutil.copy(os.path.join('../testReads', filename), staged_path)
    return AssemblyUtil(self.callback_url).save_assembly_from_fasta({
        'file': {'path': staged_path},
        'workspace_name': self.getWsName(),
        'assembly_name': stem
    })
def get_genome_ref(self, ws_name, tf='ecoliMG1655.fa'):
    """Upload data/<tf> as an Assembly once per test class, memoizing the
    save result on the class as `genomeInfo`; return that memoized value."""
    cls = self.__class__
    if hasattr(cls, 'genomeInfo'):
        return cls.genomeInfo
    staged = os.path.join(self.scratch, tf)
    self.genome_path = staged
    shutil.copy('data/' + tf, staged)
    uploader = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
    # Object name is the filename with the '.fa' suffix stripped.
    cls.genomeInfo = uploader.save_assembly_from_fasta({
        'file': {'path': staged},
        'workspace_name': ws_name,
        'assembly_name': tf.split('.fa')[0]
    })
    return cls.genomeInfo
def load_fasta_file(self, filename, obj_name, contents):
    """Write `contents` to `filename` and save it as an Assembly object
    named `obj_name`; return the new Assembly's reference.

    Fix: the file handle was opened/closed manually; a `with` block now
    guarantees it is closed even if write() raises.
    """
    # TODO make this use the data folder (not sure of relative path)
    with open(filename, 'w') as f:
        f.write(contents)
    assemblyUtil = AssemblyUtil(self.callback_url)
    # TODO why does this next line take forevverr
    assembly_ref = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filename},
        'workspace_name': self.getWsName(),
        'assembly_name': obj_name
    })
    return assembly_ref
def load_fasta_file(callback_url, ws_name, filename, obj_name, contents):
    """
    Loads the given FASTA file into a workspace as an Assembly object.

    Writes `contents` to `filename`, uploads it via AssemblyUtil at
    `callback_url`, and returns the new Assembly's workspace reference.

    Fix: the file handle was opened/closed manually; a `with` block now
    guarantees it is closed even if write() raises.
    """
    with open(filename, 'w') as f:
        f.write(contents)
    assembly_util = AssemblyUtil(callback_url)
    assembly_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filename},
        'workspace_name': ws_name,
        'assembly_name': obj_name
    })
    return assembly_ref
def test_annotate_contigs(self): assembly_file_name = "small.fna" #"AP009048.fna" assembly_test_file = os.path.join("/kb/module/test/data", assembly_file_name) assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name) shutil.copy(assembly_test_file, assembly_temp_file) assembly_name = 'Assembly.1' au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token']) assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 'workspace_name': self.getWsName(), 'assembly_name': assembly_name}) genome_name = "Genome.1" result = self.getImpl().annotate_contigs(self.getContext(), {'assembly_ref': assembly_ref, 'output_workspace': self.getWsName(), 'output_genome_name': genome_name, 'evalue': None, 'fast': 0, 'gcode': 0, 'genus': 'genus', 'kingdom': 'Bacteria', 'metagenome': 0, 'mincontiglen': 1, 'norrna': 0, 'notrna': 0, 'rawproduct': 0, 'rfam': 1, 'scientific_name': 'Super : diper - name;' })[0] rep = self.getWsClient().get_objects([{'ref': result['report_ref']}])[0]['data'] self.assertTrue('text_message' in rep) print("Report:\n" + str(rep['text_message'])) genome_ref = self.getWsName() + "/" + genome_name genome = self.getWsClient().get_objects([{'ref': genome_ref}])[0]['data'] features_to_work = {} for feature in genome['features']: features_to_work[feature['id']] = feature['location'] aseq = AssemblySequenceAPI(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token']) dna_sequences = aseq.get_dna_sequences({'requested_features': features_to_work, 'assembly_ref': genome['assembly_ref']})['dna_sequences'] bad_dnas = 0 for feature in genome['features']: if feature['dna_sequence'] != dna_sequences[feature['id']]: bad_dnas += 1 self.assertEqual(bad_dnas, 0)
def loadAssembly(self):
    """Upload data/bt_test_data/test_ref.fa once per test class; memoize
    the resulting Assembly ref on the class and return it."""
    cls = self.__class__
    if hasattr(cls, 'assembly_ref'):
        return cls.assembly_ref
    # return '23735/1/1'
    staged_fasta = os.path.join(self.scratch, 'test_ref.fa')
    shutil.copy(os.path.join('data', 'bt_test_data', 'test_ref.fa'), staged_fasta)
    new_ref = AssemblyUtil(self.callback_url).save_assembly_from_fasta({
        'file': {'path': staged_fasta},
        'workspace_name': self.getWsName(),
        'assembly_name': 'test_assembly'
    })
    cls.assembly_ref = new_ref
    print('Loaded Assembly: ' + new_ref)
    return new_ref
def loadAssembly(self):
    """Upload the Arabidopsis test FASTA once per test class; memoize the
    resulting Assembly ref on the class and return it."""
    cls = self.__class__
    if hasattr(cls, 'assembly_ref'):
        return cls.assembly_ref
    staged_fasta = os.path.join(self.scratch, 'star_test_assembly.fa')
    #shutil.copy(os.path.join('../work/testReads', 'test_reference.fa'), fasta_path)
    shutil.copy(
        os.path.join('../work/testReads',
                     'Arabidopsis_thaliana.TAIR10.dna.toplevel.fa'),
        staged_fasta)
    new_ref = AssemblyUtil(self.callback_url).save_assembly_from_fasta({
        'file': {'path': staged_fasta},
        'workspace_name': self.getWsName(),
        'assembly_name': 'star_test_assembly'
    })
    cls.assembly_ref = new_ref
    print('Loaded Assembly: ' + new_ref)
    return new_ref
def upload_assembly(self, file_path, workspace_name, assembly_name):
    """
    Upload a single FASTA file to KBase as an Assembly object and return
    the generated UPA (workspace reference).

    (The previous docstring described "a list of file paths"; this function
    handles exactly one path per call.)

    :param file_path: path to an existing FASTA file
    :param workspace_name: target workspace
    :param assembly_name: name for the new Assembly object
    :raises ValueError: if any argument is empty or the file does not exist
    :returns: the new Assembly's UPA
    """
    if not file_path:
        raise ValueError("file_path must be defined")
    if not os.path.exists(file_path):
        raise ValueError("The given assembly file '{}' does not exist".format(file_path))
    if not workspace_name:
        raise ValueError("workspace_name must be defined")
    if not assembly_name:
        raise ValueError("assembly_name must be defined")
    au = AssemblyUtil(self.callback_url)
    assembly_upa = au.save_assembly_from_fasta({
        "file": {"path": file_path},
        "workspace_name": workspace_name,
        "assembly_name": assembly_name
    })
    return assembly_upa
def test_annotate_contigs_too_big(self): """ simulate a metagenome contig file """ # Create a fake assembly with lots of contigs assembly_file_name = "bogus.fna" #"AP009048.fna" assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name) with open(assembly_temp_file, 'w') as f: for i in range(1,30002): f.write('> contig_%d\n' % i) f.write('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n') assembly_name = 'Assembly.2' au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=self.getContext()['token']) assembly_ref = au.save_assembly_from_fasta({'file': {'path': assembly_temp_file}, 'workspace_name': self.getWsName(), 'assembly_name': assembly_name}) genome_name = "Genome.1" # This should fail with an error with self.assertRaises(ValueError): result = self.getImpl().annotate_contigs(self.getContext(), {'assembly_ref': assembly_ref, 'output_workspace': self.getWsName(), 'output_genome_name': genome_name, 'evalue': None, 'fast': 0, 'gcode': 0, 'genus': 'genus', 'kingdom': 'Bacteria', 'metagenome': 0, 'mincontiglen': 1, 'norrna': 0, 'notrna': 0, 'rawproduct': 0, 'rfam': 1, 'scientific_name': 'Super : diper - name;' })
def filter_contigs(self, ctx, params):
    """
    Filter an Assembly's contigs by a minimum length threshold.

    Downloads the input Assembly as FASTA, keeps only contigs of length
    >= min_length, saves the result as a new Assembly (same name), and
    builds a KBaseReport.

    :param params: instance of type "FilterContigsParams" -> structure:
        parameter "assembly_input_ref" of type "assembly_ref" (a KBase ID
        reference to an Assembly data object), parameter "workspace_name"
        of String, parameter "min_length" of Long
    :returns: instance of type "FilterContigsResults" -> structure:
        parameter "report_name" of String, parameter "report_ref" of
        String, parameter "assembly_output" of type "assembly_ref",
        parameter "n_initial_contigs" of Long, parameter
        "n_contigs_removed" of Long, parameter "n_contigs_remaining" of
        Long ('report_name'/'report_ref' let the Narrative render the
        Report automatically)
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN filter_contigs

    # Print statements to stdout/stderr are captured and available as the App log
    print('Starting Filter Contigs function. Params=')
    pprint(params)

    # Step 1 - Parse/examine the parameters and catch any errors
    # It is important to check that parameters exist and are defined, and that nice error
    # messages are returned to users.  Parameter values go through basic validation when
    # defined in a Narrative App, but advanced users or other SDK developers can call
    # this function directly, so validation is still important.
    print('Validating parameters.')
    if 'workspace_name' not in params:
        raise ValueError(
            'Parameter workspace_name is not set in input arguments')
    workspace_name = params['workspace_name']
    if 'assembly_input_ref' not in params:
        raise ValueError(
            'Parameter assembly_input_ref is not set in input arguments')
    assembly_input_ref = params['assembly_input_ref']
    if 'min_length' not in params:
        raise ValueError(
            'Parameter min_length is not set in input arguments')
    min_length_orig = params['min_length']
    min_length = None
    try:
        min_length = int(min_length_orig)
    except ValueError:
        raise ValueError(
            'Cannot parse integer from min_length parameter (' +
            str(min_length_orig) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # Step 2 - Download the input data as a Fasta
    # We can use the AssemblyUtils module to download a FASTA file from our Assembly data
    # object. The return object gives us the path to the file that was created.
    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta(
        {'ref': assembly_input_ref})

    # Step 3 - Actually perform the filter operation, saving the good contigs to a new
    # fasta file. We can use BioPython to parse the Fasta file and build and save the
    # output to a file.
    good_contigs = []
    n_total = 0
    n_remaining = 0
    for record in SeqIO.parse(fasta_file['path'], 'fasta'):
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1
    print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
          str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # Step 4 - Save the new Assembly back to the system
    # (reuses the original assembly's name, so this saves a new version)
    print('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': filtered_fasta_file},
        'workspace_name': workspace_name,
        'assembly_name': fasta_file['assembly_name']
    })

    # Step 5 - Build a Report and return
    reportObj = {
        'objects_created': [{'ref': new_assembly,
                             'description': 'Filtered contigs'}],
        'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                        ' contigs out of ' + str(n_total)
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({
        'report': reportObj,
        'workspace_name': params['workspace_name']
    })

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref'],
              'assembly_output': new_assembly,
              'n_initial_contigs': n_total,
              'n_contigs_removed': n_total - n_remaining,
              'n_contigs_remaining': n_remaining
              }
    print('returning:' + pformat(output))
    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def arast_run(self, ctx, params, assembler, server='http://localhost:8000'):
    """Run an AssemblyRAST (ARAST) job over the given read libraries,
    filter the resulting contigs, save them as an Assembly, and build a
    KBaseReport; returns {'report_name', 'report_ref'}.

    Fixes:
      * `print report` (Python-2-only statement) -> `print(report)`,
        which behaves identically on both Python 2 and 3.
      * `mode` used params['recipe'] (KeyError when the caller omitted
        'recipe') while the command used params.get('recipe', 'auto');
        both now use the same defaulted value.
      * raw string for the regex pattern.
      * guard against ZeroDivisionError when no contigs pass the filter.
      * dropped unused locals (`err`, discarded save_assembly ref).
    """
    output = None
    console = []
    self.log(console, 'Running run_{} with params='.format(assembler))
    self.log(console, pformat(params))

    #### do some basic checks
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'read_library_refs' not in params and 'read_library_names' not in params:
        raise ValueError(
            'read_library_refs or read_library_names parameter is required'
        )
    if 'read_library_refs' in params:
        if type(params['read_library_refs']) != list:
            raise ValueError('read_library_refs must be a list')
    if 'read_library_names' in params:
        if type(params['read_library_names']) != list:
            raise ValueError('read_library_names must be a list')
    if 'output_contigset_name' not in params:
        raise ValueError('output_contigset_name parameter is required')
    min_contig_len = params.get('min_contig_len') or 300

    token = ctx['token']
    os.environ["KB_AUTH_TOKEN"] = token
    os.environ["ARAST_URL"] = server

    # Resolve the read libraries from the workspace.
    ws = workspaceService(self.workspaceURL)
    ws_libs = []
    if 'read_library_refs' in params:
        for lib_ref in params['read_library_refs']:
            ws_libs.append({'ref': lib_ref})
    if 'read_library_names' in params:
        for lib_name in params['read_library_names']:
            ws_libs.append(
                {'ref': params['workspace_name'] + '/' + lib_name})
    if len(ws_libs) == 0:
        raise ValueError(
            'At least one read library must be provided in read_library_refs or read_library_names'
        )
    libs = ws.get_objects2({'objects': ws_libs})['data']
    wsid = libs[0]['info'][6]
    kbase_assembly_input = self.combine_read_libs(libs)
    tmp_data = self.create_temp_json(kbase_assembly_input)

    # Build the ar-run command: explicit assembler > pipeline > recipe.
    mode = ''
    cmd = ['ar-run', '--data-json', tmp_data]
    if assembler:
        cmd = cmd + ['-a', assembler]
        mode = 'assembler: ' + assembler
    elif 'pipeline' in params and params['pipeline']:
        cmd = cmd + ['-p', params['pipeline']]
        mode = 'assembly pipeline: ' + params['pipeline']
    else:
        recipe = params.get('recipe', 'auto')
        cmd = cmd + ['-r', recipe]
        mode = 'assembly recipe: ' + recipe

    logger.info('Start {}'.format(mode))
    logger.debug('CMD: {}'.format(' '.join(cmd)))
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         shell=False)
    out, _ = p.communicate()
    logger.debug(out)
    if p.returncode != 0:
        raise ValueError('Error running ar_run, return code: {}\n'.format(
            p.returncode))

    # ar-run prints the job ID; the first integer in its output is the ID.
    job_id = None
    match = re.search(r'(\d+)', out)
    if match:
        job_id = match.group(1)
    else:
        raise ValueError('No integer job ID found: {}\n'.format(out))

    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
        * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    output_contigs = os.path.join(output_dir, 'contigs.fa')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Wait for the job (-w), capture its log, then pull + length-filter
    # the contigs and fetch the raw assembly report.
    cmd = ['ar-get', '-j', job_id, '-w', '-l']
    logger.debug('CMD: {}'.format(' '.join(cmd)))
    ar_log = subprocess.check_output(cmd)
    self.log(console, ar_log)
    cmdstr = 'ar-get -j {} -w -p | ar-filter -l {} > {}'.format(
        job_id, min_contig_len, output_contigs)
    logger.debug('CMD: {}'.format(cmdstr))
    subprocess.check_call(cmdstr, shell=True)
    cmd = ['ar-get', '-j', job_id, '-w', '-r']
    logger.debug('CMD: {}'.format(' '.join(cmd)))
    ar_report = subprocess.check_output(cmd)
    self.log(console, "\nDONE\n")

    # Save the filtered contigs as an Assembly object.
    client = AssemblyUtil(self.callback_url)
    client.save_assembly_from_fasta({
        'file': {'path': output_contigs},
        'workspace_name': params['workspace_name'],
        'assembly_name': params['output_contigset_name']
    })

    lengths = []
    for seq_record in SeqIO.parse(output_contigs, 'fasta'):
        lengths.append(len(seq_record.seq))

    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    if 'read_library_names' in params:
        provenance[0]['input_ws_objects'] = [
            params['workspace_name'] + '/' + x
            for x in params['read_library_names']
        ]
    elif 'read_library_refs' in params:
        provenance[0]['input_ws_objects'] = list(
            params['read_library_refs'])

    os.remove(tmp_data)
    #shutil.rmtree(output_dir)

    # create a Report
    report = ''
    report += '============= Raw Contigs ============\n' + ar_report + '\n'
    report += '========== Filtered Contigs ==========\n'
    report += 'ContigSet saved to: ' + params[
        'workspace_name'] + '/' + params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    # Guard the average: previously this raised ZeroDivisionError when no
    # contigs passed the length filter.
    avg_len = sum(lengths) / float(len(lengths)) if lengths else 0.0
    report += 'Average Length: ' + str(avg_len) + ' bp.\n'

    # compute a simple contig length distribution
    bins = 10
    counts, edges = np.histogram(lengths, bins)
    report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
    for c in range(bins):
        report += '   ' + str(counts[c]) + '\t--\t' + str(
            edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'
    print(report)

    reportObj = {
        'objects_created': [{
            'ref': params['workspace_name'] + '/' +
                   params['output_contigset_name'],
            'description': 'Assembled contigs'
        }],
        'text_message': report
    }
    reportName = '{}.report.{}'.format(assembler, job_id)
    report_obj_info = ws.save_objects({
        'id': wsid,
        'objects': [{
            'type': 'KBaseReport.Report',
            'data': reportObj,
            'name': reportName,
            'meta': {},
            'hidden': 1,
            'provenance': provenance
        }]
    })[0]
    output = {
        'report_name': reportName,
        'report_ref': str(report_obj_info[6]) + '/' +
                      str(report_obj_info[0]) + '/' +
                      str(report_obj_info[4])
    }

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return output
def filter_contigs(self, ctx, params):
    """
    Filter an Assembly's contigs by a minimum length and produce an
    extended KBaseReport (with sample HTML and file links).

    :param workspace_name: instance of String
    :param params: instance of type "ContigFilterParams" (Input parameters)
        -> structure: parameter "assembly_ref" of String, parameter
        "min_length" of Long
    :returns: instance of type "ContigFilterResults" (Output results) ->
        structure: parameter "report_name" of String, parameter
        "report_ref" of String, parameter "filtered_assembly_ref" of
        String, parameter "n_total" of Long, parameter "n_remaining" of
        Long
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN filter_contigs
    # Validate required parameters before doing any work.
    for name in ['min_length', 'assembly_ref', 'workspace_name']:
        if name not in params:
            raise ValueError('Parameter "' + name +
                             '" is required but missing')
    if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
        raise ValueError('Min length must be a non-negative integer')
    # NOTE(review): `basestring` exists only on Python 2 — confirm this
    # module runs under Python 2, or this line raises NameError on Python 3.
    if not isinstance(params['assembly_ref'], basestring) or not len(params['assembly_ref']):
        raise ValueError('Pass in a valid assembly reference string')
    ws_name = params['workspace_name']
    assembly_util = AssemblyUtil(self.callback_url)
    file = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
    # Parse the downloaded file in FASTA format
    parsed_assembly = SeqIO.parse(file['path'], 'fasta')
    min_length = params['min_length']
    # Keep a list of contigs greater than min_length
    good_contigs = []
    # total contigs regardless of length
    n_total = 0
    # total contigs over the min_length
    n_remaining = 0
    for record in parsed_assembly:
        n_total += 1
        if len(record.seq) >= min_length:
            good_contigs.append(record)
            n_remaining += 1
    # Create a file to hold the filtered data
    filtered_path = os.path.join(self.scratch, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_path, 'fasta')
    # Upload the filtered data to the workspace (reuses the input
    # assembly's name, so this saves a new version of it)
    new_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filtered_path},
        'workspace_name': ws_name,
        'assembly_name': file['assembly_name']
    })
    # Create an output summary message for the report
    text_message = "".join([
        'Filtered assembly to ',
        str(n_remaining),
        ' contigs out of ',
        str(n_total)
    ])
    # Data for creating the report, referencing the assembly we uploaded:
    # a sample HTML page plus a sample text file for the report links.
    html_dir = os.path.join(self.scratch, 'html')
    html_index_path = os.path.join(html_dir, 'index.html')
    file_path = os.path.join(self.scratch, 'myfile.txt')
    with open(file_path, 'w') as f:
        f.write('hello world')
    os.mkdir(html_dir)
    with open(html_index_path, 'w') as f:
        f.write('<p><b>hello world</b></p>')
    print('xyz1', os.listdir(html_dir))
    print('xyz2', os.listdir(self.scratch))
    html_links = [{
        'path': os.path.join(html_dir, 'index.html'),
        'name': 'main.html',
        'description': 'Sample description'
    }]
    file_links = [{
        'path': file_path,
        'name': 'file.txt',
        'description': 'Sample file description'
    }] + html_links
    # Extended report
    report_data = {
        'objects_created': [{'ref': new_ref, 'description': 'Filtered contigs'}],
        'html_links': html_links,
        'file_links': file_links,
        'warnings': ['warning 1', 'warning 2'],
        'report_object_name': 'my_report',
        'direct_html': '<p>Hello</p>',
        'message': text_message,
        'workspace_name': ws_name,
        'direct_html_link_index': 0,
        'html_window_height': 800,
        'summary_window_height': 800
    }
    # # Simple report
    # report_data = {
    #     'report': {
    #         'text_message': 'My simple report text message',
    #         'warnings': ['warning 1', 'warning 2'],
    #         'objects_created': [{'ref': new_ref, 'description': 'filtered contigs'}]
    #     },
    #     'workspace_name': ws_name
    # }
    # Initialize the report
    kbase_report = KBaseReport(self.callback_url)
    report = kbase_report.create_extended_report(report_data)
    # Return the report reference and name in our results
    returnVal = {
        'report_ref': report['ref'],
        'report_name': report['name'],
        'n_total': n_total,
        'n_remaining': n_remaining,
        'filtered_assembly_ref': new_ref
    }
    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def arast_run(self, ctx, params, assembler, server='http://localhost:8000'):
    """Run an AssemblyRAST (arast) job and save the result as an Assembly.

    Submits the requested read libraries to the arast service at `server`,
    waits for the job, filters contigs by minimum length, uploads the
    contigs via AssemblyUtil and saves a KBaseReport.Report object.

    :param ctx: KBase call context; must contain 'token', may contain 'provenance'
    :param params: dict with keys:
        workspace_name (required), output_contigset_name (required),
        read_library_refs and/or read_library_names (at least one; each a list),
        min_contig_len (optional, default 300),
        pipeline / recipe (optional; select the assembly strategy)
    :param assembler: assembler name, or falsy to fall back to pipeline/recipe
    :param server: arast service URL
    :returns: dict with 'report_name' and 'report_ref'
    :raises ValueError: on invalid parameters or a failed arast job
    """
    output = None
    console = []
    self.log(console, 'Running run_{} with params='.format(assembler))
    self.log(console, pformat(params))

    #### do some basic checks
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'read_library_refs' not in params and 'read_library_names' not in params:
        raise ValueError('read_library_refs or read_library_names parameter is required')
    if 'read_library_refs' in params:
        if type(params['read_library_refs']) != list:
            raise ValueError('read_library_refs must be a list')
    if 'read_library_names' in params:
        if type(params['read_library_names']) != list:
            raise ValueError('read_library_names must be a list')
    if 'output_contigset_name' not in params:
        raise ValueError('output_contigset_name parameter is required')
    min_contig_len = params.get('min_contig_len') or 300

    token = ctx['token']
    os.environ["KB_AUTH_TOKEN"] = token
    os.environ["ARAST_URL"] = server

    # resolve every requested library to a workspace reference
    ws = workspaceService(self.workspaceURL)
    ws_libs = []
    if 'read_library_refs' in params:
        for lib_ref in params['read_library_refs']:
            ws_libs.append({'ref': lib_ref})
    if 'read_library_names' in params:
        for lib_name in params['read_library_names']:
            ws_libs.append({'ref': params['workspace_name'] + '/' + lib_name})
    if len(ws_libs) == 0:
        raise ValueError('At least one read library must be provided in read_library_refs or read_library_names')
    libs = ws.get_objects2({'objects': ws_libs})['data']
    wsid = libs[0]['info'][6]
    kbase_assembly_input = self.combine_read_libs(libs)
    tmp_data = self.create_temp_json(kbase_assembly_input)

    # choose the assembly strategy: explicit assembler > pipeline > recipe
    mode = ''
    cmd = ['ar-run', '--data-json', tmp_data]
    if assembler:
        cmd = cmd + ['-a', assembler]
        mode = 'assembler: ' + assembler
    elif 'pipeline' in params and params['pipeline']:
        cmd = cmd + ['-p', params['pipeline']]
        mode = 'assembly pipeline: ' + params['pipeline']
    else:
        # BUGFIX: the mode string previously read params['recipe'] directly,
        # raising KeyError when 'recipe' was omitted even though the command
        # itself defaulted to 'auto'.
        recipe = params.get('recipe', 'auto')
        cmd = cmd + ['-r', recipe]
        mode = 'assembly recipe: ' + recipe

    logger.info('Start {}'.format(mode))
    logger.debug('CMD: {}'.format(' '.join(cmd)))
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT, shell=False)
    out, err = p.communicate()
    logger.debug(out)
    if p.returncode != 0:
        raise ValueError('Error running ar_run, return code: {}\n'.format(p.returncode))

    # ar-run prints the numeric job id; extract it for the follow-up calls
    job_id = None
    match = re.search('(\d+)', out)
    if match:
        job_id = match.group(1)
    else:
        raise ValueError('No integer job ID found: {}\n'.format(out))

    timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    output_contigs = os.path.join(output_dir, 'contigs.fa')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # wait for the job to finish and capture its log
    cmd = ['ar-get', '-j', job_id, '-w', '-l']
    logger.debug('CMD: {}'.format(' '.join(cmd)))
    ar_log = subprocess.check_output(cmd)
    self.log(console, ar_log)

    # pull the contigs, dropping anything shorter than min_contig_len
    cmdstr = 'ar-get -j {} -w -p | ar-filter -l {} > {}'.format(job_id, min_contig_len, output_contigs)
    logger.debug('CMD: {}'.format(cmdstr))
    subprocess.check_call(cmdstr, shell=True)

    # fetch the raw arast report text for inclusion in our report
    cmd = ['ar-get', '-j', job_id, '-w', '-r']
    logger.debug('CMD: {}'.format(' '.join(cmd)))
    ar_report = subprocess.check_output(cmd)
    self.log(console, "\nDONE\n")

    client = AssemblyUtil(self.callback_url)
    assembly_ref = client.save_assembly_from_fasta(
        {'file': {'path': output_contigs},
         'workspace_name': params['workspace_name'],
         'assembly_name': params['output_contigset_name']
         })

    lengths = []
    for seq_record in SeqIO.parse(output_contigs, 'fasta'):
        lengths.append(len(seq_record.seq))

    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    if 'read_library_names' in params:
        provenance[0]['input_ws_objects'] = [params['workspace_name'] + '/' + x
                                             for x in params['read_library_names']]
    elif 'read_library_refs' in params:
        provenance[0]['input_ws_objects'] = [x for x in params['read_library_refs']]
    os.remove(tmp_data)
    #shutil.rmtree(output_dir)

    # create a Report
    report = ''
    report += '============= Raw Contigs ============\n' + ar_report + '\n'
    report += '========== Filtered Contigs ==========\n'
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    if lengths:
        # ROBUSTNESS: guard against ZeroDivisionError / empty-histogram when
        # the length filter removed every contig
        report += 'Average Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += ' ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'
    print(report)

    reportObj = {
        'objects_created': [{'ref': params['workspace_name'] + '/' + params['output_contigset_name'],
                             'description': 'Assembled contigs'}],
        'text_message': report
    }

    reportName = '{}.report.{}'.format(assembler, job_id)
    report_obj_info = ws.save_objects({
        'id': wsid,
        'objects': [
            {
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,
                'provenance': provenance
            }
        ]
    })[0]

    output = {'report_name': reportName,
              'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) +
              '/' + str(report_obj_info[4])
              }

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        # BUGFIX: message previously referred to 'filter_contigs'/'returnVal',
        # a copy-paste error from another method
        raise ValueError('Method arast_run return value ' +
                         'output is not type dict as required.')
    # return the results
    return output
class kb_virsorterTest(unittest.TestCase):
    """Integration tests for the kb_virsorter SDK module.

    Talks to live KBase services (auth, workspace, callback server), so these
    tests only run inside a configured SDK test environment.
    """

    @classmethod
    def setUpClass(cls):
        # Authenticate with the token injected by the SDK test harness and
        # resolve it to a user id via the legacy auth service.
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        # Load the deployment config section for this module into a plain dict.
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        # Single-element lists used as mutable class-level slots for the
        # uploaded test object ref and the ad-hoc test workspace name.
        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        # Best-effort cleanup of both workspaces that may have been created.
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')
        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print e
        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print e

    def getWsClient(self):
        # Accessor for the shared workspace client created in setUpClass.
        return self.__class__.wsClient

    def getWsName(self):
        # Lazily create (and memoize on the class) a uniquely named workspace.
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        # Accessor for the module implementation under test.
        return self.__class__.serviceImpl

    def getContext(self):
        # Accessor for the shared MethodContext built in setUpClass.
        return self.__class__.ctx

    def write_file(self, filename, content):
        # Write `content` to `filename` in the scratch dir; returns the path.
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path

    def delete_shock_node(self, node_id):
        # NOTE(review): 'cls' is undefined inside this instance method, so
        # calling it raises NameError; it presumably should use
        # self.__class__ (or be a classmethod). Also neither cls.token nor
        # cls.shockURL is ever set in setUpClass — confirm before relying on
        # this helper. It is currently only referenced from commented-out code.
        header = {'Authorization': 'Oauth {0}'.format(cls.token)}
        requests.delete(cls.shockURL + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):
        # Prefixed 'ztest' so unittest does not auto-collect it; round-trips a
        # reference-data tarball through shock via DataFileUtil.
        print "upload ref data to shock staging"

        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path = self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'#'Phage_gene_catalog.tar.gz'#''PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/"+input_file_name# os.path.join(tmp_dir, input_file_name)

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print "file_path " + source_file_path+"\t"+target_file_path

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print "Testing "+target_file_path
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock(
            {'file_path': target_file_path})

        print str(ret1)
        shock_id = ret1['shock_id']

        print "shock_id "+shock_id
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)

        #self.delete_shock_node(shock_id)

    def create_random_string(self):
        # 20-char cryptographically random A-Z0-9 string, used as a unique
        # ad-hoc workspace name.
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits)
            for _ in range(N))

    def test_virsorter_ok(self):
        # Happy-path test: upload an assembly, run virsorter on it, and check
        # that the result carries a report name/ref pair.
        self.upload_assembly()

        if not self.testwsname:
            self.testwsname.append(self.create_random_string())

        print "upload_reads self.testwsname[0] " + self.testwsname[0]

        #try:
        #    ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        #except Exception as e:
        #    # print "ERROR"
        #    # print(type(e))
        #    # print(e.args)
        #    print(e)
        #    pass

        print "self.testwsname "+ str(self.testwsname)
        params = {}
        params['assembly_ref'] = str(self.testobjref[0])#str(self.testwsname[0])+"/"+ #"16589/2/1"#""#'16589/2/1'#self.testobjref
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        #testresult = [
        #    {'blah': 'blah', 'bleh': 'bleh'}]
        testresult = [{'report_ref': result[0]['report_ref'],
                       'report_name': result[0]['report_name']}]
        self.assertEqual(sorted(result), sorted(testresult))

    def upload_assembly(self):
        # Upload the E. coli test FASTA once per class run; memoizes the
        # resulting object ref in self.testobjref.
        if not self.testobjref:
            print "upload_assembly start"

            indata = 'U00096.2.fa'#_first1000.
            ftarget = os.path.join(self.cfg['scratch'], indata)#self.scratch, indata)
            print "ftarget " + ftarget
            ret = shutil.copy('../test_data/' + indata, ftarget)

            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())

            print "upload_assembly self.testwsname[0] " + self.testwsname[0]

            try:
                # may already exist from a previous helper call; errors are
                # logged and ignored on purpose
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  #test_ws_name
            except Exception as e:
                #print "ERROR"
                #print(type(e))
                #print(e.args)
                print(e)
                pass

            try:
                print "attempt upload"
                print "ftarget " + ftarget
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {
                        'workspace_name': self.testwsname[0],
                        'assembly_name': 'Ecolik12MG1655',
                        'file': {'path': ftarget}})

                print "upload_assembly"
                print ref

                # assumes the saved object is at version 1 — TODO confirm
                # against the ref returned by save_assembly_from_fasta
                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
                ##print self.testobjdata[0]
            except Exception as e:
                print e
                pass

            print "self.testobjref[0]"
            print self.testobjref
            print self.testobjref[0]
def run_hipmer_hpc(self, ctx, params):
    """
    :param params: instance of type "AssemblyParams" (Run assembler
       workspace_name - the name of the workspace for input/output
       read_library_name - the name of the PE read library (SE library
       support in the future) output_contig_set_name - the name of the
       output contigset extra_params - assembler specific parameters
       min_contig_length - minimum length of contigs to output, default
       200 @optional min_contig_len @optional extra_params) -> structure:
       parameter "workspace_name" of String, parameter
       "read_library_name" of String, parameter "output_contigset_name"
       of String, parameter "min_contig_len" of Long, parameter
       "extra_params" of list of String
    :returns: instance of type "AssemblyOutput" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_hipmer_hpc
    console = []
    self.log(console, 'Running run_hipmer_hpc with params=')
    self.log(console, pformat(params))

    # Validate parameters. This will raise an error if there
    # is a problem.
    self._validate_inputs(params)

    ws_name = params['workspace_name']
    #ws = workspaceService(self.workspaceURL, token=ctx['token'])

    # Two-phase HPC pattern: without POST in the environment this call is
    # the "pre" stage (stage reads, write the submit script, return); with
    # POST set it collects the finished HipMer results.
    if 'POST' not in os.environ:
        # Get the read library
        # (print() used consistently; this method previously mixed py2
        # print statements with print() calls)
        print("Running pre stage")
        refs = []
        for read in params['reads']:
            read_name = read['read_library_name']
            # accept either a full ref ('ws/obj') or a bare object name
            if '/' in read_name:
                ref = read_name
            else:
                ref = ws_name + '/' + read_name
            refs.append(ref)
            read['ref'] = ref
        if not self.check_reads(ctx, refs, console):
            raise ValueError('The reads failed validation\n')
        params['readsfiles'] = self.get_reads_RU(ctx, refs, console)
        self.fixup_reads(params)

        # Generate submit script
        ts = self.generate_config(params)
        self.generate_submit(ts)
        # the pre stage intentionally produces no AssemblyOutput
        return

    print("Running POST stage")

    # run hipmer, capture output as it happens
    self.log(console, 'running hipmer:')
    output_contigs = os.path.join(self.scratch, 'results', 'final_assembly.fa')
    output_name = params['output_contigset_name']

    if not os.path.exists(output_contigs):
        # surface any 'error' lines from the slurm log before failing
        print("It looks like HipMER failed for some reason.")
        print("Show errors in log file")
        logfile = ''
        for fn in os.listdir('.'):
            if fn.startswith('slurm-'):
                logfile = fn
        if logfile != '':
            with open(logfile, 'r') as f:
                for line in f:
                    if line.lower().find('error') >= 0:
                        print(line)
        raise RuntimeError("Error in HipMER execution")

    wsname = params['workspace_name']
    #output_data_ref = self.save_assembly(wsname,
    #                                     output_contigs,
    #                                     ctx['token'],
    #                                     output_name,
    #                                     console)
    self.log(console, 'Uploading FASTA file to Assembly')
    assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'],
                                service_ver='dev')
    save_input = {'file': {'path': output_contigs},
                  'workspace_name': wsname,
                  'assembly_name': output_name
                  }
    output_data_ref = assemblyUtil.save_assembly_from_fasta(save_input)

    # create a Report
    # compute a simple contig length distribution for the report
    lengths = []
    for seq_record in SeqIO.parse(output_contigs, 'fasta'):
        lengths.append(len(seq_record.seq))

    report = ''
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + \
        params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    report += 'Avg Length: ' + str(
        sum(lengths) / float(len(lengths))) + ' bp.\n'

    bins = 10
    counts, edges = np.histogram(lengths, bins)
    report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
    for c in range(bins):
        # BUGFIX: format string was ' \%d\t--\t%d', which printed a literal
        # backslash before every count value
        report += '   %d\t--\t%d' % (counts[c], edges[c])
        report += ' to %d bp\n' % (edges[c + 1])

    print('Running QUAST')
    kbq = kb_quast(self.callbackURL)
    try:
        quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                             'label': params['output_contigset_name']}]})
    except QUASTError as qe:
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from running QUAST')
        print(str(qe))
        # TODO delete shock node
        raise

    print('Saving report')
    kbr = KBaseReport(self.callbackURL)
    try:
        report_info = kbr.create_extended_report(
            {'message': report,
             'objects_created': [{'ref': output_data_ref,
                                  'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}],
             'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
             'workspace_name': params['workspace_name']})
    except _RepError as rep_err:
        # BUGFIX: exception was previously bound to the name 're', shadowing
        # the 're' module.
        # not really any way to test this, all inputs have been checked
        # earlier and should be ok
        print('Logging exception from creating report object')
        print(str(rep_err))
        # TODO delete shock node
        raise

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref']
              }
    # BUGFIX: removed the early 'return [output]' that previously sat here —
    # it made the generated type check below unreachable (same value is
    # returned either way).
    #END run_hipmer_hpc

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_hipmer_hpc return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_megahit(self, ctx, params):
    """
    :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
       parameters here are just passed forward to MEGAHIT.
       workspace_name - the name of the workspace for input/output
       read_library_ref - the name of the PE read library (SE library
       support in the future)
       output_contig_set_name - the name of the output contigset
       megahit_parameter_preset - override a group of parameters; possible
       values:
           meta            '--min-count 2 --k-list 21,41,61,81,99'
                           (generic metagenomes, default)
           meta-sensitive  '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99'
                           (more sensitive but slower)
           meta-large      '--min-count 2 --k-list 27,37,47,57,67,77,87'
                           (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
                           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121 --merge_level 20,0.96'
                           (experimental, single cell data)
       min_count - minimum multiplicity for filtering (k_min+1)-mers, default 2
       min_k - minimum kmer size (<= 127), must be odd number, default 21
       max_k - maximum kmer size (<= 127), must be odd number, default 99
       k_step - increment of kmer size of each iteration (<= 28), must be
       even number, default 10
       k_list - list of kmer size (all must be odd, in the range 15-127,
       increment <= 28); override `--k-min', `--k-max' and `--k-step'
       min_contig_length - minimum length of contigs to output, default is 2000
       @optional megahit_parameter_preset @optional min_count @optional k_min
       @optional k_max @optional k_step @optional k_list
       @optional min_contig_length) -> structure: parameter
       "workspace_name" of String, parameter "read_library_ref" of String,
       parameter "output_contigset_name" of String, parameter
       "megahit_parameter_preset" of String, parameter "min_count" of
       Long, parameter "k_min" of Long, parameter "k_max" of Long,
       parameter "k_step" of Long, parameter "k_list" of list of Long,
       parameter "min_contig_length" of Long
    :returns: instance of type "MegaHitOutput" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_megahit
    print('Running run_megahit with params=')
    pprint(params)

    # STEP 1: basic parameter checks + parsing
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'read_library_ref' not in params:
        raise ValueError('read_library_ref parameter is required')
    if 'output_contigset_name' not in params:
        raise ValueError('output_contigset_name parameter is required')

    # STEP 2: get the read library as deinterleaved fastq files
    input_ref = params['read_library_ref']
    reads_params = {'read_libraries': [input_ref],
                    'interleaved': 'false',
                    'gzipped': None
                    }
    ru = ReadsUtils(self.callbackURL)
    reads = ru.download_reads(reads_params)['files']

    print('Input reads files:')
    fwd = reads[input_ref]['files']['fwd']
    rev = reads[input_ref]['files']['rev']
    pprint('forward: ' + fwd)
    pprint('reverse: ' + rev)

    # STEP 3: run megahit
    # construct the command
    megahit_cmd = [self.MEGAHIT]

    # we only support PE reads, so add that
    megahit_cmd.append('-1')
    megahit_cmd.append(fwd)
    megahit_cmd.append('-2')
    megahit_cmd.append(rev)

    # if a preset is defined, use that; each optional param is forwarded only
    # when present and truthy (dict.get replaces the old nested 'in'+truth
    # checks — same behavior, less noise)
    if params.get('megahit_parameter_preset'):
        megahit_cmd.append('--presets')
        megahit_cmd.append(params['megahit_parameter_preset'])
    if params.get('min_count'):
        megahit_cmd.append('--min-count')
        megahit_cmd.append(str(params['min_count']))
    if params.get('k_min'):
        megahit_cmd.append('--k-min')
        megahit_cmd.append(str(params['k_min']))
    if params.get('k_max'):
        megahit_cmd.append('--k-max')
        megahit_cmd.append(str(params['k_max']))
    if params.get('k_step'):
        megahit_cmd.append('--k-step')
        megahit_cmd.append(str(params['k_step']))
    if params.get('k_list'):
        megahit_cmd.append('--k-list')
        megahit_cmd.append(','.join(str(k_val) for k_val in params['k_list']))

    min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
    if params.get('min_contig_length'):
        if str(params['min_contig_length']).isdigit():
            min_contig_length = params['min_contig_length']
        else:
            raise ValueError(
                'min_contig_length parameter must be a non-negative integer'
            )
    megahit_cmd.append('--min-contig-len')
    megahit_cmd.append(str(min_contig_length))

    # set the number of cpus
    # BUGFIX: on a single-core host cpu_count()-1 is 0, an invalid MEGAHIT
    # thread count; always request at least one thread.
    megahit_cmd.append('--num-cpu-threads')
    megahit_cmd.append(str(max(1, multiprocessing.cpu_count() - 1)))

    # set the output location
    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    megahit_cmd.append('-o')
    megahit_cmd.append(output_dir)

    # run megahit
    print('running megahit:')
    print('    ' + ' '.join(megahit_cmd))
    p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
    retcode = p.wait()

    print('Return code: ' + str(retcode))
    if p.returncode != 0:
        raise ValueError('Error running MEGAHIT, return code: ' +
                         str(retcode) + '\n')

    output_contigs = os.path.join(output_dir, 'final.contigs.fa')

    # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
    if self.mac_mode:
        shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
        output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

    # STEP 4: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL)
    output_data_ref = assemblyUtil.save_assembly_from_fasta(
        {'file': {'path': output_contigs},
         'workspace_name': params['workspace_name'],
         'assembly_name': params['output_contigset_name']
         })

    # STEP 5: generate and save the report
    # compute a simple contig length distribution for the report
    lengths = []
    for seq_record in SeqIO.parse(output_contigs, 'fasta'):
        lengths.append(len(seq_record.seq))

    report = ''
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + \
        params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    if lengths:
        # ROBUSTNESS: guard against ZeroDivisionError / empty-histogram when
        # the assembly produced no contigs above the length cutoff
        report += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += ('   ' + str(counts[c]) + '\t--\t' +
                       str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n')

    print('Running QUAST')
    kbq = kb_quast(self.callbackURL)
    try:
        quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                             'label': params['output_contigset_name']}]})
    except QUASTError as qe:
        # not really any way to test this, all inputs have been checked earlier and should be
        # ok
        print('Logging exception from running QUAST')
        print(str(qe))
        # TODO delete shock node
        raise

    print('Saving report')
    kbr = KBaseReport(self.callbackURL)
    try:
        report_info = kbr.create_extended_report(
            {'message': report,
             'objects_created': [{'ref': output_data_ref,
                                  'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}],
             'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
             'workspace_name': params['workspace_name']})
    except _RepError as rep_err:
        # BUGFIX: exception was previously bound to the name 're', shadowing
        # the 're' module.
        # not really any way to test this, all inputs have been checked earlier and should be
        # ok
        print('Logging exception from creating report object')
        print(str(rep_err))
        # TODO delete shock node
        raise

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref']
              }
    #END run_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_A5(self, ctx, params): """ Run A5 on paired end libraries :param params: instance of type "A5_Params" (Input parameters for running A5. workspace_name - the name of the workspace from which to take input and store output. output_contigset_name - the name of the output contigset libfile_args - parameters for each input paired end reads min_contig_length - minimum length of contigs in the assembly output metagenome - metagenome option to A5 @optional min_contig_length @optional metagenome) -> structure: parameter "workspace_name" of String, parameter "output_contigset_name" of String, parameter "libfile_args" of list of type "libfile_args_type" (Parameters for a paired end library entry in the input 'libfile') -> structure: parameter "libfile_library" of type "paired_end_lib" (The workspace object name of a PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile type.), parameter "libfile_unpaired" of String, parameter "libfile_insert" of Long, parameter "min_contig_length" of Long, parameter "metagenome" of type "bool" (A boolean - 0 for false, 1 for true. @range (0, 1)) :returns: instance of type "A5_Output" (Output parameters for A5 run. string report_name - the name of the KBaseReport.Report workspace object. string report_ref - the workspace reference of the report.) 
-> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_A5 print("=================== IN run_A5") # A whole lot of this is adapted or outright copied from # https://github.com/msneddon/MEGAHIT self.log('Running run_A5 with params:\n' + pformat(params)) # the reads should really be specified as a list of absolute ws refs # but the narrative doesn't do that yet self.process_params(params) pprint(params) token = ctx['token'] # get absolute refs from ws wsname = params[self.PARAM_IN_WS] print("Workspace name: " + wsname) # set the output location timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) outdir = os.path.join(self.scratch, 'A5_dir' + str(timestamp)) reads = self.get_input_reads(params, token) libFile = self.generate_libfile(params[self.PARAM_IN_LIBFILE_ARGS], reads, outdir) a5_output_prefix = params[self.PARAM_IN_CS_NAME] self.exec_A5(libFile, params, outdir) self.log('A5 output dir: ' + a5_output_prefix) # parse the output and save back to KBase output_contigs = os.path.join(outdir, a5_output_prefix + ".contigs.fasta") min_contig_len = 0 if self.PARAM_IN_MIN_CONTIG in params and params[self.PARAM_IN_MIN_CONTIG] is not None: if (int(params[self.PARAM_IN_MIN_CONTIG])) > 0: min_contig_len = int(params[self.PARAM_IN_MIN_CONTIG]) self.log('Uploading FASTA file to Assembly') assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver='dev') assemblyUtil.save_assembly_from_fasta({'file': {'path': output_contigs}, 'workspace_name': wsname, 'assembly_name': params[self.PARAM_IN_CS_NAME], 'min_contig_length': min_contig_len }) report_name, report_ref = self.load_report(output_contigs, params, wsname) output = {'report_name': report_name, 'report_ref': report_ref } #END run_A5 # At some point might do deeper type checking... 
if not isinstance(output, dict): raise ValueError('Method run_A5 return value ' + 'output is not type dict as required.') # return the results return [output]
def filter_contigs(self, ctx, params):
    """Filter an Assembly's contigs by a minimum length and save the result.

    Validates the input parameters, downloads the referenced Assembly as a
    FASTA file, keeps every contig whose sequence length is at least
    min_length, re-saves the filtered contigs as a new Assembly object, and
    builds a simple KBaseReport summarizing the filtering.
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN filter_contigs
    # Print statements to stdout/stderr are captured and available as the App log
    print('Starting Filter Contigs function. Params=')
    pprint(params)

    # --- Step 1: validate parameters -----------------------------------
    # Required keys are checked up front so advanced users calling this
    # function directly (bypassing Narrative validation) get clear errors.
    print('Validating parameters.')
    for required in ('workspace_name', 'assembly_input_ref', 'min_length'):
        if required not in params:
            raise ValueError('Parameter ' + required + ' is not set in input arguments')
    workspace_name = params['workspace_name']
    assembly_input_ref = params['assembly_input_ref']
    raw_min_length = params['min_length']
    try:
        min_length = int(raw_min_length)
    except ValueError:
        raise ValueError('Cannot parse integer from min_length parameter (' +
                         str(raw_min_length) + ')')
    if min_length < 0:
        raise ValueError('min_length parameter cannot be negative (' +
                         str(min_length) + ')')

    # --- Step 2: download the input Assembly as FASTA ------------------
    print('Downloading Assembly data as a Fasta file.')
    assemblyUtil = AssemblyUtil(self.callback_url)
    fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

    # --- Step 3: keep only the sufficiently long contigs ---------------
    all_records = list(SeqIO.parse(fasta_file['path'], 'fasta'))
    n_total = len(all_records)
    good_contigs = [rec for rec in all_records if len(rec.seq) >= min_length]
    n_remaining = len(good_contigs)
    print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
    filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
    SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

    # --- Step 4: save the filtered contigs as a new Assembly -----------
    print('Uploading filtered Assembly data.')
    new_assembly = assemblyUtil.save_assembly_from_fasta(
        {'file': {'path': filtered_fasta_file},
         'workspace_name': workspace_name,
         'assembly_name': fasta_file['assembly_name']
         })

    # --- Step 5: build a report and return -----------------------------
    reportObj = {
        'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
        'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                        ' contigs out of ' + str(n_total)
    }
    report = KBaseReport(self.callback_url)
    report_info = report.create({'report': reportObj,
                                 'workspace_name': params['workspace_name']})

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'],
              'report_ref': report_info['ref'],
              'assembly_output': new_assembly,
              'n_initial_contigs': n_total,
              'n_contigs_removed': n_total - n_remaining,
              'n_contigs_remaining': n_remaining
              }
    print('returning:' + pformat(output))
    #END filter_contigs

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method filter_contigs return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class ImportAssemblyUtil:
    """Import FASTA files from the KBase staging area as Assembly objects
    and build an HTML summary report for the result."""

    def __init__(self, config):
        # config is the SDK-injected context: callback URL, scratch dir,
        # and the caller's auth token.
        self.callback_url = config['SDK_CALLBACK_URL']
        # Each import run gets its own scratch subdirectory so concurrent
        # runs cannot collide.
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
        import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

        required params:
        staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        assembly_name - output Assembly file name
        workspace_name - the name of the workspace it gets saved to.

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        # Copy the staged file into scratch so AssemblyUtil can read it.
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        # Copy params before adding the 'file' key so the caller's dict is
        # not mutated as a side effect of this call.
        import_assembly_params = dict(params)
        import_assembly_params['file'] = {'path': scratch_file_path}

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        # Update the workspace object related meta-data for staged file.
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method

        Raises ValueError naming the first missing required parameter.
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'assembly_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report

        assembly_ref    - workspace reference of the saved Assembly
        assembly_object - result of DataFileUtil.get_objects for that ref
        params          - original import params (used for the file name)

        Returns a single-element list describing the shock-hosted HTML file.
        """
        log('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        # object_info[1] is the object name
        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')
        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()
        assembly_overview_data['Name'] = '{} ({})'.format(assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        # Build the overview table; items() (not the Python-2-only
        # iteritems()) keeps this working on Python 3 with identical order.
        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in assembly_overview_data.items():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']] for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_assembly.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', overview_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)
        # NOTE: the with-block closes the file; the old explicit close() was redundant.

        # Zips the whole per-run scratch dir (which contains report.html).
        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                 import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        Returns {'report_name': ..., 'report_ref': ...} for the created
        KBaseReport object.
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)

        objects_created = [{'ref': obj_ref, 'description': 'Imported Assembly'}]

        output_html_files = self.generate_html_report(obj_ref, object_data, params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output
def run_idba_ud(self, ctx, params):
    """
    Run IDBA on paired end libraries
    :param params: instance of type "idba_ud_Params" (Input parameters
       for running idba_ud. string workspace_name - the name of the
       workspace from which to take input and store output.
       list<paired_end_lib> read_libraries - Illumina PairedEndLibrary
       files to assemble. string output_contigset_name - the name of the
       output contigset min_contig_length - minimum length of contigs to
       output, default is 2000 @optional kval_args) -> structure:
       parameter "workspace_name" of String, parameter "read_libraries"
       of list of type "paired_end_lib" (The workspace object name of a
       PairedEndLibrary file, whether of the KBaseAssembly or KBaseFile
       type.), parameter "output_contigset_name" of String, parameter
       "min_contig_length" of Long, parameter "kval_args" of type
       "kval_args_type" (Additional parameters: k values for idba_ud.
       (Note: The UI elements for these values have been removed, based
       on feedback)) -> structure: parameter "mink_arg" of Long,
       parameter "maxk_arg" of Long, parameter "step_arg" of Long
    :returns: instance of type "idba_ud_Output" (Output parameters for
       IDBA run. string report_name - the name of the KBaseReport.Report
       workspace object. string report_ref - the workspace reference of
       the report.) -> structure: parameter "report_name" of String,
       parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_idba_ud
    print("=================== IN run_idba_ud")
    print("PARAMS: ")
    pprint(params)
    print("============================ END OF PARAMS: ")

    # A whole lot of this is adapted or outright copied from
    # https://github.com/msneddon/MEGAHIT
    self.log('Running run_idba_ud with params:\n' + pformat(params))
    token = ctx['token']

    # the reads should really be specified as a list of absolute ws refs
    # but the narrative doesn't do that yet
    self.process_params(params)

    # get absolute refs from ws: bare object names are qualified with the
    # target workspace name so get_object_info_new can resolve them
    wsname = params[self.PARAM_IN_WS]
    obj_ids = []
    for r in params[self.PARAM_IN_LIB]:
        obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
    ws = workspaceService(self.workspaceURL, token=token)
    ws_info = ws.get_object_info_new({'objects': obj_ids})
    reads_params = []
    # reftoname maps each input ref -> "workspace/object_name" for messages
    reftoname = {}
    for wsi, oid in zip(ws_info, obj_ids):
        ref = oid['ref']
        reads_params.append(ref)
        # object_info tuple: [1] = object name, [7] = workspace name
        obj_name = wsi[1]
        reftoname[ref] = wsi[7] + '/' + obj_name

    readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

    typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
               'KBaseFile.PairedEndLibrary ' +
               'KBaseAssembly.SingleEndLibrary ' +
               'KBaseAssembly.PairedEndLibrary')
    try:
        # download all libraries de-interleaved; gzipped=None lets the
        # service decide whether to decompress
        reads = readcli.download_reads({'read_libraries': reads_params,
                                        'interleaved': 'false',
                                        'gzipped': None
                                        })['files']
    except ServerError as se:
        self.log('logging stacktrace from dynamic client error')
        self.log(se.data)
        # NOTE(review): relies on ServerError exposing .message (a
        # project exception attribute, not the builtin) — confirm on Py3
        if typeerr in se.message:
            # rewrite the type error into a friendlier message
            prefix = se.message.split('.')[0]
            raise ValueError(
                prefix + '. Only the types ' +
                'KBaseAssembly.PairedEndLibrary ' +
                'and KBaseFile.PairedEndLibrary are supported')
        else:
            raise

    self.log('Got reads data from converter:\n' + pformat(reads))
    self.check_reads(reads, reftoname)

    # Normalize the downloaded file records into the shape exec_idba_ud
    # expects: fwd/rev paths plus type and sequencing technology.
    reads_data = []
    for ref in reads:
        reads_name = reftoname[ref]
        f = reads[ref]['files']
        print("REF:" + str(ref))
        print("READS REF:" + str(reads[ref]))
        seq_tech = reads[ref]["sequencing_tech"]
        if f['type'] == 'interleaved':
            # interleaved pairs ship as a single forward file
            reads_data.append({'fwd_file': f['fwd'],
                               'type': 'paired',
                               'seq_tech': seq_tech})
        elif f['type'] == 'paired':
            reads_data.append({'fwd_file': f['fwd'],
                               'rev_file': f['rev'],
                               'type': 'paired',
                               'seq_tech': seq_tech})
        elif f['type'] == 'single':
            reads_data.append({'fwd_file': f['fwd'],
                               'type': 'single',
                               'seq_tech': seq_tech})
        else:
            raise ValueError('Something is very wrong with read lib' + reads_name)

    # set the output location (ms-since-epoch suffix keeps runs distinct)
    timestamp = int(
        (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    outdir = os.path.join(self.scratch, 'IDBA_dir' + str(timestamp))

    idba_out = self.exec_idba_ud(reads_data, params, outdir)
    self.log('IDBA output dir: ' + idba_out)

    # parse the output and save back to KBase
    output_contigs = os.path.join(idba_out, 'contig.fa')

    self.log('Uploading FASTA file to Assembly')
    assemblyUtil = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver='dev')
    if params.get('min_contig_length', 0) > 0:
        # AssemblyUtil applies the length filter during save; the report
        # is then built from the filtered fasta it writes alongside
        assemblyUtil.save_assembly_from_fasta(
            {'file': {'path': output_contigs},
             'workspace_name': wsname,
             'assembly_name': params[self.PARAM_IN_CS_NAME],
             'min_contig_length': params['min_contig_length']})
        # load report from scaffolds.fasta
        report_name, report_ref = self.load_report(
            output_contigs + '.filtered.fa', params, wsname)
    else:
        assemblyUtil.save_assembly_from_fasta(
            {'file': {'path': output_contigs},
             'workspace_name': wsname,
             'assembly_name': params[self.PARAM_IN_CS_NAME]})
        # load report from scaffolds.fasta
        report_name, report_ref = self.load_report(output_contigs, params, wsname)

    output = {'report_name': report_name,
              'report_ref': report_ref}
    #END run_idba_ud

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_idba_ud return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def run_megahit(self, ctx, params):
    """
    :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
       parameters here are just passed forward to MEGAHIT
       workspace_name - the name of the workspace for input/output
       read_library_ref - the name of the PE read library (SE library
       support in the future) output_contig_set_name - the name of the
       output contigset megahit_parameter_preset - override a group of
       parameters; possible values: meta '--min-count 2 --k-list
       21,41,61,81,99' (generic metagenomes, default) meta-sensitive
       '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
       sensitive but slower) meta-large '--min-count 2 --k-list
       27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
       bulk '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
       (experimental, standard bulk sequencing with >= 30x depth)
       single-cell '--min-count 3 --k-list 21,33,55,77,99,121
       --merge_level 20,0.96' (experimental, single cell data) min_count
       - minimum multiplicity for filtering (k_min+1)-mers, default 2
       min_k - minimum kmer size (<= 127), must be odd number, default
       21 max_k - maximum kmer size (<= 127), must be odd number,
       default 99 k_step - increment of kmer size of each iteration (<=
       28), must be even number, default 10 k_list - list of kmer size
       (all must be odd, in the range 15-127, increment <= 28); override
       `--k-min', `--k-max' and `--k-step' min_contig_length - minimum
       length of contigs to output, default is 2000 @optional
       megahit_parameter_preset @optional min_count @optional k_min
       @optional k_max @optional k_step @optional k_list @optional
       min_contig_length) -> structure: parameter "workspace_name" of
       String, parameter "read_library_ref" of String, parameter
       "output_contigset_name" of String, parameter
       "megahit_parameter_preset" of String, parameter "min_count" of
       Long, parameter "k_min" of Long, parameter "k_max" of Long,
       parameter "k_step" of Long, parameter "k_list" of list of Long,
       parameter "min_contig_length" of Long
    :returns: instance of type "MegaHitOutput" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN run_megahit
    print('Running run_megahit with params=')
    pprint(params)

    # STEP 1: basic parameter checks + parsing
    if 'workspace_name' not in params:
        raise ValueError('workspace_name parameter is required')
    if 'read_library_ref' not in params:
        raise ValueError('read_library_ref parameter is required')
    if 'output_contigset_name' not in params:
        raise ValueError('output_contigset_name parameter is required')

    # STEP 2: get the read library as deinterleaved fastq files
    input_ref = params['read_library_ref']
    reads_params = {'read_libraries': [input_ref],
                    'interleaved': 'false',
                    'gzipped': None
                    }
    ru = ReadsUtils(self.callbackURL)
    reads = ru.download_reads(reads_params)['files']

    print('Input reads files:')
    fwd = reads[input_ref]['files']['fwd']
    rev = reads[input_ref]['files']['rev']
    pprint('forward: ' + fwd)
    pprint('reverse: ' + rev)

    # STEP 3: run megahit
    # construct the command
    megahit_cmd = [self.MEGAHIT]

    # we only support PE reads, so add that
    megahit_cmd.append('-1')
    megahit_cmd.append(fwd)
    megahit_cmd.append('-2')
    megahit_cmd.append(rev)

    # if a preset is defined, use that:
    if 'megahit_parameter_preset' in params:
        if params['megahit_parameter_preset']:
            megahit_cmd.append('--presets')
            megahit_cmd.append(params['megahit_parameter_preset'])

    # optional numeric tuning parameters are only forwarded when truthy
    if 'min_count' in params:
        if params['min_count']:
            megahit_cmd.append('--min-count')
            megahit_cmd.append(str(params['min_count']))
    if 'k_min' in params:
        if params['k_min']:
            megahit_cmd.append('--k-min')
            megahit_cmd.append(str(params['k_min']))
    if 'k_max' in params:
        if params['k_max']:
            megahit_cmd.append('--k-max')
            megahit_cmd.append(str(params['k_max']))
    if 'k_step' in params:
        if params['k_step']:
            megahit_cmd.append('--k-step')
            megahit_cmd.append(str(params['k_step']))
    if 'k_list' in params:
        if params['k_list']:
            k_list = []
            for k_val in params['k_list']:
                k_list.append(str(k_val))
            megahit_cmd.append('--k-list')
            megahit_cmd.append(','.join(k_list))

    min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
    if 'min_contig_length' in params:
        if params['min_contig_length']:
            if str(params['min_contig_length']).isdigit():
                min_contig_length = params['min_contig_length']
            else:
                raise ValueError('min_contig_length parameter must be a non-negative integer')

    megahit_cmd.append('--min-contig-len')
    megahit_cmd.append(str(min_contig_length))

    # set the output location (ms timestamp keeps runs distinct)
    timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
    output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
    megahit_cmd.append('-o')
    megahit_cmd.append(output_dir)

    # run megahit
    print('running megahit:')
    print('    ' + ' '.join(megahit_cmd))
    p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
    retcode = p.wait()

    print('Return code: ' + str(retcode))
    if p.returncode != 0:
        raise ValueError('Error running MEGAHIT, return code: ' + str(retcode) + '\n')

    output_contigs = os.path.join(output_dir, 'final.contigs.fa')

    # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
    if self.mac_mode:
        shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
        output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

    # STEP 4: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL)
    output_data_ref = assemblyUtil.save_assembly_from_fasta({
        'file': {'path': output_contigs},
        'workspace_name': params['workspace_name'],
        'assembly_name': params['output_contigset_name']
    })

    # STEP 5: generate and save the report
    # compute a simple contig length distribution for the report
    lengths = []
    for seq_record in SeqIO.parse(output_contigs, 'fasta'):
        lengths.append(len(seq_record.seq))

    report = ''
    report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
    report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
    # Guard against a zero-contig assembly: the average and histogram
    # below would otherwise raise ZeroDivisionError / be meaningless.
    if lengths:
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

    print('Running QUAST')
    kbq = kb_quast(self.callbackURL)
    try:
        quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                             'label': params['output_contigset_name']}]})
    except QUASTError as qe:
        # not really any way to test this, all inputs have been checked earlier and should be
        # ok
        print('Logging exception from running QUAST')
        print(str(qe))
        # TODO delete shock node
        raise

    print('Saving report')
    kbr = KBaseReport(self.callbackURL)
    try:
        report_info = kbr.create_extended_report(
            {'message': report,
             'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}
                            ],
             'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
             'workspace_name': params['workspace_name']
             })
    except _RepError as rep_err:  # renamed from 're' to avoid shadowing the re module
        # not really any way to test this, all inputs have been checked earlier and should be
        # ok
        print('Logging exception from creating report object')
        print(str(rep_err))
        # TODO delete shock node
        raise

    # STEP 6: contruct the output to send back
    output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}
    #END run_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method run_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def exec_megahit(self, ctx, params):
    """
    :param params: instance of type "ExecMegaHitParams" (exec_megahit()
       Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as
       Input Creates Assembly object(s) as output. Will eventually also
       create AssemblySet object if input is a ReadsSet and not running
       a combined assembly Other vars same as run_megahit()) ->
       structure: parameter "workspace_name" of String, parameter
       "input_reads_ref" of String, parameter "output_contigset_name" of
       String, parameter "combined_assembly_flag" of Long, parameter
       "megahit_parameter_preset" of String, parameter "min_count" of
       Long, parameter "k_min" of Long, parameter "k_max" of Long,
       parameter "k_step" of Long, parameter "k_list" of list of Long,
       parameter "min_contig_len" of Long
    :returns: instance of type "ExecMegaHitOutput" -> structure:
       parameter "report_text" of String, parameter
       "output_contigset_ref" of list of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN exec_megahit
    console = []
    self.log(console, 'Running exec_megahit() with params=')
    self.log(console, "\n" + pformat(params))

    #SERVICE_VER = 'dev'  # DEBUG
    SERVICE_VER = 'release'

    ### STEP 0: init
    token = ctx['token']
    wsClient = workspaceService(self.workspaceURL, token=token)

    ### STEP 1: basic parameter checks + parsing
    required_params = ['workspace_name',
                       'input_reads_ref',
                       'output_contigset_name']
    for required_param in required_params:
        if required_param not in params or params[required_param] is None:
            raise ValueError("Must define required param: '" + required_param + "'")

    ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
    input_reads_ref = params['input_reads_ref']
    input_reads_name = None
    try:
        # object_info tuple index names
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)

        input_reads_obj_info = wsClient.get_object_info_new(
            {'objects': [{'ref': input_reads_ref}]})[0]
        # remove trailing version from the type string, e.g. "-1.0"
        input_reads_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                      input_reads_obj_info[TYPE_I])
        input_reads_name = input_reads_obj_info[NAME_I]
    except Exception as e:
        raise ValueError('Unable to get reads object from workspace: (' +
                         input_reads_ref + ')' + str(e))

    accepted_input_types = ["KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"]
    if input_reads_obj_type not in accepted_input_types:
        raise ValueError("Input reads of type '" + input_reads_obj_type +
                         "' not accepted. Must be one of " +
                         ", ".join(accepted_input_types))

    if input_reads_obj_type == "KBaseSets.ReadsSet":
        required_param = 'combined_assembly_flag'
        if required_param not in params or params[required_param] is None:
            raise ValueError("Must define required param: '" + required_param + "'")

    ### STEP 3: get the list of library references
    if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
        readsSet_ref_list = [input_reads_ref]
        readsSet_names_list = [input_reads_name]

    elif input_reads_obj_type == "KBaseSets.ReadsSet":
        readsSet_ref_list = []
        readsSet_names_list = []
        try:
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=ctx['token'])  # for dynamic service
            #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
        except Exception as e:
            # SECURITY NOTE(review): this message embeds the auth token —
            # it should be scrubbed before this ships; kept for behavior parity.
            raise ValueError(
                "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '" +
                self.serviceWizardURL + "' token: '" + ctx['token'] + "'" + str(e))
        try:
            input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                'ref': input_reads_ref,
                'include_item_info': 1
            })
        except Exception as e:
            raise ValueError(
                'SetAPI FAILURE: Unable to get read library set object from workspace: (' +
                str(input_reads_ref) + ")\n" + str(e))
        for readsLibrary_obj in input_readsSet_obj['data']['items']:
            readsSet_ref_list.append(readsLibrary_obj['ref'])
            NAME_I = 1
            readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
    else:
        raise ValueError("Input reads of type '" + input_reads_obj_type +
                         "' not accepted. Must be one of " +
                         ", ".join(accepted_input_types))

    ### STEP 4: If doing a combined assembly on a ReadsSet, download reads
    ###         one at a time and concatenate into single fwd/rev fastqs
    if input_reads_obj_type == "KBaseSets.ReadsSet" and params['combined_assembly_flag'] != 0:
        self.log(console,
                 "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES")

        # make dir
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
        if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
            input_dir = os.path.join(self.host_scratch, 'input.' + str(timestamp))
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # connect to ReadsUtils Client
        # (was a bare `except:` referencing an undefined `e`, which raised
        # NameError and masked the real failure)
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
        except Exception as e:
            raise ValueError("Unable to get readsUtils_Client\n" + str(e))

        # start combined file
        read_buf_size = 65536
        write_buf_size = 65536
        combined_input_fwd_path = os.path.join(input_dir, 'input_reads_fwd.fastq')
        combined_input_rev_path = os.path.join(input_dir, 'input_reads_rev.fastq')
        combined_input_fwd_handle = open(combined_input_fwd_path, 'w', write_buf_size)
        combined_input_rev_handle = open(combined_input_rev_path, 'w', write_buf_size)

        # add libraries, one at a time
        for this_input_reads_ref in readsSet_ref_list:
            self.log(console,
                     "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: " +
                     str(this_input_reads_ref))
            try:
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [this_input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    this_input_reads_ref + ")\n" + str(e))

            this_input_fwd_path = readsLibrary['files'][this_input_reads_ref]['files']['fwd']
            this_input_rev_path = readsLibrary['files'][this_input_reads_ref]['files']['rev']

            # append fwd
            self.log(console,
                     "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: " +
                     str(this_input_reads_ref))
            this_input_path = this_input_fwd_path
            cat_file_handle = combined_input_fwd_handle
            with open(this_input_path, 'r', read_buf_size) as this_input_handle:
                while True:
                    read_data = this_input_handle.read(read_buf_size)
                    if read_data:
                        cat_file_handle.write(read_data)
                    else:
                        break
            os.remove(this_input_path)  # create space since we no longer need the piece file

            # append rev
            this_input_path = this_input_rev_path
            cat_file_handle = combined_input_rev_handle
            with open(this_input_path, 'r', read_buf_size) as this_input_handle:
                while True:
                    read_data = this_input_handle.read(read_buf_size)
                    if read_data:
                        cat_file_handle.write(read_data)
                    else:
                        break
            os.remove(this_input_path)  # create space since we no longer need the piece file

        combined_input_fwd_handle.close()
        combined_input_rev_handle.close()

    ### STEP 5: finally run MegaHit_Sets
    # copy so per-library fwd/rev path keys are not written into the
    # caller's params dict
    exec_megahit_single_library_params = dict(params)
    output_assemblyset_contigset_paths = []
    output_contigset_path = None

    # PairedEndLibrary
    if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
        self.log(console,
                 "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: " +
                 str(input_reads_ref))
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [input_reads_ref],
                'interleaved': 'false'
            })
        except Exception as e:
            raise ValueError(
                'Unable to get reads object from workspace: (' +
                input_reads_ref + ")\n" + str(e))

        input_fwd_path = readsLibrary['files'][input_reads_ref]['files']['fwd']
        input_rev_path = readsLibrary['files'][input_reads_ref]['files']['rev']
        exec_megahit_single_library_params['input_fwd_path'] = input_fwd_path
        exec_megahit_single_library_params['input_rev_path'] = input_rev_path

        # the key line
        output_contigset_path = self.exec_megahit_single_library(
            exec_megahit_single_library_params)
        output_assemblyset_contigset_paths.append(output_contigset_path)

        os.remove(input_fwd_path)  # files can be really big
        os.remove(input_rev_path)

    # ReadsSet combined (already downloaded and combined fastqs)
    elif input_reads_obj_type == "KBaseSets.ReadsSet" and params['combined_assembly_flag'] != 0:
        input_fwd_path = combined_input_fwd_path
        input_rev_path = combined_input_rev_path
        exec_megahit_single_library_params['input_fwd_path'] = input_fwd_path
        exec_megahit_single_library_params['input_rev_path'] = input_rev_path

        # the key line
        output_contigset_path = self.exec_megahit_single_library(
            exec_megahit_single_library_params)
        output_assemblyset_contigset_paths.append(output_contigset_path)

        os.remove(input_fwd_path)  # files can be really big
        os.remove(input_rev_path)

    # ReadsSet uncombined (still have to download)
    elif input_reads_obj_type == "KBaseSets.ReadsSet" and params['combined_assembly_flag'] == 0:
        # connect to ReadsUtils Client
        # (same bare-except/undefined-`e` fix as in STEP 4)
        try:
            readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                           token=ctx['token'])  # SDK local
        except Exception as e:
            raise ValueError("Unable to get readsUtils_Client\n" + str(e))

        # get libraries, one at a time, and run MegaHit_Sets
        output_assemblyset_contigset_paths = []
        for this_input_reads_ref in readsSet_ref_list:
            self.log(console,
                     "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: " +
                     str(this_input_reads_ref))
            try:
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [this_input_reads_ref],
                    'interleaved': 'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    this_input_reads_ref + ")\n" + str(e))

            this_input_fwd_path = readsLibrary['files'][this_input_reads_ref]['files']['fwd']
            this_input_rev_path = readsLibrary['files'][this_input_reads_ref]['files']['rev']
            exec_megahit_single_library_params['input_fwd_path'] = this_input_fwd_path
            exec_megahit_single_library_params['input_rev_path'] = this_input_rev_path

            # the key line
            this_output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(this_output_contigset_path)

            os.remove(this_input_fwd_path)  # files can be really big
            os.remove(this_input_rev_path)

    # just in case we've confused ourselves
    else:
        raise ValueError("error in logic")

    ### STEP 6: save the resulting assembly
    assemblyUtil = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
    output_contigset_refs = []
    output_contigset_names = []
    for i, this_output_contigset_path in enumerate(output_assemblyset_contigset_paths):
        if len(output_assemblyset_contigset_paths) == 1:
            assembly_name = params['output_contigset_name']
        else:
            # multiple assemblies: prefix each with its library name
            assembly_name = readsSet_names_list[i] + '-' + params['output_contigset_name']
        this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': this_output_contigset_path},
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_name
        })
        output_contigset_refs.append(this_output_data_ref)
        output_contigset_names.append(assembly_name)

    ### STEP 7: generate the report text
    # compute a simple contig length distribution for the report
    report = ''
    for i, this_output_contigset_path in enumerate(output_assemblyset_contigset_paths):
        report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[i] + "\n"
        report += "-------------------------------------------------------------\n"
        report += "\n"
        lengths = []
        for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
            lengths.append(len(seq_record.seq))

        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + \
                  output_contigset_names[i] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        # NOTE(review): a zero-contig assembly would make the next line
        # raise ZeroDivisionError — confirm whether MEGAHIT can emit an
        # empty contigs file before hardening.
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + \
                      ' to ' + str(edges[c + 1]) + ' bp\n'

    ### STEP 8: contruct the output to send back
    output = {'report_text': report,
              'output_contigset_refs': output_contigset_refs}
    #END exec_megahit

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method exec_megahit return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]