@classmethod
def setUpClass(cls):
    token = environ.get('KB_AUTH_TOKEN', None)
    user_id = requests.post(
        'https://kbase.us/services/authorization/Sessions/Login',
        data='token={}&fields=user_id'.format(token)).json()['user_id']
    # WARNING: don't call any logging methods on the context object,
    # it'll result in a NoneType error
    cls.ctx = MethodContext(None)
    cls.ctx.update({'token': token,
                    'user_id': user_id,
                    'provenance': [
                        {'service': 'SetAPI',
                         'method': 'please_never_use_it_in_production',
                         'method_params': []
                         }],
                    'authenticated': 1})

    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('SetAPI'):
        cls.cfg[nameval[0]] = nameval[1]
    cls.wsURL = cls.cfg['workspace-url']
    cls.wsClient = workspaceService(cls.wsURL, token=token)
    cls.serviceImpl = SetAPI(cls.cfg)
    cls.serviceWizardURL = cls.cfg['service-wizard']
    cls.dataPaletteServiceVersion = cls.cfg['datapaletteservice-version']

    # set up data at the class level for now (so that the code is run
    # once for all tests, not before each test case; not sure how to
    # do that outside this function)
    suffix = int(time.time() * 1000)
    wsName = "test_SetAPI_" + str(suffix)
    ret = cls.wsClient.create_workspace({'workspace': wsName})
    cls.wsName = wsName

    # copy the test file to the scratch area
    fq_filename = "interleaved.fastq"
    fq_path = os.path.join(cls.cfg['scratch'], fq_filename)
    shutil.copy(os.path.join("data", fq_filename), fq_path)

    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    cls.read1ref = ru.upload_reads({
        'fwd_file': fq_path,
        'sequencing_tech': 'tech1',
        'wsname': wsName,
        'name': 'reads1',
        'interleaved': 1
    })['obj_ref']
    cls.read2ref = ru.upload_reads({
        'fwd_file': fq_path,
        'sequencing_tech': 'tech2',
        'wsname': wsName,
        'name': 'reads2',
        'interleaved': 1
    })['obj_ref']
@classmethod
def upload_test_reads(cls):
    """
    Seed initial SE and PE reads objects to test filtering.
    """
    header = dict()
    header["Authorization"] = "Oauth {0}".format(cls.token)
    # readsUtils_Client = ReadsUtils(url=self.callback_url, token=ctx['token'])  # SDK local
    readsUtils_Client = ReadsUtils(os.environ['SDK_CALLBACK_URL'], token=cls.token)
    temp_nodes = []
    fwdtf = 'small_forward.fq'
    revtf = 'small_reverse.fq'
    fwdtarget = os.path.join(cls.scratch, fwdtf)
    revtarget = os.path.join(cls.scratch, revtf)
    print("CWD: " + str(os.getcwd()))
    shutil.copy('/kb/module/test/data/' + fwdtf, fwdtarget)
    shutil.copy('/kb/module/test/data/' + revtf, revtarget)

    # Upload single end reads
    cls.se_reads_reference = \
        readsUtils_Client.upload_reads({'wsname': cls.getWsName(),
                                        'name': "se_reads",
                                        'sequencing_tech': 'Illumina',
                                        'fwd_file': fwdtarget}
                                       )['obj_ref']
    se_data = cls.dfu.get_objects(
        {'object_refs': [cls.getWsName() + '/se_reads']})['data'][0]['data']
    temp_nodes.append(se_data['lib']['file']['id'])

    # Upload paired end reads
    cls.pe_reads_reference = \
        readsUtils_Client.upload_reads({'wsname': cls.getWsName(),
                                        'name': "pe_reads",
                                        'sequencing_tech': 'Illumina',
                                        'fwd_file': fwdtarget,
                                        'rev_file': revtarget,
                                        'insert_size_mean': 42,
                                        'insert_size_std_dev': 10,
                                        }
                                       )['obj_ref']
    pe_data = cls.dfu.get_objects(
        {'object_refs': [cls.getWsName() + '/pe_reads']})['data'][0]['data']
    temp_nodes.append(pe_data['lib1']['file']['id'])
    return temp_nodes
def loadPairedEndReads(self):
    # NOTE: the guard checks the attribute this function actually caches and
    # returns (pe_reads_ref); the original checked pairedEndLibInfo, which would
    # have returned an object-info tuple instead of a reads ref.
    if hasattr(self.__class__, 'pe_reads_ref'):
        return self.__class__.pe_reads_ref
    # 1) upload files to shock
    shared_dir = "/kb/module/work/tmp"
    forward_data_file = '../work/testReads/small.forward.fq'
    forward_file = os.path.join(shared_dir, os.path.basename(forward_data_file))
    shutil.copy(forward_data_file, forward_file)
    reverse_data_file = '../work/testReads/small.reverse.fq'
    reverse_file = os.path.join(shared_dir, os.path.basename(reverse_data_file))
    shutil.copy(reverse_data_file, reverse_file)

    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    pe_reads_ref = ru.upload_reads({'fwd_file': forward_file,
                                    'rev_file': reverse_file,
                                    'sequencing_tech': 'artificial reads',
                                    'interleaved': 0,
                                    'wsname': self.getWsName(),
                                    'name': 'test_pe_reads'})['obj_ref']
    self.__class__.pe_reads_ref = pe_reads_ref
    print('Loaded PairedEndReads: ' + pe_reads_ref)
    new_obj_info = self.wsClient.get_object_info_new({'objects': [{'ref': pe_reads_ref}]})
    self.__class__.pairedEndLibInfo = new_obj_info[0]
    pprint(pformat(new_obj_info))
    # return new_obj_info[0]
    return pe_reads_ref
def test_fastqc_app(self):
    # create ws, and load test reads
    wsName = self.getWsName()
    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    input_file_ref = ru.upload_reads({'fwd_file': self.small_fq_test_file2,
                                      'sequencing_tech': 'tech1',
                                      'wsname': wsName,
                                      'name': 'reads1',
                                      'interleaved': 1})['obj_ref']

    input_params = {'input_ws': wsName,
                    'input_file_ref': input_file_ref}

    output = self.getImpl().runFastQC(self.getContext(), input_params)[0]
    self.assertIn('report_name', output)
    self.assertIn('report_ref', output)
    # pprint(output)

    report = self.getWsClient().get_objects2(
        {'objects': [{'ref': output['report_ref']}]})['data'][0]['data']
    # pprint(report)

    self.assertIn('direct_html', report)
    self.assertIn('file_links', report)
    self.assertIn('html_links', report)
    self.assertIn('objects_created', report)
    self.assertIn('text_message', report)
def upload_fastq(self, ctx, params):
    """
    :param params: instance of type "UploadFastqParams" (testing invocation
       of ReadsUtils) -> structure: parameter "fwd_id" of String, parameter
       "wsid" of Long, parameter "wsname" of String, parameter "objid" of
       Long, parameter "name" of String, parameter "rev_id" of String,
       parameter "sequencing_tech" of String
    :returns: instance of type "UploadFastqObjref"
    """
    # ctx is the context object
    # return variables are: objref
    #BEGIN upload_fastq
    print("hai this is upload_fastq here, params are")
    pprint.pprint(params)

    ReadsUtils_instance = ReadsUtils(url=self.callbackURL,
                                     token=ctx['token'],
                                     service_ver='dev')
    print("got ReadsUtils instance")
    method_retVal = ReadsUtils_instance.upload_reads(params)
    print("back from ReadsUtils_instance.upload_reads")
    pprint.pprint(method_retVal)

    objref = "Vooch"
    #END upload_fastq

    # At some point might do deeper type checking...
    if not isinstance(objref, basestring):
        raise ValueError('Method upload_fastq return value ' +
                         'objref is not type basestring as required.')
    # return the results
    return [objref]
def _package_result(self, output_file, output_name, ws_name_or_id, data_info, report):
    upload_params = {'fwd_file': output_file,
                     'name': output_name}

    if str(ws_name_or_id).isdigit():
        upload_params['wsid'] = int(ws_name_or_id)
    else:
        upload_params['wsname'] = str(ws_name_or_id)

    fields = ['sequencing_tech',
              'strain',
              'source',
              'read_orientation_outward',
              'insert_size_mean',
              'insert_size_std_dev']

    if 'input_ref' in data_info and data_info['input_ref'] is not None \
            and data_info['sequencing_tech']:
        upload_params['source_reads_ref'] = data_info['input_ref']
    else:
        for f in fields:
            if f in data_info:
                upload_params[f] = data_info[f]
        if 'single_genome' in data_info:
            if data_info['single_genome'] == 'true':
                upload_params['single_genome'] = 1
            elif data_info['single_genome'] == 'false':
                upload_params['single_genome'] = 0
        if 'sequencing_tech' not in upload_params:
            upload_params['sequencing_tech'] = 'unknown'
        if not upload_params['sequencing_tech']:
            upload_params['sequencing_tech'] = 'unknown'

    if data_info['files']['type'] == 'interleaved':
        upload_params['interleaved'] = 1

    ru = ReadsUtils(self.callbackURL)
    result = ru.upload_reads(upload_params)

    # THE REPORT MUST BE CREATED OUTSIDE SO THAT LIBS AND SETS ARE HANDLED
    """
    # create report
    kbreport = KBaseReport(self.callbackURL)
    rep = kbreport.create({
        'report': {
            'text_message': report,
            'objects_created': [{
                "ref": str(ws_name_or_id) + '/' + upload_params['name'],
                "description": ''
            }]
        },
        "workspace_name": str(ws_name_or_id)
    })
    return {
        'report_ref': rep['ref'],
        'report_name': rep['name'],
        'output_reads_ref': result['obj_ref']
    }
    """
    return {'report': report, 'output_reads_ref': result['obj_ref']}
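# A hypothetical sketch (not from the original module) showing the data_info shape
# _package_result consumes above; every value here is illustrative only.
def _example_package_result_call(self):
    data_info = {
        'input_ref': '11/22/3',            # UPA of the source reads object; with this
                                           # set, provenance flows via source_reads_ref
        'sequencing_tech': 'Illumina',
        'single_genome': 'true',
        'files': {'type': 'interleaved'},  # triggers interleaved=1 on upload
    }
    return self._package_result('/scratch/out.fastq', 'my_reads_out',
                                'my_workspace', data_info, 'report text')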
def load_pe_reads(fwd_file, rev_file):
    """
    Copies the given files into scratch, then calls ReadsUtils to upload them
    from scratch as a paired-end library.
    """
    callback_url = os.environ['SDK_CALLBACK_URL']
    fwd_file_path = file_to_scratch(fwd_file, overwrite=True)
    rev_file_path = file_to_scratch(rev_file, overwrite=True)
    ru = ReadsUtils(callback_url)
    pe_reads_params = {
        'fwd_file': fwd_file_path,
        'rev_file': rev_file_path,
        'sequencing_tech': 'Illumina',
        'wsname': get_ws_name(),
        'name': 'MyPairedEndLibrary'
    }
    return ru.upload_reads(pe_reads_params)['obj_ref']
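# A minimal usage sketch (hypothetical paths; relies on the file_to_scratch and
# get_ws_name helpers this module already assumes):
def _example_load_pe_reads():
    # uploads the two illustrative FASTQ files and returns the new object's UPA
    return load_pe_reads('data/small.forward.fq', 'data/small.reverse.fq')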
def load_reads_file(self, tech, file_fwd, file_rev, target_name):
    """
    Loads FASTQ files as either a SingleEndLibrary or a PairedEndLibrary.
    If file_rev is None, we get a single end library; otherwise, paired end.
    """
    reads_util = ReadsUtils(self.callback_url)
    upload_params = {
        "wsname": self.ws_name,
        "fwd_file": file_fwd,
        "name": target_name,
        "sequencing_tech": tech
    }
    if file_rev is not None:
        upload_params["rev_file"] = file_rev
    reads_ref = reads_util.upload_reads(upload_params)
    return reads_ref["obj_ref"]
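# A usage sketch (illustrative names and paths): the same helper yields a
# SingleEndLibrary when file_rev is None and a PairedEndLibrary when a reverse
# file is given.
def _example_load_both_library_types(self):
    se_ref = self.load_reads_file('Illumina', '/scratch/reads_1.fq', None, 'my_se_lib')
    pe_ref = self.load_reads_file('Illumina', '/scratch/reads_1.fq',
                                  '/scratch/reads_2.fq', 'my_pe_lib')
    return se_ref, pe_ref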
def loadSingleEndReads(self):
    if hasattr(self.__class__, 'se_reads_ref'):
        return self.__class__.se_reads_ref
    # return '23735/2/1'
    fq_path = os.path.join(self.scratch, 'reads_1_se.fq')
    shutil.copy(os.path.join('data', 'reads_1.fq'), fq_path)

    ru = ReadsUtils(self.callback_url)
    se_reads_ref = ru.upload_reads({'fwd_file': fq_path,
                                    'wsname': self.getWsName(),
                                    'name': 'test_readsSE',
                                    'sequencing_tech': 'artificial reads'})['obj_ref']
    self.__class__.se_reads_ref = se_reads_ref
    print('Loaded SingleEndReads: ' + se_reads_ref)
    return se_reads_ref
def loadSEReads(self, reads_file_path):
    # if hasattr(self.__class__, 'reads_ref'):
    #     return self.__class__.reads_ref
    se_reads_name = os.path.basename(reads_file_path)
    fq_path = os.path.join(self.scratch, se_reads_name)  # 'star_test_reads.fastq'
    shutil.copy(reads_file_path, fq_path)

    ru = ReadsUtils(self.callback_url)
    reads_ref = ru.upload_reads({'fwd_file': fq_path,
                                 'wsname': self.getWsName(),
                                 'name': se_reads_name.split('.')[0],
                                 'sequencing_tech': 'rnaseq reads'})['obj_ref']
    # self.__class__.reads_ref = reads_ref
    return reads_ref
def _upload_reads(self, refid, callbackURL, input_params):
    ref = [refid]
    DownloadReadsParams = {'read_libraries': ref}
    dfUtil = ReadsUtils(callbackURL)
    x = dfUtil.download_reads(DownloadReadsParams)

    uploadReadParams = {}
    fwd_file = x['files'][ref[0]]['files']['fwd']
    otype = x['files'][ref[0]]['files']['otype']

    # case of interleaved
    if otype == 'interleaved':
        uploadReadParams = {'fwd_file': fwd_file,
                            'wsname': input_params['workspace_name'],
                            'name': input_params['output'],
                            'rev_file': '',
                            'sequencing_tech': input_params['sequencing_tech'],
                            'single_genome': input_params['single_genome'],
                            'interleaved': 1}

    # case of separate pair
    if otype == 'paired':
        rev_file = x['files'][ref[0]]['files']['rev']
        uploadReadParams = {'fwd_file': fwd_file,
                            'wsname': input_params['workspace_name'],
                            'name': input_params['output'],
                            'rev_file': rev_file,
                            'sequencing_tech': input_params['sequencing_tech'],
                            'single_genome': input_params['single_genome']}

    # case of single end
    if otype == 'single':
        uploadReadParams = {'fwd_file': fwd_file,
                            'wsname': input_params['workspace_name'],
                            'name': input_params['output'],
                            'rev_file': '',
                            'sequencing_tech': input_params['sequencing_tech'],
                            'single_genome': input_params['single_genome']}

    y = dfUtil.upload_reads(uploadReadParams)
    return y['obj_ref']
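# A usage sketch (hypothetical values): _upload_reads round-trips a library,
# downloading it via ReadsUtils and re-uploading it under a new name in the
# target workspace.
def _example_copy_reads(self):
    return self._upload_reads('11/22/3', os.environ['SDK_CALLBACK_URL'],
                              {'workspace_name': 'my_workspace',
                               'output': 'copied_reads',
                               'sequencing_tech': 'Illumina',
                               'single_genome': 1})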
def getPairedEndLibInfo(self):
    if hasattr(self.__class__, 'pairedEndLibInfo'):
        return self.__class__.pairedEndLibInfo

    # copy the local test file to the shared scratch space so that the ReadsUtils
    # container can see it.
    test_fastq_file_local = 'data/interleaved.fastq'
    test_fastq_file_scratch = os.path.join(self.scratch, os.path.basename(test_fastq_file_local))
    shutil.copy(test_fastq_file_local, test_fastq_file_scratch)

    # call the ReadsUtils library to upload the test data to KBase
    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    paired_end_ref = ru.upload_reads({'fwd_file': test_fastq_file_scratch,
                                      'sequencing_tech': 'artificial reads',
                                      'interleaved': 1,
                                      'wsname': self.getWsName(),
                                      'name': 'test.pe.reads'})['obj_ref']

    # get the object metadata for the new test dataset
    new_obj_info = self.ws.get_object_info_new({'objects': [{'ref': paired_end_ref}]})
    self.__class__.pairedEndLibInfo = new_obj_info[0]
    return new_obj_info[0]
def getPairedEndLibInfo(self):
    if hasattr(self.__class__, 'pairedEndLibInfo'):
        return self.__class__.pairedEndLibInfo

    # 1) upload files to shock
    shared_dir = "/kb/module/work/tmp"
    forward_data_file = 'data/small.forward.fq'
    forward_file = os.path.join(shared_dir, os.path.basename(forward_data_file))
    shutil.copy(forward_data_file, forward_file)
    reverse_data_file = 'data/small.reverse.fq'
    reverse_file = os.path.join(shared_dir, os.path.basename(reverse_data_file))
    shutil.copy(reverse_data_file, reverse_file)

    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    paired_end_ref = ru.upload_reads({'fwd_file': forward_file,
                                      'rev_file': reverse_file,
                                      'sequencing_tech': 'artificial reads',
                                      'interleaved': 0,
                                      'wsname': self.getWsName(),
                                      'name': 'test.pe.reads'})['obj_ref']

    new_obj_info = self.ws.get_object_info_new({'objects': [{'ref': paired_end_ref}]})
    self.__class__.pairedEndLibInfo = new_obj_info[0]
    return new_obj_info[0]
def test_mash_sketch_valid_reads_ref(self):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    reads_file_name = 'reads-example.fastq'
    reads_test_path = os.path.join(dir_path, 'data', reads_file_name)
    reads_scratch_path = os.path.join(self.scratch, reads_file_name)
    shutil.copy(reads_test_path, reads_scratch_path)

    reads_utils = ReadsUtils(self.callback_url)
    upload_result = reads_utils.upload_reads({
        'wsname': self.getWsName(),
        'interleaved': 'true',
        'fwd_file': reads_scratch_path,
        'name': 'example-reads',
        'sequencing_tech': 'illumina'
    })
    reads_ref = upload_result['obj_ref']

    params = {'reads_ref': reads_ref, 'paired_ends': True}
    result = self.getImpl().run_mash_sketch(self.getContext(), params)
    output_path = result[0]['sketch_path']
    with open(output_path, 'rb') as output_file:
        num_lines = sum(1 for line in output_file)
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(num_lines, 25)
def loadPairedEndReads(self):
    if hasattr(self.__class__, 'pe_reads_ref'):
        return self.__class__.pe_reads_ref
    # return '23735/3/1'
    fq_path1 = os.path.join(self.scratch, 'reads_1.fq')
    shutil.copy(os.path.join('data', 'bt_test_data', 'reads_1.fq'), fq_path1)
    fq_path2 = os.path.join(self.scratch, 'reads_2.fq')
    shutil.copy(os.path.join('data', 'bt_test_data', 'reads_2.fq'), fq_path2)

    ru = ReadsUtils(self.callback_url)
    pe_reads_ref = ru.upload_reads({'fwd_file': fq_path1,
                                    'rev_file': fq_path2,
                                    'wsname': self.getWsName(),
                                    'name': 'test_readsPE',
                                    'sequencing_tech': 'artificial reads'})['obj_ref']
    self.__class__.pe_reads_ref = pe_reads_ref
    print('Loaded PairedEndReads: ' + pe_reads_ref)
    return pe_reads_ref
def upload_reads(self, file_path, workspace_name, reads_name, source_reads_upa):
    """
    Upload the given reads file as an interleaved PE reads object.
    """
    if not file_path:
        raise ValueError("file_path must be defined")
    if not os.path.exists(file_path):
        raise ValueError("The given reads file '{}' does not exist".format(file_path))
    if not workspace_name:
        raise ValueError("workspace_name must be defined")
    if not reads_name:
        raise ValueError("reads_name must be defined")

    ru = ReadsUtils(self.callback_url)
    reads_upa = ru.upload_reads({
        "wsname": workspace_name,
        "fwd_file": file_path,
        "name": reads_name,
        "source_reads_ref": source_reads_upa,
        "interleaved": 1
    })["obj_ref"]
    return reads_upa
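# A usage sketch (hypothetical workspace and UPA values): the guards above mean
# a bad path fails fast, before ReadsUtils is ever called.
def _example_upload_filtered_reads(self):
    return self.upload_reads('/scratch/filtered.fastq', 'my_workspace',
                             'filtered_reads', '11/22/3')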
def _upload_file_path(self, params):
    """
    _upload_file_path: upload fastq file as reads from user's staging area

    params:
    fwd_staging_file_name: single-end fastq file name or forward/left paired-end
                           fastq file name from user's staging area
    sequencing_tech: sequencing technology
    name: output reads file name
    workspace_name: workspace name/ID that reads will be stored to

    optional params:
    rev_staging_file_name: reverse/right paired-end fastq file name from
                           user's staging area
    single_genome: whether the reads are from a single genome or a metagenome
    insert_size_mean: mean (average) insert length
    insert_size_std_dev: standard deviation of insert lengths
    read_orientation_outward: whether reads in a pair point outward
    interleaved: whether the reads are interleaved
    """
    log('---> running UploaderUtil._upload_file_path')

    upload_file_params = params

    workspace_name_or_id = params.get('workspace_name')
    if str(workspace_name_or_id).isdigit():
        upload_file_params['wsid'] = int(workspace_name_or_id)
    else:
        upload_file_params['wsname'] = str(workspace_name_or_id)

    log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
        json.dumps(upload_file_params, indent=1)))

    ru = ReadsUtils(self.callback_url)
    result = ru.upload_reads(upload_file_params)
    return result
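# A sketch of the params dict _upload_file_path expects (all values illustrative;
# only the required keys from the docstring above need to be present, the rest
# are optional):
def _example_upload_from_staging(self):
    params = {
        'fwd_staging_file_name': 'reads_fwd.fastq',
        'rev_staging_file_name': 'reads_rev.fastq',
        'sequencing_tech': 'Illumina',
        'name': 'my_reads',
        'workspace_name': 'my_workspace',
        'insert_size_mean': 450,
    }
    return self._upload_file_path(params)  # ReadsUtils result, including 'obj_ref'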
def getPairedEndLibInfo(self):
    input_reads_file = '/kb/module/test/data/small_test_reads.fastq'
    # input_reads_file = '/kb/module/test/data/12040.half_million.fastq'
    shared_dir = "/kb/module/work/tmp"
    input_file = os.path.join(shared_dir, os.path.basename(input_reads_file))
    shutil.copy(input_reads_file, input_file)

    ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
    paired_end_ref = ru.upload_reads({'fwd_file': input_file,
                                      'sequencing_tech': 'artificial reads',
                                      'interleaved': 1,
                                      'wsname': self.getWsName(),
                                      'name': 'test.pe.reads'})['obj_ref']

    new_obj_info = self.wsClient.get_object_info_new({'objects': [{'ref': paired_end_ref}]})
    return new_obj_info[0]
def upload_interleaved_reads(callback_url, reads_file, ws_name, reads_obj_name, source_reads_upa):
    """
    callback_url = as usual.
    reads_file = full path to the reads file to upload
    ws_name = the workspace to use for uploading the reads file
    reads_obj_name = the name of the new reads object to save as
    source_reads_upa = if not None, the source UPA for the original reads file.
    """
    # unfortunately, ReadsUtils only accepts uncompressed .fq files --
    # this should be fixed on the KBase side
    dfu = DataFileUtil(callback_url)
    reads_unpacked = dfu.unpack_file({'file_path': reads_file})['file_path']

    ru = ReadsUtils(callback_url)
    new_reads_upa = ru.upload_reads({
        'fwd_file': reads_unpacked,
        'interleaved': 1,
        'wsname': ws_name,
        'name': reads_obj_name,
        'source_reads_ref': source_reads_upa
    })['obj_ref']
    print('saved ' + str(reads_unpacked) + ' to ' + str(new_reads_upa))
    return new_reads_upa
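# A usage sketch (hypothetical values): a gzipped FASTQ is fine here, since
# DataFileUtil.unpack_file decompresses it before the upload.
def _example_upload_interleaved():
    callback_url = os.environ['SDK_CALLBACK_URL']
    return upload_interleaved_reads(callback_url, '/scratch/filtered.fastq.gz',
                                    'my_workspace', 'filtered_reads', '11/22/3')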
def execReadLibraryPRINSEQ(self, ctx, input_params): """ :param input_params: instance of type "inputPRINSEQ" (execPRINSEQ and execReadLibraryPRINSEQ input input_reads_ref : may be KBaseFile.PairedEndLibrary or KBaseFile.SingleEndLibrary output_ws : workspace to write to output_reads_name : obj_name to create lc_method : Low complexity method - value must be "dust" or "entropy" lc_entropy_threshold : Low complexity threshold - Value must be an integer between 0 and 100. Note a higher lc_entropy_threshold in entropy is more stringent. lc_dust_threshold : Low complexity threshold - Value must be an integer between 0 and 100. Note a lower lc_entropy_threshold is less stringent with dust) -> structure: parameter "input_reads_ref" of type "data_obj_ref", parameter "output_ws" of type "workspace_name" (Common Types), parameter "output_reads_name" of type "data_obj_name", parameter "lc_method" of String, parameter "lc_entropy_threshold" of Long, parameter "lc_dust_threshold" of Long :returns: instance of type "outputReadLibraryExecPRINSEQ" -> structure: parameter "output_filtered_ref" of type "data_obj_ref", parameter "output_unpaired_fwd_ref" of type "data_obj_ref", parameter "output_unpaired_rev_ref" of type "data_obj_ref", parameter "report" of String """ # ctx is the context object # return variables are: output #BEGIN execReadLibraryPRINSEQ console = [] # self.log(console, 'Running execTrimmomatic with parameters: ') # self.log(console, "\n"+pformat(input_params)) report = '' returnVal = dict() # retVal['output_filtered_ref'] = None # retVal['output_unpaired_fwd_ref'] = None # retVal['output_unpaired_rev_ref'] = None token = ctx['token'] wsClient = workspaceService(self.ws_url, token=token) env = os.environ.copy() env['KB_AUTH_TOKEN'] = token # param checks required_params = ['input_reads_ref', 'output_ws', 'lc_method'] # output reads_name is optional. 
If not set will use old_objects name for required_param in required_params: if required_param not in input_params or input_params[ required_param] is None: raise ValueError("Must define required param: '" + required_param + "'") if (input_params['lc_method'] != 'dust') and (input_params['lc_method'] != 'entropy'): raise ValueError( "lc_method (low complexity method) must be 'dust' or 'entropy', " + "it is currently set to : " + input_params['lc_method']) if not ('lc_entropy_threshold' in input_params or 'lc_dust_threshold' in input_params): raise ValueError( ("A low complexity threshold needs to be " + "entered for {}".format(input_params['lc_method']))) elif input_params['lc_method'] == 'dust': if 'lc_dust_threshold' not in input_params: raise ValueError( ("A low complexity threshold needs to be " + "entered for {}".format(input_params['lc_method']))) else: lc_threshold = input_params['lc_dust_threshold'] else: if 'lc_entropy_threshold' not in input_params: raise ValueError( ("A low complexity threshold needs to be " + "entered for {}".format(input_params['lc_method']))) else: lc_threshold = input_params['lc_entropy_threshold'] if (lc_threshold < 0.0) or (lc_threshold > 100.0): raise ValueError(( "The threshold for {} must be between 0 and 100, it is currently " + "set to : {}").format(input_params['lc_method'], lc_threshold)) reportObj = {'objects_created': [], 'text_message': ''} # load provenance provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] # add additional info to provenance here, in this case the input data object reference provenance[0]['input_ws_objects'] = [ str(input_params['input_reads_ref']) ] # GET THE READS OBJECT # Determine whether read library or read set is input object # try: # object_info tuple [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) input_reads_obj_info = wsClient.get_object_info_new( {'objects': [{ 'ref': input_params['input_reads_ref'] }]})[0] input_reads_obj_type = input_reads_obj_info[TYPE_I] # input_reads_obj_version = input_reads_obj_info[VERSION_I] # this is object version, not type version except Exception as e: raise ValueError( 'Unable to get read library object from workspace: (' + str(input_params['input_reads_ref']) + ')' + str(e)) # self.log (console, "B4 TYPE: '" + # str(input_reads_obj_type) + # "' VERSION: '" + str(input_reads_obj_version)+"'") # remove trailing version input_reads_obj_type = re.sub('-[0-9]+\.[0-9]+$', "", input_reads_obj_type) # self.log (console, "AF TYPE: '"+str(input_reads_obj_type)+"' VERSION: '" + # str(input_reads_obj_version)+"'") # maybe add below later "KBaseSets.ReadsSet", acceptable_types = [ "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary", "KBaseFile.SingleEndLibrary" ] if input_reads_obj_type not in acceptable_types: raise ValueError("Input reads of type: '" + input_reads_obj_type + "'. 
Must be one of " + ", ".join(acceptable_types)) if input_reads_obj_type in [ "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary" ]: read_type = 'PE' elif input_reads_obj_type in [ "KBaseFile.SingleEndLibrary", "KBaseAssembly.SingleEndLibrary" ]: read_type = 'SE' # Instatiate ReadsUtils try: readsUtils_Client = ReadsUtils(url=self.callback_url, token=ctx['token']) # SDK local self._log(None, 'Starting Read File(s) Download') readsLibrary = readsUtils_Client.download_reads({ 'read_libraries': [input_params['input_reads_ref']], 'interleaved': 'false' }) self._log(None, 'Completed Read File(s) Downloading') except Exception as e: raise ValueError( ('Unable to get read library object from workspace: ({})\n' ).format(str(input_params['input_reads_ref']), str(e))) # get WS metadata to get obj_name ws = workspaceService(self.ws_url) try: info = ws.get_object_info_new( {'objects': [{ 'ref': input_params['input_reads_ref'] }]})[0] except workspaceService as wse: self._log(console, 'Logging workspace exception') self._log(str(wse)) raise #determine new object base name new_object_name = info[1] if ('output_reads_name' in input_params and input_params['output_reads_name'] != '' and input_params['output_reads_name'] is not None): new_object_name = input_params['output_reads_name'] # MAKE A DIRECTORY TO PUT THE READ FILE(S) # create the output directory and move the file there # PUT FILES INTO THE DIRECTORY # Sanitize the file names tempdir = tempfile.mkdtemp(dir=self.scratch) export_dir = os.path.join(tempdir, info[1]) os.makedirs(export_dir) if read_type == 'PE': # IF PAIRED END, potentially 6 files created # one of each for the two directions(good(paired), good_singletons, bad) # Take the good paired and (re)upload new reads object. # We throwout the bad reads input_files_info = self._setup_pe_files(readsLibrary, export_dir, input_params) # RUN PRINSEQ with user options (lc_method and lc_threshold) cmd = ( "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} " "-fastq2 {} -out_format 3 -lc_method {} " "-lc_threshold {}").format( input_files_info["fastq_file_path"], input_files_info["fastq2_file_path"], input_params['lc_method'], lc_threshold) print "Command to be run : " + cmd args = shlex.split(cmd) perl_script = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = perl_script.communicate() found_results = False file_names_dict = dict() for element in output: if "Input and filter stats:" in element: found_results = True element_parts = element.split("Input and filter stats:") # PRINSEQ OUTPUT report = "Input and filter stats:{}".format( element_parts[1]) reportObj['text_message'] = report read_files_list = os.listdir(export_dir) # proc = subprocess.Popen(['ls', '-l', export_dir], stdout=subprocess.PIPE) # proc_output = proc.stdout.read() # print "PROC OUTPUT : " + proc_output for read_filename in read_files_list: file_direction = None print "Read File : {}".format(read_filename) # determine if forward(fastq) or reverse(fastq2) file if input_files_info["fastq_filename"] in read_filename: file_direction = "fwd" elif input_files_info[ "fastq2_filename"] in read_filename: file_direction = "rev" if file_direction is not None: # determine good singleton or good part of a pair. 
print "TEST: {}_prinseq_good_".format( input_files_info["fastq_filename"]) if ("{}_prinseq_good_singletons".format( input_files_info["fastq_filename"]) in read_filename or "{}_prinseq_good_singletons".format( input_files_info["fastq2_filename"]) in read_filename): # Unpaired singletons that need to be # saved as a new single end reads object file_names_dict["{}_good_singletons".format(file_direction)] = \ os.path.join(export_dir, read_filename) elif ("{}_prinseq_good_".format( input_files_info["fastq_filename"]) in read_filename or "{}_prinseq_good_".format( input_files_info["fastq2_filename"]) in read_filename): file_names_dict["{}_good_pair".format(file_direction)] = \ os.path.join(export_dir, read_filename) if (('fwd_good_pair' in file_names_dict) and ('rev_good_pair' in file_names_dict)): self._log(None, 'Saving new Paired End Reads') returnVal['filtered_paired_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': new_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': file_names_dict['fwd_good_pair'], 'rev_file': file_names_dict['rev_good_pair'] } )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['filtered_paired_end_ref'], 'description': 'Filtered Paired End Reads', 'object_name': new_object_name }) print "REFERENCE : " + str( returnVal['filtered_paired_end_ref']) else: reportObj['text_message'] += \ "\n\nNo good matching pairs passed low complexity filtering.\n" + \ "Consider loosening the threshold value.\n" if 'fwd_good_singletons' in file_names_dict: self._log(None, 'Saving new Forward Unpaired Reads') fwd_object_name = "{}_fwd_singletons".format( new_object_name) returnVal['output_filtered_fwd_unpaired_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': fwd_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': file_names_dict['fwd_good_singletons']} )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['output_filtered_fwd_unpaired_end_ref'], 'description': 'Filtered Forward Unpaired End Reads', 'object_name': fwd_object_name }) print "REFERENCE : " + \ str(returnVal['output_filtered_fwd_unpaired_end_ref']) if 'rev_good_singletons' in file_names_dict: self._log(None, 'Saving new Reverse Unpaired Reads') rev_object_name = "{}_rev_singletons".format( new_object_name) returnVal['output_filtered_rev_unpaired_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': rev_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': file_names_dict['rev_good_singletons']} )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['output_filtered_rev_unpaired_end_ref'], 'description': 'Filtered Reverse Unpaired End Reads', 'object_name': rev_object_name }) print "REFERENCE : " + \ str(returnVal['output_filtered_rev_unpaired_end_ref']) if len(reportObj['objects_created']) > 0: reportObj['text_message'] += "\nOBJECTS CREATED :\n" for obj in reportObj['objects_created']: reportObj['text_message'] += "{} : {}\n".format( obj['object_name'], obj['description']) else: reportObj['text_message'] += \ "\nFiltering filtered out all reads. 
No objects made.\n" if not found_results: raise Exception('Unable to execute PRINSEQ, Error: {}'.format( str(output))) print "FILES DICT : {}".format(str(file_names_dict)) print "REPORT OBJECT :" print str(reportObj) elif read_type == 'SE': # Download reads Libs to FASTQ files # IF SINGLE END INPUT 2 files created (good and bad) # Take good and (re)upload new reads object input_fwd_file_path = \ readsLibrary['files'][input_params['input_reads_ref']]['files']['fwd'] fastq_filename = self._sanitize_file_name( os.path.basename(input_fwd_file_path)) fastq_file_path = os.path.join(export_dir, fastq_filename) shutil.move(input_fwd_file_path, fastq_file_path) # RUN PRINSEQ with user options (lc_method and lc_threshold) cmd = ( "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} " "-out_format 3 -lc_method {} " "-lc_threshold {}").format(fastq_file_path, input_params['lc_method'], lc_threshold) print "Command to be run : " + cmd args = shlex.split(cmd) print "ARGS: " + str(args) perl_script = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) output = perl_script.communicate() print "OUTPUT: " + str(output) found_results = False found_se_filtered_file = False file_names_dict = dict() for element in output: if "Input and filter stats:" in element: found_results = True element_parts = element.split("Input and filter stats:") # PRINSEQ OUTPUT report = "Input and filter stats:{}".format( element_parts[1]) reportObj['text_message'] = report read_files_list = os.listdir(export_dir) for read_filename in read_files_list: print "Early Read File : {}".format(read_filename) for read_filename in read_files_list: print "Read File : {}".format(read_filename) if ("{}_prinseq_good_".format(fastq_filename) in read_filename): #Found Good file. Save the Reads objects self._log(None, 'Saving Filtered Single End Reads') returnVal['output_filtered_single_end_ref'] = \ readsUtils_Client.upload_reads({'wsname': str(input_params['output_ws']), 'name': new_object_name, 'source_reads_ref': input_params['input_reads_ref'], 'fwd_file': os.path.join(export_dir, read_filename)} )['obj_ref'] reportObj['objects_created'].append({ 'ref': returnVal['output_filtered_single_end_ref'], 'description': 'Filtered Single End Reads' }) print "REFERENCE : " + str( returnVal['output_filtered_single_end_ref']) found_se_filtered_file = True break if not found_se_filtered_file: reportObj['text_message'] += \ "\n\nNone of the reads passed low complexity filtering.\n" + \ "Consider loosening the threshold value.\n" if not found_results: raise Exception('Unable to execute PRINSEQ, Error: {}'.format( str(output))) print "FILES DICT : {}".format(str(file_names_dict)) print "REPORT OBJECT :" print str(reportObj) # save report object # report = KBaseReport(self.callback_url, token=ctx['token']) #report = KBaseReport(self.callback_url, token=ctx['token'], service_ver=SERVICE_VER) report_info = report.create({ 'report': reportObj, 'workspace_name': input_params['output_ws'] }) output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] } #END execReadLibraryPRINSEQ # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method execReadLibraryPRINSEQ return value ' + 'output is not type dict as required.') # return the results return [output]
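# A sketch of inputPRINSEQ params as validated above (illustrative values; dust
# requires lc_dust_threshold and entropy requires lc_entropy_threshold, each
# between 0 and 100):
def _example_prinseq_params():
    return {
        'input_reads_ref': '11/22/3',
        'output_ws': 'my_workspace',
        'output_reads_name': 'filtered_reads',
        'lc_method': 'dust',
        'lc_dust_threshold': 7,
    }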
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check whether the fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file,
                           'rev_file': rev_file}
        return fastq_file_path

    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean')
                or params.get('insert_size_std_dev')
                or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            error_msg += '"Reads Orientation Outward" is Paired End Reads specific'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads
        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is Single End Reads specific'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validate file availability
        in user's staging area
        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'], 'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        '''
        import_sra_from_staging: import an SRA file from the user's staging area
        as a reads object

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID of the object

        Optional Params:
        single_genome: whether the reads are from a single genome or a metagenome.
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        # update the workspace-object-related metadata for the staged file
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), returnVal['obj_ref'])
        return returnVal

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: import SRA files from web URLs as reads objects

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object

        sra_urls_to_add: dict of SRA file URLs
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')
        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, sra_url_to_add)

            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)

            workspace_name_or_id = workspace_name
            if str(workspace_name_or_id).isdigit():
                import_sra_reads_params['wsid'] = int(workspace_name_or_id)
            else:
                import_sra_reads_params['wsname'] = str(workspace_name_or_id)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                json.dumps(import_sra_reads_params, indent=1)))
            obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params: validate params passed to
        import_sra_from_staging
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params: validate params passed to import_sra_from_web
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs: generated workspace object references
                  (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()

        for obj_ref in obj_refs_list:
            get_objects_params = {'object_refs': [obj_ref],
                                  'ignore_errors': False}
            objects_data.append(self.dfu.get_objects(get_objects_params))
            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params, uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}
        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate an html summary report
        """
        log('Start generating html report')
        pprint(params)

        result_file_path = os.path.join(self.scratch, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):
            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_sra/table_panel.html'),
                      'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace('OBJECT_NAME', reads_obj_name)
                if index == 0:
                    report_template = report_template.replace('panel-collapse collapse',
                                                              'panel-collapse collapse in')
            objects_content += report_template

            base_percentages = ''
            for key, val in reads_data.get('base_percentages').iteritems():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get('sequencing_tech', 'Unknown')

            reads_single_genome = str(reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev', 'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(reads_data.get('read_length_stdev'))

            dup_reads_percent = '{:.2f}'.format(
                float(reads_data.get('number_of_duplicates') * 100) /
                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)'.format(
                str(reads_data.get('number_of_duplicates')), dup_reads_percent)

            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')), str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(
                round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.iteritems():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.iteritems():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace('###OVERVIEW_CONTENT###', overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###', stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__),
                                   'report_template_sra/report_head.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('###TABLE_PANELS_CONTENT###',
                                                          objects_content)
                result_file.write(report_template)

        shutil.copytree(os.path.join(os.path.dirname(__file__),
                                     'report_template_sra/bootstrap-3.3.7'),
                        os.path.join(self.scratch, 'bootstrap-3.3.7'))
        shutil.copy(os.path.join(os.path.dirname(__file__),
                                 'report_template_sra/jquery-3.2.1.min.js'),
                    os.path.join(self.scratch, 'jquery-3.2.1.min.js'))

        matched_files = []
        for root, dirnames, filenames in os.walk(self.scratch):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print('Removing ' + gz_file)
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({'file_path': self.scratch,
                                                  'pack': 'zip'})['shock_id']
        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Imported Reads'})
        return html_report
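# A usage sketch for ImportSRAUtil (hypothetical config and staging path; the real
# config comes from the SDK job runner, and UploaderUtil may expect further keys):
def _example_import_sra():
    importer = ImportSRAUtil({
        'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
        'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
        'scratch': '/kb/module/work/tmp',
    })
    return importer.import_sra_from_staging({
        'staging_file_subdir_path': 'subdir/reads.sra',
        'sequencing_tech': 'Illumina',
        'name': 'my_sra_reads',
        'workspace_name': 'my_workspace',
    })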
def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params): """ :param params: instance of type "KButil_Build_InSilico_Metagenomes_with_Grinder_Params" (KButil_Build_InSilico_Metagenomes_with_Grinder() ** ** Use Grinder to generate in silico shotgun metagenomes) -> structure: parameter "workspace_name" of type "workspace_name" (** The workspace object refs are of form: ** ** objects = ws.get_objects([{'ref': params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means the entire name combining the workspace id and the object name ** "id" is a numerical identifier of the workspace or object, and should just be used for workspace ** "name" is a string identifier of a workspace or object. This is received from Narrative.), parameter "input_refs" of type "data_obj_ref", parameter "output_name" of type "data_obj_name", parameter "desc" of String, parameter "num_reads_per_lib" of Long, parameter "population_percs" of String, parameter "read_len_mean" of Long, parameter "read_len_stddev" of Double, parameter "pairs_flag" of Long, parameter "mate_orientation" of String, parameter "insert_len_mean" of Long, parameter "insert_len_stddev" of Double, parameter "mutation_dist" of String, parameter "mutation_ratio" of String, parameter "qual_good" of Long, parameter "qual_bad" of Long, parameter "len_bias_flag" of Long, parameter "random_seed" of Long :returns: instance of type "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" -> structure: parameter "report_name" of type "data_obj_name", parameter "report_ref" of type "data_obj_ref" """ # ctx is the context object # return variables are: returnVal #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder #### STEP 0: basic init ## console = [] invalid_msgs = [] report_text = '' self.log(console, 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ') self.log(console, "\n" + pformat(params)) # Auth token = ctx['token'] headers = {'Authorization': 'OAuth ' + token} env = os.environ.copy() env['KB_AUTH_TOKEN'] = token # API Clients #SERVICE_VER = 'dev' # DEBUG SERVICE_VER = 'release' wsClient = workspaceService(self.workspaceURL, token=token) readsUtils_Client = ReadsUtils(url=self.callbackURL, token=ctx['token']) # SDK local #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token']) # for SDK local. 
local doesn't work for SetAPI setAPI_Client = SetAPI(url=self.serviceWizardURL, token=ctx['token']) # for dynamic service auClient = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER) dfu = DFUClient(self.callbackURL) # param checks required_params = [ 'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib', 'population_percs', 'read_len_mean', 'read_len_stddev', 'pairs_flag', 'mate_orientation', 'insert_len_mean', 'insert_len_stddev', 'mutation_dist', 'mutation_ratio', 'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed' ] for arg in required_params: if arg not in params or params[arg] == None or params[arg] == '': raise ValueError("Must define required param: '" + arg + "'") # cast to str unpredictable numerical params (mostly used in string context) numerical_params = [ 'num_reads_per_lib', 'read_len_mean', 'read_len_stddev', 'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed' ] for arg in numerical_params: if arg not in params or params[arg] == None or params[arg] == '': continue params[arg] = str(params[arg]) # load provenance provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] provenance[0]['input_ws_objects'] = [] for input_ref in params['input_refs']: provenance[0]['input_ws_objects'].append(input_ref) # set the output paths timestamp = int( (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000) output_dir = os.path.join(self.scratch, 'output.' + str(timestamp)) if not os.path.exists(output_dir): os.makedirs(output_dir) html_output_dir = os.path.join(output_dir, 'html') if not os.path.exists(html_output_dir): os.makedirs(html_output_dir) #### STEP 1: Parse population_percs and write to file ## abundance_str = params['population_percs'].strip() abundance_file_path = os.path.join(output_dir, 'my_abundances.txt') abundance_config_num_libs = 0 abundance_config_num_libs_set = False grinder_genome_ids = [] header = [] out_buf = [] for row in abundance_str.split("\n"): cols = re.split(r'\s+', row) if cols[0].upper() == "GENOME": for col in cols: if col == '': continue header.append(col) continue grinder_genome_ids.append(cols[0]) self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'") # DEBUG out_row = [] for col in cols: if col == '': continue elif col == '%': continue elif col.endswith('%'): col = col.rstrip('%') out_row.append(col) out_buf.append("\t".join(out_row)) num_samples = len(out_row) - 1 # first col is genome id if not abundance_config_num_libs_set: abundance_config_num_libs_set = True abundance_config_num_libs = num_samples elif num_samples != abundance_config_num_libs: invalid_msgs.append( "inconsistent number of samples in population_percs input field" ) # data validation if abundance_config_num_libs == 0: invalid_msgs.append( "unable to find sample percentages in population_percs input field" ) sample_sums = [] for row_i, abund_row_str in enumerate(out_buf): abund_row = abund_row_str.split() for sample_i, abund in enumerate(abund_row[1:]): if row_i == 0: sample_sums.append(0) #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i)) # DEBUG sample_sums[sample_i] += float(abund) for sample_i, sample_sum in enumerate(sample_sums): if sample_sum < 99.5 or sample_sum > 100.5: self.log( invalid_msgs, "Sample: " + str(sample_i + 1) + " " + header[sample_i + 1] + " proportions is not summing to 100.0. 
Summing to: " + str(sample_sum)) if len(invalid_msgs) == 0: with open(abundance_file_path, 'w') as abundance_fh: for out_line in out_buf: abundance_fh.write(out_line + "\n") # DEBUG with open(abundance_file_path, 'r') as abundance_fh: for out_line in abundance_fh.readlines(): out_line = out_line.rstrip() self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'") #### STEP 2: get genome scaffold sequences ## if len(invalid_msgs) == 0: genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna') read_buf_size = 65536 write_buf_size = 65536 accepted_input_types = ["KBaseGenomes.Genome"] genome_refs = params['input_refs'] genome_obj_names = [] genome_sci_names = [] assembly_refs = [] for i, input_ref in enumerate(genome_refs): # genome obj info try: [ OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I ] = range(11) # object_info tuple input_obj_info = wsClient.get_object_info_new( {'objects': [{ 'ref': input_ref }]})[0] input_obj_type = re.sub( '-[0-9]+\.[0-9]+$', "", input_obj_info[TYPE_I]) # remove trailing version genome_obj_names.append(input_obj_info[NAME_I]) except Exception as e: raise ValueError('Unable to get object from workspace: (' + input_ref + ')' + str(e)) if input_obj_type not in accepted_input_types: raise ValueError("Input object of type '" + input_obj_type + "' not accepted. Must be one of " + ", ".join(accepted_input_types)) # genome obj data try: genome_obj = wsClient.get_objects([{ 'ref': input_ref }])[0]['data'] genome_sci_names.append(genome_obj['scientific_name']) except: raise ValueError("unable to fetch genome: " + input_ref) # Get assembly_refs if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \ and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None): msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " MISSING BOTH contigset_ref AND assembly_ref. Cannot process. Exiting." 
self.log(console, msg) self.log(invalid_msgs, msg) continue elif 'assembly_ref' in genome_obj and genome_obj[ 'assembly_ref'] != None: msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " USING assembly_ref: " + str( genome_obj['assembly_ref']) self.log(console, msg) assembly_refs.append(genome_obj['assembly_ref']) elif 'contigset_ref' in genome_obj and genome_obj[ 'contigset_ref'] != None: msg = "Genome " + genome_obj_names[ i] + " (ref:" + input_ref + ") " + genome_sci_names[ i] + " USING contigset_ref: " + str( genome_obj['contigset_ref']) self.log(console, msg) assembly_refs.append(genome_obj['contigset_ref']) # get fastas for scaffolds if len(invalid_msgs) == 0: contig_file_paths = [] for genome_i, input_ref in enumerate(genome_refs): contig_file = auClient.get_assembly_as_fasta({ 'ref': assembly_refs[genome_i] }).get('path') sys.stdout.flush() contig_file_path = dfu.unpack_file({'file_path': contig_file})['file_path'] contig_file_paths.append(contig_file_path) # reformat FASTA IDs for Grinder with open(genomes_src_db_file_path, 'w', write_buf_size) as genomes_src_db_fh: for genome_i, contig_file_path in enumerate(contig_file_paths): #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path) # DEBUG #contig_ids = [] with open(contig_file_path, 'r', read_buf_size) as contig_fh: genome_seq = '' contig_seq = '' contig_seqs = [] for contig_line in contig_fh.readlines(): contig_line = contig_line.rstrip() if contig_line.startswith('>'): #contig_id = contig_line.strip()[1:].split(' ')[0] #contig_ids.append(contig_id) #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n") if contig_seq != '': contig_seqs.append(contig_seq) contig_seq = '' continue else: #genomes_src_db_fh.write(contig_line) contig_seq += contig_line if contig_seq != '': contig_seqs.append(contig_seq) contig_seq = '' # write joined contigs to file genome_seq = "NNNNNNNNNN".join( contig_seqs ) # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins genome_seq = genome_seq.upper( ) # grinder might require upper case? genomes_src_db_fh.write(">" + grinder_genome_ids[genome_i] + "\n") genomes_src_db_fh.write(genome_seq + "\n") genome_seq = '' contig_seqs = [] # DEBUG #for contig_id in contig_ids: # self.log(console, "\tCONTIG_ID: "+contig_id) # DEBUG # DEBUG toggle = 0 with open(genomes_src_db_file_path, 'r', write_buf_size) as genomes_src_db_fh: for contig_line in genomes_src_db_fh.readlines(): contig_line = contig_line.rstrip() if contig_line.startswith('>'): self.log(console, 'GENOMES_SRC_DB: ' + contig_line) genome_id = contig_line[1:] toggle = 0 elif toggle == 0: #elif genome_id == 'G3': self.log( console, 'GENOMES_SRC_DB: ' + contig_line[0:50] + '...') toggle += 1 #### STEP 3: Run Grinder ## if len(invalid_msgs) == 0: cmd = [] cmd.append(self.GRINDER) # output cmd.append('-base_name') cmd.append(params['output_name']) cmd.append('-output_dir') cmd.append(output_dir) # contigs input cmd.append('-reference_file') cmd.append(genomes_src_db_file_path) # abundances cmd.append('-abundance_file') cmd.append(abundance_file_path) # library size cmd.append('-total_reads') cmd.append(str(params['num_reads_per_lib'])) # num libraries (overridden by abundance file?) 
        #### STEP 3: Run Grinder
        ##
        if len(invalid_msgs) == 0:
            cmd = []
            cmd.append(self.GRINDER)
            # output
            cmd.append('-base_name')
            cmd.append(params['output_name'])
            cmd.append('-output_dir')
            cmd.append(output_dir)
            # contigs input
            cmd.append('-reference_file')
            cmd.append(genomes_src_db_file_path)
            # abundances
            cmd.append('-abundance_file')
            cmd.append(abundance_file_path)
            # library size
            cmd.append('-total_reads')
            cmd.append(str(params['num_reads_per_lib']))
            # num libraries (overridden by abundance file?)
            cmd.append('-num_libraries')
            cmd.append(str(abundance_config_num_libs))
            # read and insert lens
            cmd.append('-read_dist')
            cmd.append(str(params['read_len_mean']))
            cmd.append('normal')
            cmd.append(str(params['read_len_stddev']))
            if str(params['pairs_flag']) == '1':
                cmd.append('-insert_dist')
                cmd.append(str(params['insert_len_mean']))
                cmd.append('normal')
                cmd.append(str(params['insert_len_stddev']))
                # mate orientation
                cmd.append('-mate_orientation')
                cmd.append(params['mate_orientation'])
            # genome len bias
            cmd.append('-length_bias')
            cmd.append(str(params['len_bias_flag']))
            # mutation model
            cmd.append('-mutation_dist')
            cmd.append(str(params['mutation_dist']))
            cmd.append('-mutation_ratio')
            cmd.append(str(params['mutation_ratio']))
            # qual scores
            cmd.append('-fastq_output')
            cmd.append('1')
            cmd.append('-qual_levels')
            cmd.append(str(params['qual_good']))
            cmd.append(str(params['qual_bad']))
            # skip contig joins
            cmd.append('-exclude_chars')
            cmd.append('NX')
            # explicitly request bidirectional
            cmd.append('-unidirectional')
            cmd.append('0')
            # random seed
            if 'random_seed' in params \
                    and params['random_seed'] is not None \
                    and params['random_seed'] != '':
                cmd.append('-random_seed')
                cmd.append(str(params['random_seed']))

            # RUN
            cmd_str = " ".join(cmd)
            self.log(console, "===========================================")
            self.log(console, "RUNNING: " + cmd_str)
            self.log(console, "===========================================")

            cmdProcess = subprocess.Popen(cmd_str,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
            outputlines = []
            while True:
                line = cmdProcess.stdout.readline()
                outputlines.append(line)
                if not line:
                    break
                self.log(console, line.replace('\n', ''))

            cmdProcess.stdout.close()
            cmdProcess.wait()
            self.log(console, 'return code: ' + str(cmdProcess.returncode) + '\n')
            if cmdProcess.returncode != 0:
                raise ValueError('Error running kb_grinder, return code: ' +
                                 str(cmdProcess.returncode) + '\n')
            #report_text += "\n".join(outputlines)
            #report_text += "cmdstring: " + cmdstring + "  stdout: " + stdout + "  stderr " + stderr

            # capture output for report and paths to out files
            report_text_buf = []
            struct_file_paths = []
            struct_file_names = []
            fastq_file_paths = []

            for out_line in outputlines:
                out_line = out_line.rstrip()
                if 'Community structure' in out_line:
                    clean_line = out_line.lstrip()
                    struct_file_path = re.split(r'\s+', clean_line)[3]
                    struct_file_paths.append(struct_file_path)
                    struct_file_names.append(struct_file_path.split('/')[-1])
                    self.log(console, "STRUCT_FILE_NAME: '" + struct_file_path.split('/')[-1] + "'")  # DEBUG
                elif 'FASTQ file' in out_line:
                    clean_line = out_line.lstrip()
                    fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
                else:
                    report_text_buf.append(out_line)

            report_text += "\n".join(report_text_buf)
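            # For reference, the command assembled above ends up looking
            # roughly like this (a sketch with hypothetical parameter values):
            #
            #   grinder -base_name mg_sim -output_dir <output_dir> \
            #     -reference_file genomes.fna -abundance_file <abundance_file> \
            #     -total_reads 100000 -num_libraries 3 \
            #     -read_dist 150 normal 10 -insert_dist 400 normal 50 \
            #     -mate_orientation FR -length_bias 0 \
            #     -mutation_dist <mutation_dist> -mutation_ratio <mutation_ratio> \
            #     -fastq_output 1 -qual_levels 30 10 \
            #     -exclude_chars NX -unidirectional 0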
        #### STEP 4: Upload Read Libs and create reads set
        ##
        if len(invalid_msgs) == 0:
            lib_obj_refs = []
            lib_obj_names = []
            readsSet_items = []

            for sample_i, fastq_file_path in enumerate(fastq_file_paths):
                if not os.path.isfile(fastq_file_path) \
                        or os.path.getsize(fastq_file_path) == 0:
                    raise ValueError("empty read lib generated: " + fastq_file_path)
                else:
                    # lib obj name (single library: no per-sample suffix)
                    if len(fastq_file_paths) == 1:
                        output_obj_name = params['output_name']
                    else:
                        if str(params['pairs_flag']) == '1':
                            output_obj_name = params['output_name'] + \
                                '-sample' + str(sample_i + 1) + ".PairedEndLib"
                        else:
                            output_obj_name = params['output_name'] + \
                                '-sample' + str(sample_i + 1) + ".SingleEndLib"
                    lib_obj_names.append(output_obj_name)

                    # upload lib and get obj ref
                    self.log(console, 'Uploading simulated reads: ' + output_obj_name)
                    sequencing_tech = 'artificial reads'
                    if str(params['pairs_flag']) == '1':
                        interleaved = 1
                    else:
                        interleaved = 0
                    lib_obj_ref = readsUtils_Client.upload_reads({
                        'wsname': str(params['workspace_name']),
                        'name': output_obj_name,
                        'fwd_file': fastq_file_path,
                        'interleaved': interleaved,
                        'sequencing_tech': sequencing_tech
                    })['obj_ref']
                    lib_obj_refs.append(lib_obj_ref)
                    os.remove(fastq_file_path)  # free up disk

                    # add to readsSet
                    readsSet_items.append({'ref': lib_obj_ref,
                                           'label': output_obj_name})

            # create readsset
            readsSet_obj_ref = None
            if len(lib_obj_refs) > 1:
                readsSet_obj = {
                    'description': "Grinder Metagenome from " + " ".join(genome_obj_names),
                    'items': readsSet_items
                }
                readsSet_obj_name = params['output_name']
                readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name': params['workspace_name'],
                    'output_object_name': readsSet_obj_name,
                    'data': readsSet_obj
                })['set_ref']

        #### STEP 5: Build report
        ##
        reportName = 'kb_grinder_report_' + str(uuid.uuid4())
        reportObj = {'objects_created': [],
                     #'text_message': '',  # or is it 'message'?
                     'message': '',  # or is it 'text_message'?
                     'direct_html': '',
                     #'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }

        # message
        if len(invalid_msgs) > 0:
            report_text = "\n".join(invalid_msgs)
        reportObj['message'] = report_text

        if len(invalid_msgs) == 0:
            # objs
            if readsSet_obj_ref is not None:
                reportObj['objects_created'].append({
                    'ref': readsSet_obj_ref,
                    'desc': params['output_name'] + " ReadsSet"})
            for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
                reportObj['objects_created'].append({
                    'ref': lib_obj_ref,
                    'desc': lib_obj_names[lib_obj_i]})

            # downloadable data
            for data_i, data_path in enumerate(struct_file_paths):
                try:
                    upload_ret = dfu.file_to_shock({'file_path': data_path,
                                                    #'pack': 'zip'})
                                                    'make_handle': 0})
                except Exception:
                    raise ValueError('error uploading ' + data_path + ' file to shock')
                reportObj['file_links'].append({'shock_id': upload_ret['shock_id'],
                                                'name': struct_file_names[data_i],
                                                'label': struct_file_names[data_i]})

            # html report
            """
            try:
                html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir,
                                                     'make_handle': 0,
                                                     'pack': 'zip'})
            except:
                raise ValueError('error uploading html report to shock')
            reportObj['direct_html_link_index'] = 0
            reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                        'name': html_file,
                                        'label': params['output_name'] + ' HTML'
                                        }
                                       ]
            """

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
        #report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})
        report_info = reportClient.create_extended_report(reportObj)

        returnVal = {'report_name': report_info['name'],
                     'report_ref': report_info['ref']
                     }
        #END KButil_Build_InSilico_Metagenomes_with_Grinder

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method KButil_Build_InSilico_Metagenomes_with_Grinder return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
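# Example invocation (a sketch, assuming a configured SDK test harness; `impl`
# and `ctx` are hypothetical stand-ins for the service Impl instance and the
# MethodContext used elsewhere in these tests, and the abundance configuration
# params handled above are elided):
#
#   ret = impl.KButil_Build_InSilico_Metagenomes_with_Grinder(ctx, {
#       'workspace_name': 'test_workspace',
#       'input_refs': [genome_ref_1, genome_ref_2],
#       'output_name': 'mg_sim',
#       'num_reads_per_lib': 100000,
#       'read_len_mean': 150, 'read_len_stddev': 10,
#       'pairs_flag': 1, 'insert_len_mean': 400, 'insert_len_stddev': 50,
#       'mate_orientation': 'FR', 'len_bias_flag': 0,
#       'mutation_dist': 'poly4 3e-3 3.3e-8', 'mutation_ratio': '80 20',
#       'qual_good': 30, 'qual_bad': 10,
#   })[0]
#   # -> {'report_name': ..., 'report_ref': ...}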
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check whether the fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """
        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file, 'rev_file': rev_file}
        return fastq_file_path
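    # NOTE on the layout above: with '--split-3 -T', fastq-dump writes each
    # read set into its own subdirectory rather than into flat files. A sketch
    # of the expected tree for an archive named SRR12345.sra:
    #
    #   <tmp_dir>/SRR12345/1/fastq   # forward reads (paired-end runs)
    #   <tmp_dir>/SRR12345/2/fastq   # reverse reads (paired-end runs)
    #   <tmp_dir>/SRR12345/fastq     # single-end runs
    #
    # which is why _check_fastq_dump_result only tests for the '1' subdirectory.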
    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validates file availability in user's staging area
        """
        pass
        # TODO: ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)

    def import_sra_from_staging(self, params):
        '''
        import_sra_from_staging: import an SRA file from the user's staging
                                 area, convert it to FASTQ and upload it as a
                                 reads object via ReadsUtils.upload_reads

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID of the object

        Optional Params:
        single_genome: whether the reads are from a single genome or a metagenome
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''
        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        return returnVal

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params: validates params passed to
                                                 import_sra_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(params.get('staging_file_subdir_path'))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object reference (return of import_sra_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        """
        uuid_string = str(uuid.uuid4())

        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        number_of_reads = object_data.get('data')[0].get('data').get('read_count')

        upload_message += "Reads Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Reads File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        if isinstance(number_of_reads, (int, long)):
            upload_message += 'Number of Reads: {:,}\n'.format(number_of_reads)

        report_params = {'message': upload_message,
                         'workspace_name': params.get('workspace_name'),
                         'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}

        return report_output
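# Example usage (a sketch, assuming this runs inside an SDK job container where
# the callback URL and auth token are available; the staging path and object
# names are hypothetical):
#
#   importer = ImportSRAUtil({
#       'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
#       'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN'],
#       'scratch': '/kb/module/work/tmp',
#   })
#   ret = importer.import_sra_from_staging({
#       'staging_file_subdir_path': 'subdir_1/my_reads.sra',
#       'sequencing_tech': 'Illumina',
#       'name': 'my_reads',
#       'workspace_name': 'my_workspace',
#   })
#   report = importer.generate_report(ret['obj_ref'], {
#       'staging_file_subdir_path': 'subdir_1/my_reads.sra',
#       'workspace_name': 'my_workspace',
#   })
#   # -> {'report_name': ..., 'report_ref': ...}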