ref_id , fasta_file = rnaseq_util.get_fa_from_genome(logger,ws_client,self.urls,params['ws_id'],hisat2_dir,params['genome_id']) ### Build Index for the fasta file hisat2base = os.path.basename(fasta_file) #hisat2base =os.path.join(hisat2_dir,handler_util.get_file_with_suffix(hisat2_dir,".fa")) hisat2base_cmd = '{0} {1}'.format(fasta_file,hisat2base) try: logger.info("Building Index for Hisat2 {0}".format(hisat2base_cmd)) cmdline_output = script_util.runProgram(logger,"hisat2-build",hisat2base_cmd,None,hisat2_dir) except Exception,e: raise Exception("Failed to run command {0}".format(hisat2base_cmd)) ### Check if GTF object exists in the workspace pull the gtf ws_gtf = params['genome_id']+"_GTF" gtf_file = script_util.check_and_download_existing_handle_obj(logger,ws_client,self.urls,params['ws_id'],ws_gtf,"KBaseRNASeq.GFFAnnotation",hisat2_dir,token) if gtf_file is None: rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],ref_id,params['genome_id'],hisat2_dir,token) # Determine the num_threads provided by the user otherwise default the number of threads to 2 logger.info(" Number of threads used by each process {0}".format(self.num_threads)) task_param = {'job_id' : params['sampleset_id'], 'label' : r_label, 'ws_id' : params['ws_id'], 'reads_type' : sample_type, 'hisat2_dir' : self.directory, 'annotation_id': ref_id, 'sampleset_id' : None } self.task_list.append(task_param) def collect(self) : # do with
except Exception, e: raise Exception( "Failed to run command {0}".format(bowtie2base_cmd)) ### Check if GTF object exists in the workspace pull the gtf ref_id = bowtie_index['data']['genome_id'] genome_name = ws_client.get_object_info_new( {"objects": [{ 'ref': ref_id }]})[0][1] ws_gtf = genome_name + "_GTF" gtf_file = script_util.check_and_download_existing_handle_obj( logger, ws_client, self.urls, params['ws_id'], ws_gtf, "KBaseRNASeq.GFFAnnotation", bowtie2_dir, token) if gtf_file is None: rnaseq_util.create_gtf_annotation_from_genome( logger, ws_client, hs, self.urls, params['ws_id'], ref_id, genome_name, bowtie2_dir, token) count = 0 logger.info(" Number of threads used by each process {0}".format( self.num_threads)) for i in reads: try: label = r_label[count] task_param = { 'job_id': i, 'label': r_label[count], 'ws_id': params['ws_id'], 'bowtie2_dir': self.directory, 'annotation_id': ref_id, # Changed annotation_id to ref_id for genome object
ws_gtf = annotation_gtf+"_GTF_Annotation" ret = script_util.if_obj_exists(None,ws_client,params['ws_id'],"KBaseRNASeq.GFFAnnotation",[ws_gtf]) if not ret is None: logger.info("GFF Annotation Exist for Genome Annotation {0}.... Skipping step ".format(annotation_gtf)) annot_name,annot_id = ret[0] gtf_obj=ws_client.get_objects([{'ref' : annot_id}])[0] gtf_id=gtf_obj['data']['handle']['id'] gtf_name=gtf_obj['data']['handle']['file_name'] try: script_util.download_file_from_shock(logger, shock_service_url=self.urls['shock_service_url'], shock_id=gtf_id,filename=gtf_name, directory=tophat_dir,token=token) gtf_file = os.path.join(tophat_dir,gtf_name) except Exception,e: logger.exception(e) raise Exception( "Unable to download shock file, {0}".format(gtf_name)) else: gtf_file =rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],genome_id,annotation_gtf,tophat_dir,token) # Determine the num_threads provided by the user otherwise default the number of threads to 2 self.num_jobs = 1 logger.info(" Number of threads used by each process {0}".format(self.num_threads)) task_param = {'job_id' : params['sampleset_id'], 'label' : 'Single-Sample', 'ws_id' : params['ws_id'], 'reads_type' : sample_type, 'tophat_dir' : self.directory, 'gtf_file': gtf_file, 'annotation_id': genome_id, 'sampleset_id' : None } self.task_list.append(task_param)
class StringTieSampleSet(StringTie):
    """StringTie driver for an alignment set.

    ``prepare`` reads the alignment-set object named by
    ``method_params['alignmentset_id']``, makes sure a GTF annotation for its
    genome is available locally, and appends one task description per mapped
    alignment to ``self.task_list``.
    """

    def __init__(self, logger, directory, urls, max_cores):
        """Forward all arguments to the base class and set shared defaults."""
        pprint(self.__class__)
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed, because
        # self.__class__ is then the subclass, not this class.
        super(StringTieSampleSet, self).__init__(logger, directory, urls,
                                                 max_cores)
        # user defined shared variables across methods
        self.alignmentset_info = None
        self.num_threads = 1

    def prepare(self):
        """Fetch the alignment set and build one task per mapped alignment.

        Reads ``self.common_params`` (``ws_client``, ``hs_client``,
        ``user_token``) and ``self.method_params`` (``ws_id``,
        ``alignmentset_id`` plus tool options).

        Sets ``self.alignmentset_info``, ``self.tool_opts``,
        ``self.sampleset_id``, ``self.align_names`` and ``self.num_jobs``,
        and appends to ``self.task_list``.

        Raises:
            Exception: if the alignment set cannot be read from the
                workspace.
            ValueError: if the set holds fewer than two alignments.
        """
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        stringtie_dir = self.directory

        set_selector = {
            'name': params['alignmentset_id'],
            'workspace': params['ws_id']
        }
        try:
            a_sampleset = ws_client.get_objects([set_selector])[0]
            a_sampleset_info = ws_client.get_object_info_new(
                {"objects": [set_selector]})[0]
            self.alignmentset_info = a_sampleset_info
            # Reference string built from info fields 6, 0 and 4
            # (presumably workspace id / object id / version -- confirm
            # against the Workspace object_info tuple layout).
            alignmentset_id = '/'.join(
                str(a_sampleset_info[idx]) for idx in (6, 0, 4))
        except Exception:
            logger.exception("".join(traceback.format_exc()))
            raise Exception("Error Downloading objects from the workspace ")

        # Check if the GTF annotation already exists in the workspace; if it
        # does, download it, otherwise create it from the genome.
        genome_id = a_sampleset['data']['genome_id']
        genome_name = ws_client.get_object_info(
            [{"ref": genome_id}], includeMetadata=None)[0][1]
        ws_gtf = genome_name + "_GTF_Annotation"
        gtf_file = script_util.check_and_download_existing_handle_obj(
            logger, ws_client, self.urls, params['ws_id'], ws_gtf,
            "KBaseRNASeq.GFFAnnotation", stringtie_dir, token)
        if gtf_file is None:
            # Keep the returned value (as the TopHat code paths do);
            # discarding it left 'gtf_file' as None in every task.
            gtf_file = rnaseq_util.create_gtf_annotation_from_genome(
                logger, ws_client, hs, self.urls, params['ws_id'], genome_id,
                genome_name, stringtie_dir, token)
        gtf_info = ws_client.get_object_info_new(
            {"objects": [{
                'name': ws_gtf,
                'workspace': params['ws_id']
            }]})[0]
        gtf_id = '/'.join(str(gtf_info[idx]) for idx in (6, 0, 4))

        # Every remaining non-None parameter is kept as a stringified tool
        # option. items() works on both Python 2 and Python 3.
        reserved_keys = ('ws_id', 'alignmentset_id', 'num_threads')
        self.tool_opts = {
            k: str(v)
            for k, v in params.items()
            if k not in reserved_keys and v is not None
        }

        alignment_ids = a_sampleset['data']['sample_alignments']
        m_alignment_names = a_sampleset['data']['mapped_rnaseq_alignments']
        self.sampleset_id = a_sampleset['data']['sampleset_id']

        # Flatten the list of mappings into the list of their values.
        self.align_names = [
            name
            for d_align in m_alignment_names
            for name in d_align.values()
        ]

        m_alignment_ids = a_sampleset['data']['mapped_alignments_ids']
        self.num_jobs = len(alignment_ids)
        if self.num_jobs < 2:
            raise ValueError(
                "Please ensure you have atleast 2 alignments to run Stringtie in Set mode"
            )
        logger.info(" Number of threads used by each process {0}".format(
            self.num_threads))

        # One task per (sample name -> alignment id) pair.
        for mapping in m_alignment_ids:
            for sample_name, alignment_id in mapping.items():
                task_param = {
                    'job_id': alignment_id,
                    'gtf_file': gtf_file,
                    'ws_id': params['ws_id'],
                    'genome_id': genome_id,
                    'stringtie_dir': self.directory,
                    'annotation_id': gtf_id,
                    'sample_id': sample_name,
                    'alignmentset_id': alignmentset_id
                }
                self.task_list.append(task_param)
script_util.move_files(logger,mv_dir,tophat_dir) except Exception, e: logger.error("".join(traceback.format_exc())) raise Exception("Unzip indexfile error") fasta_file =os.path.join(tophat_dir,(handler_util.get_file_with_suffix(tophat_dir,".fa")+".fa")) bowtie2base =os.path.join(tophat_dir,handler_util.get_file_with_suffix(tophat_dir,".rev.1.bt2")) ### Check if GTF annotation object exist or skip this step ### Check if the gtf object exists in the workspace ### Only run create_gtf_annotation if object doesnt exist ws_gtf = annotation_gtf+"_GTF_Annotation" genome_name = script_util.ws_get_obj_name( logger, ws_client, params['ws_id'], genome_id ) gtf_file = script_util.check_and_download_existing_handle_obj(logger,ws_client,self.urls,params['ws_id'],ws_gtf,"KBaseRNASeq.GFFAnnotation",tophat_dir,token) if gtf_file is None: gtf_file = rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],genome_id,genome_name,tophat_dir,token) #ret = script_util.if_obj_exists(None,ws_client,params['ws_id'],"KBaseRNASeq.GFFAnnotation",[ws_gtf]) # this line should be safe from reference #if not ret is None: # logger.info("GFF Annotation Exist for Genome Annotation {0}.... Skipping step ".format(annotation_gtf)) # annot_name,annot_id = ret[0] # gtf_obj=ws_client.get_objects([{'ref' : annot_id}])[0] # gtf_id=gtf_obj['data']['handle']['id'] # gtf_name=gtf_obj['data']['handle']['file_name'] # try: # script_util.download_file_from_shock(logger, shock_service_url=self.urls['shock_service_url'], shock_id=gtf_id,filename=gtf_name, directory=tophat_dir,token=token) # gtf_file = os.path.join(tophat_dir,gtf_name) # except Exception,e: # logger.exception(e) # raise Exception( "Unable to download shock file, {0}".format(gtf_name)) #else: # gtf_file =rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],genome_id,annotation_gtf,tophat_dir,token)
class StringTieSample(StringTie):
    """StringTie driver for a single alignment.

    ``prepare`` reads the alignment object named by
    ``method_params['alignmentset_id']``, makes sure a GTF annotation for its
    genome is available locally, and appends exactly one task description to
    ``self.task_list``.
    """

    def __init__(self, logger, directory, urls, max_cores):
        """Forward all arguments to the base class and set shared defaults."""
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed, because
        # self.__class__ is then the subclass, not this class.
        super(StringTieSample, self).__init__(logger, directory, urls,
                                              max_cores)
        # user defined shared variables across methods
        self.alignment_info = None
        self.num_threads = 1

    def prepare(self):
        """Fetch the alignment and build the single StringTie task.

        Reads ``self.common_params`` (``ws_client``, ``hs_client``,
        ``user_token``) and ``self.method_params`` (``ws_id``,
        ``alignmentset_id`` plus tool options).

        Sets ``self.alignment_info``, ``self.tool_opts`` and
        ``self.num_jobs`` (always 1), and appends to ``self.task_list``.

        Raises:
            Exception: if the alignment cannot be read from the workspace.
        """
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        stringtie_dir = self.directory

        # The single alignment is looked up under the 'alignmentset_id'
        # parameter, exactly as in the original code.
        alignment_selector = {
            'name': params['alignmentset_id'],
            'workspace': params['ws_id']
        }
        try:
            a_sample = ws_client.get_objects([alignment_selector])[0]
            a_alignment_info = ws_client.get_object_info_new(
                {"objects": [alignment_selector]})[0]
            self.alignment_info = a_alignment_info
            # Reference string built from info fields 6, 0 and 4
            # (presumably workspace id / object id / version -- confirm
            # against the Workspace object_info tuple layout).
            s_alignment_id = '/'.join(
                str(a_alignment_info[idx]) for idx in (6, 0, 4))
        except Exception:
            logger.exception("".join(traceback.format_exc()))
            raise Exception("Error Downloading objects from the workspace ")

        read_sample_id = a_sample['data']['read_sample_id']

        # Check if the GTF annotation already exists in the workspace; if it
        # does, download it, otherwise create it from the genome.
        genome_id = a_sample['data']['genome_id']
        genome_name = ws_client.get_object_info(
            [{"ref": genome_id}], includeMetadata=None)[0][1]
        ws_gtf = genome_name + "_GTF_Annotation"
        gtf_file = script_util.check_and_download_existing_handle_obj(
            logger, ws_client, self.urls, params['ws_id'], ws_gtf,
            "KBaseRNASeq.GFFAnnotation", stringtie_dir, token)
        if gtf_file is None:
            # Keep the returned value (as the TopHat code paths do);
            # discarding it left 'gtf_file' as None in the task.
            gtf_file = rnaseq_util.create_gtf_annotation_from_genome(
                logger, ws_client, hs, self.urls, params['ws_id'], genome_id,
                genome_name, stringtie_dir, token)
        gtf_info = ws_client.get_object_info_new(
            {"objects": [{
                'name': ws_gtf,
                'workspace': params['ws_id']
            }]})[0]
        gtf_id = '/'.join(str(gtf_info[idx]) for idx in (6, 0, 4))

        # Every remaining non-None parameter is kept as a stringified tool
        # option. items() works on both Python 2 and Python 3.
        reserved_keys = ('ws_id', 'alignmentset_id', 'num_threads')
        self.tool_opts = {
            k: str(v)
            for k, v in params.items()
            if k not in reserved_keys and v is not None
        }

        self.num_jobs = 1
        logger.info(" Number of threads used by each process {0}".format(
            self.num_threads))
        task_param = {
            'job_id': s_alignment_id,
            'gtf_file': gtf_file,
            'ws_id': params['ws_id'],
            'genome_id': genome_id,
            'stringtie_dir': self.directory,
            'annotation_id': gtf_id,
            'sample_id': read_sample_id,
            'alignmentset_id': None
        }
        self.task_list.append(task_param)