Exemple #1
0
	ref_id , fasta_file =  rnaseq_util.get_fa_from_genome(logger,ws_client,self.urls,params['ws_id'],hisat2_dir,params['genome_id'])

	### Build Index for the fasta file
	hisat2base = os.path.basename(fasta_file) 
        #hisat2base =os.path.join(hisat2_dir,handler_util.get_file_with_suffix(hisat2_dir,".fa"))
        hisat2base_cmd = '{0} {1}'.format(fasta_file,hisat2base)
	try:
            logger.info("Building Index for Hisat2 {0}".format(hisat2base_cmd))
            cmdline_output = script_util.runProgram(logger,"hisat2-build",hisat2base_cmd,None,hisat2_dir)
        except Exception,e:
            raise Exception("Failed to run command {0}".format(hisat2base_cmd))
        ### Check if GTF object exists in the workspace pull the gtf
	ws_gtf = params['genome_id']+"_GTF"
	gtf_file = script_util.check_and_download_existing_handle_obj(logger,ws_client,self.urls,params['ws_id'],ws_gtf,"KBaseRNASeq.GFFAnnotation",hisat2_dir,token)
        if gtf_file is None:
             rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],ref_id,params['genome_id'],hisat2_dir,token)
	# Determine the num_threads provided by the user otherwise default the number of threads to 2
        logger.info(" Number of threads used by each process {0}".format(self.num_threads))
	task_param = {'job_id' : params['sampleset_id'],
                      'label' : r_label,
                      'ws_id' : params['ws_id'],
                      'reads_type' : sample_type,
                      'hisat2_dir' : self.directory,
                      'annotation_id': ref_id,
                      'sampleset_id' : None
                      }
	self.task_list.append(task_param)
	
		
    def collect(self) :
        # do with 
Exemple #2
0
        except Exception, e:
            raise Exception(
                "Failed to run command {0}".format(bowtie2base_cmd))
        ### Check if GTF object exists in the workspace pull the gtf
        ref_id = bowtie_index['data']['genome_id']
        genome_name = ws_client.get_object_info_new(
            {"objects": [{
                'ref': ref_id
            }]})[0][1]
        ws_gtf = genome_name + "_GTF"
        gtf_file = script_util.check_and_download_existing_handle_obj(
            logger, ws_client, self.urls, params['ws_id'], ws_gtf,
            "KBaseRNASeq.GFFAnnotation", bowtie2_dir, token)
        if gtf_file is None:
            rnaseq_util.create_gtf_annotation_from_genome(
                logger, ws_client, hs, self.urls, params['ws_id'], ref_id,
                genome_name, bowtie2_dir, token)

        count = 0
        logger.info(" Number of threads used by each process {0}".format(
            self.num_threads))
        for i in reads:
            try:
                label = r_label[count]
                task_param = {
                    'job_id': i,
                    'label': r_label[count],
                    'ws_id': params['ws_id'],
                    'bowtie2_dir': self.directory,
                    'annotation_id':
                    ref_id,  # Changed annotation_id to ref_id for genome object 
	ws_gtf = annotation_gtf+"_GTF_Annotation"
	ret = script_util.if_obj_exists(None,ws_client,params['ws_id'],"KBaseRNASeq.GFFAnnotation",[ws_gtf])
        if not ret is None:
            logger.info("GFF Annotation Exist for Genome Annotation {0}.... Skipping step ".format(annotation_gtf))
	    annot_name,annot_id = ret[0]
            gtf_obj=ws_client.get_objects([{'ref' : annot_id}])[0]
            gtf_id=gtf_obj['data']['handle']['id']
            gtf_name=gtf_obj['data']['handle']['file_name']
            try:
               script_util.download_file_from_shock(logger, shock_service_url=self.urls['shock_service_url'], shock_id=gtf_id,filename=gtf_name, directory=tophat_dir,token=token)
               gtf_file = os.path.join(tophat_dir,gtf_name)
            except Exception,e:
	       logger.exception(e)
               raise Exception( "Unable to download shock file, {0}".format(gtf_name))  
 	else:		
	    gtf_file =rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],genome_id,annotation_gtf,tophat_dir,token)		
	# Determine the num_threads provided by the user otherwise default the number of threads to 2
        self.num_jobs =  1

        logger.info(" Number of threads used by each process {0}".format(self.num_threads))
	task_param = {'job_id' : params['sampleset_id'],
                      'label' : 'Single-Sample',
                      'ws_id' : params['ws_id'],
                      'reads_type' : sample_type,
                      'tophat_dir' : self.directory,
                      'gtf_file': gtf_file,
                      'annotation_id': genome_id,
                      'sampleset_id' : None
                      }
	self.task_list.append(task_param)
	
class StringTieSampleSet(StringTie):
    """Prepare StringTie tasks for every alignment of an alignment set.

    NOTE(review): ``common_params``, ``method_params``, ``task_list``,
    ``logger``, ``directory`` and ``urls`` are assumed to be provided by the
    ``StringTie`` base class / its caller -- confirm against the base class.
    """

    def __init__(self, logger, directory, urls, max_cores):
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the *runtime* class and recurses forever once this class is
        # subclassed.
        super(StringTieSampleSet, self).__init__(logger, directory, urls,
                                                 max_cores)
        # user defined shared variables across methods
        self.alignmentset_info = None
        self.num_threads = 1

    def prepare(self):
        """Fill ``self.task_list`` with one task dict per mapped alignment.

        Fetches the alignment set named ``method_params['alignmentset_id']``
        from workspace ``method_params['ws_id']``, asks
        ``script_util.check_and_download_existing_handle_obj`` for the
        genome's ``<genome_name>_GTF_Annotation`` object and, when that
        returns ``None``, calls
        ``rnaseq_util.create_gtf_annotation_from_genome`` instead.

        Also sets ``self.alignmentset_info``, ``self.tool_opts``,
        ``self.sampleset_id``, ``self.align_names`` and ``self.num_jobs``.

        Raises:
            Exception: if the alignment set cannot be read from the
                workspace.
            ValueError: if the set contains fewer than two alignments.
        """
        # for quick testing, we recover parameters here
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        stringtie_dir = self.directory
        try:
            a_sampleset = ws_client.get_objects([{
                'name': params['alignmentset_id'],
                'workspace': params['ws_id']
            }])[0]
            a_sampleset_info = ws_client.get_object_info_new({
                "objects": [{
                    'name': params['alignmentset_id'],
                    'workspace': params['ws_id']
                }]
            })[0]
            self.alignmentset_info = a_sampleset_info
            # Reference string built from info-tuple slots 6, 0 and 4
            # (presumably workspace id / object id / version -- confirm
            # against the Workspace object_info layout).
            alignmentset_id = '/'.join(
                str(a_sampleset_info[idx]) for idx in (6, 0, 4))
        except Exception:
            logger.exception(traceback.format_exc())
            raise Exception("Error Downloading objects from the workspace ")

        ### Check if the gtf file exists in the workspace. if exists download the file from that
        genome_id = a_sampleset['data']['genome_id']
        genome_name = ws_client.get_object_info(
            [{"ref": genome_id}], includeMetadata=None)[0][1]
        ws_gtf = genome_name + "_GTF_Annotation"
        gtf_file = script_util.check_and_download_existing_handle_obj(
            logger, ws_client, self.urls, params['ws_id'], ws_gtf,
            "KBaseRNASeq.GFFAnnotation", stringtie_dir, token)
        if gtf_file is None:
            # Keep the result: previously it was discarded, which left
            # gtf_file as None in every task whenever the annotation had to
            # be created here.
            gtf_file = rnaseq_util.create_gtf_annotation_from_genome(
                logger, ws_client, hs, self.urls, params['ws_id'], genome_id,
                genome_name, stringtie_dir, token)
        gtf_info = ws_client.get_object_info_new(
            {"objects": [{
                'name': ws_gtf,
                'workspace': params['ws_id']
            }]})[0]
        gtf_id = '/'.join(str(gtf_info[idx]) for idx in (6, 0, 4))

        # Everything in params except the bookkeeping keys is forwarded to
        # the tool as a string option; unset (None) values are dropped.
        reserved_keys = ('ws_id', 'alignmentset_id', 'num_threads')
        self.tool_opts = {
            k: str(v)
            for k, v in params.items()
            if k not in reserved_keys and v is not None
        }
        alignment_ids = a_sampleset['data']['sample_alignments']
        m_alignment_names = a_sampleset['data']['mapped_rnaseq_alignments']
        self.sampleset_id = a_sampleset['data']['sampleset_id']
        ### Get List of Alignments Names
        self.align_names = []
        for d_align in m_alignment_names:
            self.align_names.extend(d_align.values())

        m_alignment_ids = a_sampleset['data']['mapped_alignments_ids']
        self.num_jobs = len(alignment_ids)
        if self.num_jobs < 2:
            raise ValueError(
                "Please ensure you have atleast 2 alignments to run Stringtie in Set mode"
            )

        logger.info(" Number of threads used by each process {0}".format(
            self.num_threads))
        # Each entry of m_alignment_ids is iterated as a dict whose items
        # are (sample name, alignment id) pairs.
        for mapping in m_alignment_ids:
            for sample_name, alignment_id in mapping.items():
                self.task_list.append({
                    'job_id': alignment_id,
                    'gtf_file': gtf_file,
                    'ws_id': params['ws_id'],
                    'genome_id': genome_id,
                    'stringtie_dir': self.directory,
                    'annotation_id': gtf_id,
                    'sample_id': sample_name,
                    'alignmentset_id': alignmentset_id
                })
                    script_util.move_files(logger,mv_dir,tophat_dir)
        except Exception, e:
               logger.error("".join(traceback.format_exc()))
               raise Exception("Unzip indexfile error")
        fasta_file =os.path.join(tophat_dir,(handler_util.get_file_with_suffix(tophat_dir,".fa")+".fa"))
        bowtie2base =os.path.join(tophat_dir,handler_util.get_file_with_suffix(tophat_dir,".rev.1.bt2"))

	### Check if GTF annotation object exist or skip this step
	### Check if the gtf object exists in the workspace
        ### Only run create_gtf_annotation if object doesnt exist
	ws_gtf = annotation_gtf+"_GTF_Annotation"

        genome_name = script_util.ws_get_obj_name( logger, ws_client, params['ws_id'], genome_id )
        gtf_file = script_util.check_and_download_existing_handle_obj(logger,ws_client,self.urls,params['ws_id'],ws_gtf,"KBaseRNASeq.GFFAnnotation",tophat_dir,token)
        if gtf_file is None:
            gtf_file = rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],genome_id,genome_name,tophat_dir,token)
	#ret = script_util.if_obj_exists(None,ws_client,params['ws_id'],"KBaseRNASeq.GFFAnnotation",[ws_gtf]) # this line should be safe from reference
        #if not ret is None:
        #    logger.info("GFF Annotation Exist for Genome Annotation {0}.... Skipping step ".format(annotation_gtf))
	#    annot_name,annot_id = ret[0]
        #    gtf_obj=ws_client.get_objects([{'ref' : annot_id}])[0]
        #    gtf_id=gtf_obj['data']['handle']['id']
        #    gtf_name=gtf_obj['data']['handle']['file_name']
        #    try:
        #       script_util.download_file_from_shock(logger, shock_service_url=self.urls['shock_service_url'], shock_id=gtf_id,filename=gtf_name, directory=tophat_dir,token=token)
        #       gtf_file = os.path.join(tophat_dir,gtf_name)
        #    except Exception,e:
	#       logger.exception(e)
        #       raise Exception( "Unable to download shock file, {0}".format(gtf_name))  
 	#else:		
	#    gtf_file =rnaseq_util.create_gtf_annotation_from_genome(logger,ws_client,hs,self.urls,params['ws_id'],genome_id,annotation_gtf,tophat_dir,token)		
Exemple #6
0
class StringTieSample(StringTie):
    """Prepare the single StringTie task for one alignment object.

    NOTE(review): ``common_params``, ``method_params``, ``task_list``,
    ``logger``, ``directory`` and ``urls`` are assumed to be provided by the
    ``StringTie`` base class / its caller -- confirm against the base class.
    """

    def __init__(self, logger, directory, urls, max_cores):
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the *runtime* class and recurses forever once this class is
        # subclassed.
        super(StringTieSample, self).__init__(logger, directory, urls,
                                              max_cores)
        # user defined shared variables across methods
        self.alignment_info = None
        self.num_threads = 1

    def prepare(self):
        """Append exactly one task dict to ``self.task_list``.

        Fetches the alignment named ``method_params['alignmentset_id']`` from
        workspace ``method_params['ws_id']``, asks
        ``script_util.check_and_download_existing_handle_obj`` for the
        genome's ``<genome_name>_GTF_Annotation`` object and, when that
        returns ``None``, calls
        ``rnaseq_util.create_gtf_annotation_from_genome`` instead.

        Also sets ``self.alignment_info``, ``self.tool_opts`` and
        ``self.num_jobs`` (always 1).

        Raises:
            Exception: if the alignment cannot be read from the workspace.
        """
        # for quick testing, we recover parameters here
        ws_client = self.common_params['ws_client']
        hs = self.common_params['hs_client']
        params = self.method_params
        logger = self.logger
        token = self.common_params['user_token']
        stringtie_dir = self.directory
        try:
            a_sample = ws_client.get_objects([{
                'name': params['alignmentset_id'],
                'workspace': params['ws_id']
            }])[0]
            a_alignment_info = ws_client.get_object_info_new({
                "objects": [{
                    'name': params['alignmentset_id'],
                    'workspace': params['ws_id']
                }]
            })[0]
            self.alignment_info = a_alignment_info
            # Reference string built from info-tuple slots 6, 0 and 4
            # (presumably workspace id / object id / version -- confirm
            # against the Workspace object_info layout).
            s_alignment_id = '/'.join(
                str(a_alignment_info[idx]) for idx in (6, 0, 4))
        except Exception:
            logger.exception(traceback.format_exc())
            raise Exception("Error Downloading objects from the workspace ")

        read_sample_id = a_sample['data']['read_sample_id']
        ### Check if the gtf file exists in the workspace. if exists download the file from that
        genome_id = a_sample['data']['genome_id']
        genome_name = ws_client.get_object_info(
            [{"ref": genome_id}], includeMetadata=None)[0][1]
        ws_gtf = genome_name + "_GTF_Annotation"
        gtf_file = script_util.check_and_download_existing_handle_obj(
            logger, ws_client, self.urls, params['ws_id'], ws_gtf,
            "KBaseRNASeq.GFFAnnotation", stringtie_dir, token)
        if gtf_file is None:
            # Keep the result: previously it was discarded, which left
            # gtf_file as None in the task whenever the annotation had to be
            # created here.
            gtf_file = rnaseq_util.create_gtf_annotation_from_genome(
                logger, ws_client, hs, self.urls, params['ws_id'], genome_id,
                genome_name, stringtie_dir, token)
        gtf_info = ws_client.get_object_info_new(
            {"objects": [{
                'name': ws_gtf,
                'workspace': params['ws_id']
            }]})[0]
        gtf_id = '/'.join(str(gtf_info[idx]) for idx in (6, 0, 4))

        # Everything in params except the bookkeeping keys is forwarded to
        # the tool as a string option; unset (None) values are dropped.
        reserved_keys = ('ws_id', 'alignmentset_id', 'num_threads')
        self.tool_opts = {
            k: str(v)
            for k, v in params.items()
            if k not in reserved_keys and v is not None
        }
        self.num_jobs = 1
        logger.info(" Number of threads used by each process {0}".format(
            self.num_threads))
        self.task_list.append({
            'job_id': s_alignment_id,
            'gtf_file': gtf_file,
            'ws_id': params['ws_id'],
            'genome_id': genome_id,
            'stringtie_dir': self.directory,
            'annotation_id': gtf_id,
            'sample_id': read_sample_id,
            'alignmentset_id': None
        })