Example #1
def get_obj_info(logger, ws_url, objects, ws_id, token):
    """
    Get the workspace object reference for each object name.
    """
    ret = []
    ws_client = Workspace(url=ws_url, token=token)
    for obj in objects:
        try:
            obj_infos = ws_client.get_object_info_new({"objects": [{'name': obj, 'workspace': ws_id}]})
            ret.append("{0}/{1}/{2}".format(obj_infos[0][6], obj_infos[0][0], obj_infos[0][4]))
        except Exception as e:
            logger.error("Couldn't retrieve %s:%s from the workspace: %s" % (ws_id, obj, e))
    return ret
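
# A minimal usage sketch (hypothetical URL, workspace, and object names):
#   refs = get_obj_info(logger, "https://kbase.us/services/ws",
#                       ["my_genome", "my_reads"], "my_workspace", token)
#   # each entry is a numeric "ws_id/obj_id/version" reference such as "12345/6/1"
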
    def export_genome_annotation_as_genbank(self, ctx, params):
        """
        A method designed especially for download, this calls 'genome_annotation_to_genbank' to do
        the work, but then packages the output with WS provenance and object info into
        a zip file and saves to shock.
        :param params: instance of type "ExportParams" -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_annotation_as_genbank

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot export GenomeAnnotation - no "input_ref" field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref': params['input_ref'] }],'includeMetadata':0, 'ignoreErrors':0})[0]
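        # get_object_info_new returns a list of object_info tuples; per the
        # workspace API the layout is (obj_id, obj_name, type, save_date,
        # version, saved_by, ws_id, ws_name, checksum, size, metadata),
        # so info[1] below is the object name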

        # export to a file
        file = self.genome_annotation_to_genbank(ctx, { 
                            'genome_ref': params['input_ref'], 
                            'new_genbank_file_name': info[1]+'.gbk' })[0]

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(file['path'], os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        dfUtil = DataFileUtil(self.callback_url)
        package_details = dfUtil.package_for_download({
                                    'file_path': export_package_dir,
                                    'ws_refs': [ params['input_ref'] ]
                                })
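        # the call above zips the directory contents together with the workspace
        # metadata for the listed ws_refs and uploads the archive to Shock,
        # returning the node id as 'shock_id'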

        output = { 'shock_id': package_details['shock_id'] }

        #END export_genome_annotation_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_annotation_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def CreateRNASeqSampleSet(self, ctx, params):
        """
        :param params: instance of type "CreateRNASeqSampleSetParams"
           (FUNCTIONS used in the service) -> structure: parameter "ws_id" of
           String, parameter "sampleset_id" of String, parameter
           "sampleset_desc" of String, parameter "domain" of String,
           parameter "platform" of String, parameter "sample_ids" of list of
           String, parameter "condition" of list of String, parameter
           "source" of String, parameter "Library_type" of String, parameter
           "publication_id" of String, parameter "external_source_date" of
           String
        :returns: instance of type "RNASeqSampleSet" (Object to Describe the
           RNASeq SampleSet @optional platform num_replicates source
           publication_Id external_source_date sample_ids @metadata ws
           sampleset_id @metadata ws platform @metadata ws num_samples
           @metadata ws num_replicates @metadata ws length(condition)) ->
           structure: parameter "sampleset_id" of String, parameter
           "sampleset_desc" of String, parameter "domain" of String,
           parameter "platform" of String, parameter "num_samples" of Long,
           parameter "num_replicates" of Long, parameter "sample_ids" of list
           of String, parameter "condition" of list of String, parameter
           "source" of String, parameter "Library_type" of String, parameter
           "publication_Id" of String, parameter "external_source_date" of
           String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN CreateRNASeqSampleSet

        user_token = ctx['token']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        hs = HandleService(url=self.__HS_URL, token=user_token)
        try:
            ### Create the working dir for the method; change it to a function call
            out_obj = {k: v for k, v in params.iteritems() if k not in ('ws_id',)}
            sample_ids = params["sample_ids"]
            out_obj['num_samples'] = len(sample_ids)
            ## Validation: the set must contain more than one sample
            if len(sample_ids) < 2:
                raise ValueError("This method requires 2 or more RNASeq samples. If you have only one read sample, run one of the 'Align Reads using Tophat/Bowtie2' methods directly to get an alignment")

            ## Validation: the number of samples must equal the number of conditions
            if len(params["condition"]) != out_obj['num_samples']:
                raise ValueError("Please specify a treatment label for each sample in the RNA-seq SampleSet. Enter the same label for the replicates of a sample type")
            ## Validation: check that each read object matches the library type specified above
            if params["Library_type"] == 'PairedEnd':
                lib_type = 'KBaseAssembly.PairedEndLibrary'
            else:
                lib_type = 'KBaseAssembly.SingleEndLibrary'
            for i in sample_ids:
                s_info = ws_client.get_object_info_new({"objects": [{'name': i, 'workspace': params['ws_id']}]})
                obj_type = s_info[0][2].split('-')[0]
                if obj_type != lib_type:
                    raise ValueError("Library_type specified: {0}. Please add only {0}-typed objects in the Reads fields".format(lib_type))

            ## Update the provenance; make this a function later
            provenance = [{}]
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            # add additional info to provenance here, in this case the input data object references
            provenance[0]['input_ws_objects'] = [params['ws_id'] + '/' + sample for sample in sample_ids]

            # Save the RNASeqSampleSet to the workspace
            self.__LOGGER.info("Saving {0} object to workspace".format(params['sampleset_id']))
            res = ws_client.save_objects(
                                {"workspace": params['ws_id'],
                                 "objects": [{"type": "KBaseRNASeq.RNASeqSampleSet",
                                              "data": out_obj,
                                              "name": out_obj['sampleset_id'],
                                              "provenance": provenance}]})
            returnVal = out_obj
        except Exception as e:
            raise KBaseRNASeqException("Error saving the object {0} to the workspace: {1}".format(out_obj['sampleset_id'], "".join(traceback.format_exc())))
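        # For reference, the provenance entry saved above ends up shaped
        # roughly like (a sketch; values are hypothetical):
        #   [{'input_ws_objects': ['my_ws/sample1', 'my_ws/sample2']}]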
    def filter_genes(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN filter_genes
        # ensure the working directories exist (ignore "already exists" errors)
        for d in (self.RAWEXPR_DIR, self.FLTRD_DIR, self.FINAL_DIR):
            try:
                os.makedirs(d)
            except OSError:
                pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, 
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        # force ANOVA when there are only two samples
        # (the first column holds gene IDs, so ncol == 3 means two sample columns)
        if ncol == 3:
            param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y']
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
        if 'p_value' not in param and 'num_features' not in param:
          self.logger.error("One of p_value or num_features must be defined");
          return empty_results("One of p_value or num_features must be defined", expr,self.__WS_URL, param, self.logger, ws)
          #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## Header correction
        try:
            with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'r') as ff:
                fe = ff.readlines()
            with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'w') as ff:
                ff.write(fl) # use original first line that has correct header information
                fe.pop(0)
                ff.writelines(fe)
        except IOError:
            self.logger.error("Output was not found")
            return empty_results("Increase p_value or specify num_features", expr, self.__WS_URL, param, self.logger, ws)
            
        
        ## checking genelist
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh:
          gl = glh.readlines()
        gl = [x.strip('\n') for x in gl]
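        # gl now holds one selected feature ID per line of the coex_filter output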
 
        if len(gl) < 1:
          self.logger.error("No genes are selected")
          return empty_results("Increase p_value or specify num_features", expr, self.__WS_URL, param, self.logger, ws)
          #sys.exit(4)
 
        ## Upload FVE
        # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
        # Updates: change missing genome handling strategy by copying reference to working workspace
        cmd_upload_expr = [self.TSV_2_FVE, '--workspace_service_url', self.__WS_URL, 
                                          '--object_name', param['out_expr_object_name'],
                                          '--working_directory', self.FINAL_DIR,
                                          '--input_directory', self.FLTRD_DIR,
                                          '--output_file_name', self.FINAL_FN
                              ]
        tmp_ws = param['workspace_name']
        if 'genome_ref' in expr:
            infos = ws.get_object_info_new({"objects": [{'ref': expr['genome_ref']}]})

            if len(infos) < 1:
                self.logger.error("Couldn't find {0} in the workspace".format(expr['genome_ref']))
                raise Exception("Couldn't find {0} in the workspace".format(expr['genome_ref']))
            obj_infos = infos[0]
 
            #tmp_ws = "{0}".format(obj_infos[7])
            self.logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7], obj_infos[1]))
            if obj_infos[7] != param['workspace_name']:
                #we need to copy it from the other workspace
                try:
                  self.logger.info("trying to copy the referenced genome object : {0}".format(expr['genome_ref']))
                  ws.copy_object({'from' : {'ref' : expr['genome_ref']},'to' : {'workspace': param['workspace_name'], 'name' : obj_infos[1]}})
                  # add genome_object_name only after successful copy
                  cmd_upload_expr.append('--genome_object_name')
                  cmd_upload_expr.append(obj_infos[1])
                except:
                  # no permission or any issues... then, give up providing genome reference
                  self.logger.info("".join(traceback.format_exc()))
                  pass
            else:
                # it is local... we can simply add reference without copying genome
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
 
        # updated ws name
        cmd_upload_expr.append('--workspace_name')
        cmd_upload_expr.append(tmp_ws)
 
        self.logger.info(" ".join(cmd_upload_expr))
 
        tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        
        with open("{0}/{1}".format(self.FINAL_DIR,self.FINAL_FN),'r') as et:
          eo = json.load(et)
 
        if 'description' not in expr:
            expr['description'] = "Filtered Expression Matrix"
        expr['description'] += " : Filtered by '{0}' method ".format(param['method'])
 
        if 'feature_mapping' in expr and 'feature_mapping' in eo:
            expr['feature_mapping'] = eo['feature_mapping']
        expr['data'] = eo['data']
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                              'data' : expr,
                                                                              'name' : (param['out_expr_object_name'])}]})
 
        ## Upload FeatureSet
        fs ={'elements': {}}
        fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
 
        fs['description'] += "from {0}/{1}".format(param['workspace_name'], param['object_name'])
 
        for g in gl:
          if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
          else:
            fs['elements'][g] = []
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                              'data' : fs,
                                                                              'name' : (param['out_fs_object_name'])}]})
        result = {'workspace_name' : param['workspace_name'], 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']}
        #END filter_genes

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method filter_genes return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
    def export_genome_as_genbank(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_genbank
        print('export_genome_as_genbank -- parameters = ')

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot run export_genome_as_genbank- no "input_ref" field defined.'
            )

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({
            'objects': [{
                'ref': params['input_ref']
            }],
            'includeMetadata': 0,
            'ignoreErrors': 0
        })[0]

        genome_to_genbank_params = {'genome_ref': params['input_ref']}

        # export to file
        result = self.genome_to_genbank(
            ctx, genome_to_genbank_params)[0]['genbank_file']

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(result['file_path'])))

        # package it up and be done
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}

        print('export complete -- result = ')
        pprint(output)
        #END export_genome_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
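
# A minimal call sketch for the exporter above (hypothetical object reference;
# in an SDK module this method is normally invoked by the download machinery):
#   output = self.export_genome_as_genbank(ctx, {'input_ref': '12345/6/1'})[0]
#   # output == {'shock_id': ...}  (id of the zipped download package in Shock)
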
 def index_genomes(self, ctx, params):
     # ctx is the context object
     # return variables are: returnVal
     #BEGIN index_genomes
     user_token = ctx['token']
     svc_token = Token(user_id=self.__SVC_USER, password=self.__SVC_PASS).token
     ws_client = Workspace(url=self.__WS_URL, token=user_token)
     hs = HandleService(url=self.__HS_URL, token=user_token)
     gs = {'elements' : {}}
     try:
         self.__LOGGER.info( "Preparing Target FA")
      
         blast_dir = self.__BLAST_DIR
         if os.path.exists(blast_dir):
             for f in glob.glob("%s/*" % blast_dir):
                 os.remove(f)
         else:
             os.makedirs(blast_dir)

         target_nt_fn = "%s/%s_nt.fa" % (blast_dir, params['blastindex_name'])
         target_aa_fn = "%s/%s_aa.fa" % (blast_dir, params['blastindex_name'])
      
         try:
             target_nt = open(target_nt_fn, 'w')
             target_aa = open(target_aa_fn, 'w')
         except IOError:
             self.__LOGGER.error("Couldn't open file")
             raise KBaseGenomeUtilException("Backend awe client error: Couldn't open files")
      
         have_nt_seq = False
         have_aa_seq = False
      
      
      
         # Iterate one at a time to cope with main memory limit for euk genomes
         for genome_id in params['genome_ids']: 
      
             try:
                 obj_infos = ws_client.get_object_info_new({"objects": [{'name': genome_id,
                                                                         'workspace': params['ws_id']}]})
             except:
                 self.__LOGGER.error("Couldn't retrieve %s:%s from the workspace" %(params['ws_id'],genome_id))
                 raise KBaseGenomeUtilException("Couldn't retrieve %s:%s from the workspace" %(params['ws_id'],genome_id))
                  
      
             if len(obj_infos) < 1:
                 self.__LOGGER.error("Couldn't find %s:%s from the workspace" %(params['ws_id'],genome_id))
                 continue
                 #err_msg += "Workspace error: Couldn't find %s:%s from the workspace\n" %(params['ws_id'],genome_id)                
                 # we can continue due to multiple genomes
                 #raise Exception("Couldn't find %s:%s from the workspace" %(params['ws_id'],genome_id)) 
      
             ref_id = "{0}/{1}/{2}".format(obj_infos[0][6],obj_infos[0][0],obj_infos[0][4])
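             # ref_id has the numeric "workspace_id/object_id/version" form,
             # e.g. "12345/6/1"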
             gs['elements'][genome_id] = [ref_id]
            
             self.__LOGGER.info( "Downloading genome object from workspace {0}".format(ref_id))
            
             # TODO: make the following procedures to be loop for each genome_ids 
             try:
                 genome_list = ws_client.get_object_subset([{'name': genome_id,
                                                             'workspace': params['ws_id'],
                                                             'included': ['features']}])
                 genome = genome_list[0]
             except Exception as e:
                 raise KBaseGenomeUtilException("Failed to download genome object itself even though we got the object information")
  
            
            
             self.__LOGGER.info( "Dumping seq for %s" % genome_id)
             # Dump genome sequences
             check_seq=0
             #extract protein sequences from the genome object
             try:
                 for gene in genome['data']['features']:
                       #>kb.g.1234.CDS.1234#At1g3333 amalase...
                       function = "NA"
                       aliases = "NA"
                       if 'function' in gene: 
                           function = gene['function']
                       if 'aliases' in gene: aliases = ",".join(gene['aliases'])
                       if 'protein_translation' in gene:
                             target_aa.write(">%s#%s#%s#%s\n%s\n" % (gene['id'], ref_id, aliases, function, gene['protein_translation']))
                             have_aa_seq = True
                       if 'dna_sequence' in gene:
                             target_nt.write(">%s#%s#%s#%s\n%s\n" % (gene['id'], ref_id, aliases, function, gene['dna_sequence']))
                             have_nt_seq = True
             except Exception as e:
                 raise KBaseGenomeUtilException("Failed to dump target sequence for genome : %s" % genome_id)
         try:
             target_nt.close()
             target_aa.close()
         except Exception as e:
             raise KBaseGenomeUtilException("Failed to close sequence files")
             
             
            
         if not have_nt_seq :
             self.__LOGGER.info("The genome objects do not contain any dna sequences!")
         if not have_aa_seq :
             self.__LOGGER.info("The genome objects do not contain any amino acid sequences!")
      
         index_type = 'none'
            
         if have_nt_seq :
             try:
                 cmdstring="%s -i %s -p F" %(self.__INDEX_CMD, target_nt_fn)
                 # TODO: replace it to subprocess.Popen
                 tool_process = subprocess.Popen(cmdstring, stderr=subprocess.PIPE, shell=True)
                 stdout, stderr = tool_process.communicate()
                 
                 if stdout is not None and len(stdout) > 0:
                     self.__LOGGER.info(stdout)
                 
                 if stderr is not None and len(stderr) > 0:
                     self.__LOGGER.error("Indexing error: " + stderr)
                     raise KBaseGenomeUtilException("Indexing error: " + stderr)
             except Exception as e:
                 raise KBaseGenomeUtilException("Failed to run indexing program (%s) : %s " % (self.__INDEX_CMD, e))
                
             index_type = 'nucleotide'
def run_filter_genes(workspace_service_url=None, param_file = None, level=logging.INFO, logger = None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        param_file: parameter file
        object_name: Name of the object in the workspace 
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """ 

    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
      param = json.load(paramh)

    cmd_dowload_cvt_tsv = [FVE_2_TSV, '--workspace_service_url', workspace_service_url, 
                                      '--workspace_name', param['workspace_name'],
                                      '--object_name', param['object_name'],
                                      '--working_directory', RAWEXPR_DIR,
                                      '--output_file_name', EXPRESS_FN
                          ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
      fl = f.readline()
    ncol = len(fl.split('\t'))
    
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
      s.write("0")
      for j in range(1,ncol-1):
        s.write("\t{0}".format(j))
      s.write("\n")


    ## Run coex_filter
    cmd_coex_filter = [COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN),
                       '-m', param['method'], '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN),
                       '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y']
    if 'num_features' in param:
      cmd_coex_filter.append("-n")
      cmd_coex_filter.append(str(param['num_features']))

    if 'num_features' not in param and 'p_value' in param:
      cmd_coex_filter.append("-p")
      cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
      logger.error("One of p_value or num_features must be defined");
      sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(fl) # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)
    

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
    
    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [TSV_2_FVE, '--workspace_service_url', workspace_service_url, 
                                      '--object_name', param['out_expr_object_name'],
                                      '--working_directory', FINAL_DIR,
                                      '--input_directory', FLTRD_DIR,
                                      '--output_file_name', FINAL_FN
                          ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws, obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    
    with open("{0}/{1}".format(FINAL_DIR,FINAL_FN),'r') as et:
      eo = json.load(et)

    if 'description' in expr: expr['description'] = "{0}, coex_filter by {1}".format(expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                          'data' : expr,
                                                                          'name' : (param['out_expr_object_name'])}]})

    ## Upload FeatureSet
    fs ={'description':'Differentially expressed genes generated by {0}'.format(" ".join(cmd_coex_filter)),
         'elements': {}}
    
    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN),'r') as glh:
      gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
      if 'genome_ref' in expr:
        fs['elements'][g] = [expr['genome_ref']]
      else:
        fs['elements'][g] = []

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                          'data' : fs,
                                                                          'name' : (param['out_fs_object_name'])}]})
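
# A minimal driver sketch (hypothetical URL and file; assumes the module-level
# constants such as RAWEXPR_DIR, FVE_2_TSV, and COEX_FILTER are configured and
# KB_AUTH_TOKEN is set in the environment):
#   run_filter_genes(workspace_service_url="https://kbase.us/services/ws",
#                    param_file="filter_params.json")
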
    def export_genome_as_genbank(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_genbank
        print('export_genome_as_genbank -- parameters = ')

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot run export_genome_as_genbank- no "input_ref" field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref': params['input_ref'] }],'includeMetadata':0, 'ignoreErrors':0})[0]

        genome_to_genbank_params = {
            'genome_ref': params['input_ref']
        }

        # export to file (building from the KBase Genome object)
        result = self.genome_to_genbank(ctx, genome_to_genbank_params)[0]['genbank_file']

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(
          result['file_path'],
          os.path.join(export_package_dir, os.path.basename(result['file_path'])))

        # export original uploaded GenBank file if it existed.
        exporter = GenomeToGenbank(self.cfg)
        original_result_full = exporter.export_original_genbank(ctx, genome_to_genbank_params)
        if original_result_full is not None:
            original_result = original_result_full['genbank_file']
            shutil.move(
              original_result['file_path'],
              os.path.join(export_package_dir, os.path.basename(original_result['file_path'])))

        # Make warning file about genes only.
        warning_filename = "warning.txt"
        with open(os.path.join(export_package_dir, warning_filename), 'w') as temp_file:
            temp_file.write('Please note: the KBase-derived GenBank file for annotated genome ' +
                            'objects currently only shows "gene" features. CDS and mRNA ' +
                            'feature types are not currently included in the GenBank download, ' +
                            'but are in the KBase Genome object. ' +
                            'We hope to address this issue in the future.\n\n' +
                            'This directory includes the KBase-derived GenBank file and also ' +
                            '(if you originally uploaded the genome from an annotated ' +
                            'GenBank file) the original GenBank input.')

        # package it up and be done
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
                                    'file_path': export_package_dir,
                                    'ws_refs': [ params['input_ref'] ]
                                })

        output = { 'shock_id': package_details['shock_id'] }

        print('export complete -- result = ')
        pprint(output)
        #END export_genome_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def genbank_to_genome_annotation(self, ctx, params):
        """
        :param params: instance of type "GenbankToGenomeAnnotationParams"
           (file_path or shock_id -- Local path or shock_id of the uploaded
           file with genome sequence in GenBank format or zip-file with
           GenBank files. genome_name -- The name you would like to use to
           reference this GenomeAnnotation. If not supplied, will use the
           Taxon Id and the data source to determine the name. taxon_wsname -
           name of the workspace containing the Taxonomy data, defaults to
           'ReferenceTaxons') -> structure: parameter "file_path" of String,
           parameter "shock_id" of String, parameter "ftp_url" of String,
           parameter "genome_name" of String, parameter "workspace_name" of
           String, parameter "source" of String, parameter "taxon_wsname" of
           String, parameter "convert_to_legacy" of type "boolean" (A boolean
           - 0 for false, 1 for true. @range (0, 1))
        :returns: instance of type "GenomeAnnotationDetails" -> structure:
           parameter "genome_annotation_ref" of String
        """
        # ctx is the context object
        # return variables are: details
        #BEGIN genbank_to_genome_annotation

        print('genbank_to_genome_annotation -- parameters = ')
        pprint(params)

        # validate input and set defaults.  Note that because we don't call the uploader method
        # as a stand alone script, we do the validation here.
        if 'workspace_name' not in params:
            raise ValueError('workspace_name field was not defined')
        workspace_name = params['workspace_name']

        if 'genome_name' not in params:
            raise ValueError('genome_name field was not defined')
        genome_name = params['genome_name']

        source = 'Genbank'
        if 'source' in params:
            source = params['source']

        taxon_wsname = 'ReferenceTaxons'
        if 'taxon_wsname' in params:
            taxon_wsname = params['taxon_wsname']

        # other options to handle
        # release
        # taxon_reference
        # exclude_feature_types
        # type


        # construct the input directory where we stage files
        input_directory =  os.path.join(self.sharedFolder, 'genome-upload-staging-'+str(uuid.uuid4()))
        os.makedirs(input_directory)

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory

        genbank_file_path = None

        if 'file_path' not in params:
            if 'shock_id' not in params:
                if 'ftp_url' not in params:
                    raise ValueError('No input file (either file_path, shock_id, or ftp_url) provided')
                else:
                    # TODO handle ftp - this creates a directory for us, so update the input directory
                    print('calling Transform download utility: script_utils.download')
                    print('URL provided = ' + params['ftp_url'])
                    script_utils.download_from_urls(
                            working_directory = input_directory,
                            token = ctx['token'], # not sure why this requires a token to download from a url...
                            urls  = {
                                        'ftpfiles': params['ftp_url']
                                    }
                        )
                    input_directory = os.path.join(input_directory,'ftpfiles')
                    # unpack everything in input directory
                    dir_contents = os.listdir(input_directory)
                    print('downloaded directory listing:')
                    pprint(dir_contents)
                    dir_files = []
                    for f in dir_contents:
                        if os.path.isfile(os.path.join(input_directory, f)):
                            dir_files.append(f)

                    print('processing files in directory...')
                    for f in dir_files:
                        # unpack if needed using the standard transform utility
                        print('unpacking '+f)
                        script_utils.extract_data(filePath=os.path.join(input_directory,f))

            else:
                # handle shock file
                dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
                file_name = dfUtil.shock_to_file({
                                    'file_path': input_directory,
                                    'shock_id': params['shock_id']
                                })['node_file_name']
                genbank_file_path = os.path.join(input_directory, file_name)
        else:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = params['file_path']
            genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if genbank_file_path is not None:
            print("input genbank file =" + genbank_file_path)

            # unpack if needed using the standard transform utility
            script_utils.extract_data(filePath=genbank_file_path)

        # do the upload (doesn't seem to return any information)
        uploader.upload_genome(
                logger=None,

                shock_service_url = self.shockURL,
                handle_service_url = self.handleURL,
                workspace_service_url = self.workspaceURL,

                input_directory=input_directory,

                workspace_name   = workspace_name,
                core_genome_name = genome_name,
                source           = source,
                taxon_wsname     = taxon_wsname
            )

        #### Code to convert to legacy type if requested
        if 'convert_to_legacy' in params and params['convert_to_legacy']==1:
            from doekbase.data_api.converters import genome as cvt
            print('Converting to legacy type, object={}'.format(genome_name))
            cvt.convert_genome(
                    shock_url=self.shockURL,
                    handle_url=self.handleURL,
                    ws_url=self.workspaceURL,
                    obj_name=genome_name,
                    ws_name=workspace_name)

        # clear the temp directory
        shutil.rmtree(input_directory)

        # get WS metadata to return the reference to the object (could be returned by the uploader method...)
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref':workspace_name + '/' + genome_name}],'includeMetadata':0, 'ignoreErrors':0})[0]

        details = {
            'genome_annotation_ref':str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        }
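
        # the reference built above has the canonical "ws_id/obj_id/version"
        # form, e.g. "12345/6/1"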


        #END genbank_to_genome_annotation

        # At some point might do deeper type checking...
        if not isinstance(details, dict):
            raise ValueError('Method genbank_to_genome_annotation return value ' +
                             'details is not type dict as required.')
        # return the results
        return [details]
    def genome_annotation_to_genbank(self, ctx, params):
        """
        :param params: instance of type "GenomeAnnotationToGenbankParams"
           (genome_ref -- Reference to the GenomeAnnotation or Genome object
           in KBase in any ws supported format OR genome_name +
           workspace_name -- specifiy the genome name and workspace name of
           what you want.  If genome_ref is defined, these args are ignored.
           new_genbank_file_name -- specify the output name of the genbank
           file, optional save_to_shock -- set to 1 or 0, if 1 then output is
           saved to shock. default is zero) -> structure: parameter
           "genome_ref" of String, parameter "genome_name" of String,
           parameter "workspace_name" of String, parameter
           "new_genbank_file_name" of String, parameter "save_to_shock" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        :returns: instance of type "GenbankFile" -> structure: parameter
           "path" of String, parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN genome_annotation_to_genbank

        print('genome_annotation_to_genbank -- paramaters = ')
        pprint(params)

        service_endpoints = {
            "workspace_service_url": self.workspaceURL, 
            "shock_service_url": self.shockURL,
            "handle_service_url": self.handleURL
        }

        # parse/validate parameters.  could do a better job here.
        genome_ref = None
        if 'genome_ref' in params and params['genome_ref'] is not None:
            genome_ref = params['genome_ref']
        else:
            if 'genome_name' not in params:
                raise ValueError('genome_ref and genome_name are not defined.  One of those is required.')
            if 'workspace_name' not in params:
                raise ValueError('workspace_name is not defined.  This is required if genome_name is specified' +
                    ' without a genome_ref')
            genome_ref = params['workspace_name'] + '/' + params['genome_name']
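        # the workspace service accepts either the numeric "ws_id/obj_id[/version]"
        # reference or the "workspace_name/object_name" form constructed here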

        # do a quick lookup of object info- could use this to do some validation.  Here we need it to provide
        # a nice output file name if it is not set...  We should probably catch errors here and print out a nice
        # message - usually this would mean the ref was bad.
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref':genome_ref}],'includeMetadata':0, 'ignoreErrors':0})[0]
        print('resolved object to:')
        pprint(info)

        if 'new_genbank_file_name' not in params or params['new_genbank_file_name'] is None:
            new_genbank_file_name = info[1] + ".gbk"
        else:
            new_genbank_file_name = params['new_genbank_file_name']


        # construct a working directory to hand off to the data_api
        working_directory =  os.path.join(self.sharedFolder, 'genome-download-'+str(uuid.uuid4()))
        os.makedirs(working_directory)
        output_file_destination = os.path.join(working_directory,new_genbank_file_name)

        # do it
        print('calling: doekbase.data_api.downloaders.GenomeAnnotation.downloadAsGBK')
        GenomeAnnotation.downloadAsGBK(
                            genome_ref,
                            service_endpoints,
                            ctx['token'],
                            output_file_destination,
                            working_directory)

        # if we need to upload to shock, well then do that too.
        file = {}
        if 'save_to_shock' in params and params['save_to_shock'] == 1:
            dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
            file['shock_id'] = dfUtil.file_to_shock({
                                    'file_path': output_file_destination,
                                    'gzip': 0,
                                    'make_handle': 0
                                    # 'attributes': {}  # we can set shock attributes if we want
                                })['shock_id']
        else:
            file['path'] = output_file_destination

        #END genome_annotation_to_genbank

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method genome_annotation_to_genbank return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]
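
# A minimal call sketch (hypothetical names; with save_to_shock absent or 0 the
# result carries a local path instead of a shock_id):
#   file = self.genome_annotation_to_genbank(ctx, {
#       'genome_ref': 'my_workspace/my_genome',
#       'new_genbank_file_name': 'my_genome.gbk'})[0]
#   # file == {'path': '<sharedFolder>/genome-download-<uuid>/my_genome.gbk'}
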
Example #12
        else:
            logger.info("Workspace objects transformed to {0}".format(external_type))

        #
        # TODO validation of data files after transform
        #

        # Step 2: Extract provenance and metadata
        try:    
            workspaceClient = Workspace(url=workspace_service_url, token=kb_token)
        
            object_info = {"workspace": workspace_name, "name": object_name}

            object_details = dict()
            object_details["provenance"] = workspaceClient.get_object_provenance([object_info])
            object_details["metadata"] = workspaceClient.get_object_info_new({"objects":[object_info], "includeMetadata":1})
            
            #logger.debug(object_details["metadata"])
            
            # redundant information
            #object_details["references"] = workspaceClient.list_referencing_objects([object_info])

            # seems like maybe too crazy per download
            #object_details["history"] = workspaceClient.get_object_history(object_info)

            object_version = object_details["metadata"][0][4]
        
            object_metadata_filename = "KBase_object_details_{0}_{1}_v{2}.json".format(workspace_name, object_name, object_version)
            file_name = os.path.join(transform_directory, object_metadata_filename)
        
            with open(file_name, 'w') as f:
def run_filter_genes(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        param_file: parameter file
        object_name: Name of the object in the workspace 
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """

    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    cmd_dowload_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s',
        "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x',
        "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y'
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))

    if 'num_features' not in param and 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(
            fl)  # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    # switch to the referenced object's workspace_name: the genome may live in a different workspace than the working one
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'], '--working_directory',
        FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name',
        FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new(
            {"objects": [{
                'ref': expr['genome_ref']
            }]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws,
                                              obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' in expr:
        expr['description'] = "{0}, coex_filter by {1}".format(
            expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet
    fs = {
        'description':
        'Differentially expressed genes generated by {0}'.format(
            " ".join(cmd_coex_filter)),
        'elements': {}
    }

    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
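run_filter_genes reads everything from a JSON parameter file. A minimal sketch of such a file, covering only the keys this wrapper actually reads (all values below are illustrative):

import json

param = {
    'workspace_name': 'my_workspace',           # hypothetical workspace
    'object_name': 'my_expression_matrix',      # input ExpressionMatrix object
    'method': 'anova',                          # passed to coex_filter as -m
    'p_value': 0.05,                            # alternative: 'num_features'
    'out_expr_object_name': 'filtered_matrix',  # output ExpressionMatrix name
    'out_fs_object_name': 'filtered_features'   # output FeatureSet name
}
with open('filter_genes_params.json', 'w') as fh:
    json.dump(param, fh)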
Example #14
0
def run_filter_genes(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        param_file: parameter file
        object_name: Name of the object in the workspace 
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """

    try:
        os.makedirs(RAWEXPR_DIR)
    except OSError:
        pass  # directory may already exist
    try:
        os.makedirs(FLTRD_DIR)
    except OSError:
        pass  # directory may already exist
    try:
        os.makedirs(FINAL_DIR)
    except OSError:
        pass  # directory may already exist

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    cmd_download_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # force ANOVA when there are only two samples (gene id column + two data columns)
    if ncol == 3:
        param['method'] = 'anova'

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s',
        "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x',
        "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y'
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))

    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        return empty_results("One of p_value or num_features must be defined",
                             expr, workspace_service_url, param, logger, ws)
        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    try:
        with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
            fe = ff.readlines()
        with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
            ff.write(
                fl
            )  # use original first line that has correct header information
            fe.pop(0)
            ff.writelines(fe)
    except:
        logger.error("Output was not found")
        return empty_results("Increase p_value or specify num_features", expr,
                             workspace_service_url, param, logger, ws)

    ## checking genelist
    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    if len(gl) < 1:
        logger.error("No genes are selected")
        return empty_results("Increase p_value or specify num_features", expr,
                             workspace_service_url, param, logger, ws)
        #sys.exit(4)

    ## Upload FVE
    # the referenced genome may live in a different workspace than the working one
    # update: handle that case by copying the referenced genome into the working workspace
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'], '--working_directory',
        FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name',
        FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        obj_infos = ws.get_object_info_new(
            {"objects": [{
                'ref': expr['genome_ref']
            }]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        #tmp_ws = "{0}".format(obj_infos[7])
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7],
                                              obj_infos[1]))
        if obj_infos[7] != param['workspace_name']:
            #we need to copy it from the other workspace
            try:
                logger.info(
                    "trying to copy the referenced genome object : {0}".format(
                        expr['genome_ref']))
                ws.copy_object({
                    'from': {
                        'ref': expr['genome_ref']
                    },
                    'to': {
                        'workspace': param['workspace_name'],
                        'name': obj_infos[1]
                    }
                })
                # add genome_object_name only after successful copy
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
            except Exception:
                # no permission or other failure; give up on providing the genome reference
                logger.info("".join(traceback.format_exc()))
        else:
            # it is local; we can simply add the reference without copying the genome
            cmd_upload_expr.append('--genome_object_name')
            cmd_upload_expr.append(obj_infos[1])

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    logger.info(" ".join(cmd_upload_expr))

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' not in expr:
        expr['description'] = "Filtered Expression Matrix"
    expr['description'] += " : Filtered by '{0}' method ".format(param['method'])

    if 'feature_mapping' in expr and 'feature_mapping' in eo:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet
    fs = {'elements': {}}
    fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(
        param['method'])

    fs['description'] += "from {0}/{1}".format(param['workspace_name'],
                                               param['object_name'])

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
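The genome handling in this version implements a copy-or-reference fallback: a genome referenced from another workspace is copied into the working one before it is named on the upload command, and any copy failure silently drops the genome reference. The same pattern, reduced to a standalone sketch (the function name and its interface are assumptions, not from the source; ws is a biokbase.workspace.client.Workspace instance):

import traceback

def resolve_genome_locally(ws, genome_ref, working_ws, logger):
    # find the referenced object's name and owning workspace
    info = ws.get_object_info_new({'objects': [{'ref': genome_ref}]})[0]
    name, owning_ws = info[1], info[7]
    if owning_ws == working_ws:
        return name  # already local; reference it directly
    try:
        ws.copy_object({'from': {'ref': genome_ref},
                        'to': {'workspace': working_ws, 'name': name}})
        return name
    except Exception:
        # no permission or other failure: proceed without a genome reference
        logger.info(traceback.format_exc())
        return None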