def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet
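        # Illustrative input (hypothetical values; keys are the ones read below):
        #   params = {'workspace_name': 'my_ws', 'SequenceSetRef': '123/4/5',
        #             'fasta_outpath': '/kb/module/work/tmp/seqs.fa',
        #             'genome_ref': '123/6/7', 'TESTFLAG': 0, 'background': 0,
        #             'mask_repeats': 0}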
        dfu = DataFileUtil(self.callback_url)
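        # Optionally build a background model: in test mode a bundled test genome is
        # used; otherwise the assembly referenced by params['genome_ref'] is downloaded
        # and handed to BackgroundUtils.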

        bu = BackgroundUtils()
        TU = TestUtils()
        if params['TESTFLAG'] and params['background']:
            targetpath = '/kb/module/work/tmp/testgenome.fa'
            TU.GetGenome(targetpath)
            bu.BuildBackground(targetpath)
        elif params['background']:

            ws = Workspace('https://appdev.kbase.us/services/ws')
            subset = ws.get_object_subset([{
                'included': ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
                'ref': params['genome_ref']}])
            aref = subset[0]['data']['assembly_ref']
            assembly_ref = {'ref': aref}
            print('Downloading Assembly data as a Fasta file.')
            assemblyUtil = AssemblyUtil(self.callback_url)
            fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)['path']
            bu.BuildBackground(fasta_file)


        get_objects_params = {'object_refs' : [params['SequenceSetRef']]}
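        # Fetch the SequenceSet object and write one FASTA record per sequence entry.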

        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(params['fasta_outpath'], 'w') as outFile:
            for s in SeqSet['sequences']:
                outFile.write('>' + s['sequence_id'] + '\n')
                outFile.write(s['sequence'] + '\n')

        fu=FastaUtils()
        if params['mask_repeats']:
            fu.RemoveRepeats(params['fasta_outpath'],params['fasta_outpath'])

        output = {'fasta_outpath' : params['fasta_outpath']}
        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def get_feature_ids(self, ctx, p):
        """
        :param p: instance of type "Params" (Insert your typespec information
           here.) -> structure: parameter "ref" of String
        :returns: instance of type "Output" -> structure:
        """
        # ctx is the context object
        # return variables are: o
        #BEGIN get_feature_ids
        ws = Workspace(self.workspaceURL, token = ctx['token'])

        featureContainers = ws.get_object_subset([{
                                     'included':['feature_container_references'], 
                                     'ref':p['ref']}])[0]['data']['feature_container_references']

        all_features = {}
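        # Each entry maps a feature-container name to a workspace reference; only the
        # feature_id fields are fetched and the keys of each container's 'features'
        # mapping become the returned ID list, e.g. {'CDS': ['kb|g.0.CDS.1', ...]}
        # (illustrative shape, not taken from a real genome).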
        for fc in featureContainers:
            fc_ws_id = featureContainers[fc]
            features = ws.get_object_subset([{
                                     'included':['/features/*/feature_id'], 
                                     'ref':fc_ws_id}])[0]['data']['features']
            all_features[fc] = features.keys()
            #feature_list = []
            #for f in features:
            #    feature_list.append(features[f]['feature_id'])
            #all_features[fc] = feature_list

        o = all_features
        #END get_feature_ids

        # At some point might do deeper type checking...
        if not isinstance(o, dict):
            raise ValueError('Method get_feature_ids return value ' +
                             'o is not type dict as required.')
        # return the results
        return [o]
    def blast_against_genome(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN blast_against_genome

        # TODO: Rename blast_search

        try:
           self.__LOGGER.info( "Preparing FA")
           if len(params['query']) > 5:
               sequence=params['query']
           else:
               self.__LOGGER.error("The input sequence is too short!")
               raise KBaseGenomeUtilException("The input sequence is too short!")
        
           if not os.path.exists(self.__TEMP_DIR): os.makedirs(self.__TEMP_DIR)
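           # Write the query to a FASTA file; bare sequences (no '>' header) get
           # synthetic '>query_seq_N' headers, one per input line.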
         
           #print "generate input file for query sequence\n"
           query_fn = "%s/%s" %(self.__TEMP_DIR, self.__QUERY_FA)
           target=open(query_fn,'w')
           if sequence.startswith(">"):
             target.write(sequence)
           else:
             seqes = sequence.split("\n")
             for i in range(len(seqes)):
               target.write(">query_seq_%d\n" %(i))
               target.write(seqes[i])
           target.close()
         
           user_token=ctx['token']
           svc_token = Token(user_id=self.__SVC_USER, password=self.__SVC_PASS).token
           ws_client=Workspace(url=self.__WS_URL, token=user_token)
        
        
           err_msg = ""
        
           blast_dir =self.__BLAST_DIR
           if os.path.exists(blast_dir):
               files=glob.glob("%s/*" % blast_dir)
               for f in files: os.remove(f)
           if not os.path.exists(blast_dir): os.makedirs(blast_dir)
           target_fn = "%s/%s" %( blast_dir, self.__GENOME_FA)
           if 'target_seqs' in params:
               # let's build index directly and throw away
               sequence = params['target_seqs']
        
               target=open(target_fn,'w')
               if sequence.startswith(">"):
                 target.write(sequence)
               else:
                 seqes = sequence.split("\n")
                 for i in range(len(seqes)):
                   target.write(">target_seq_%d\n" %(i))
                   target.write(seqes[i])
               target.close()
            
               if(self.__INDEX_TYPE[params['blast_program']]  == 'protein_db'):
                   formatdb_type='T'
               elif(self.__INDEX_TYPE[params['blast_program']]  == 'transcript_db'):
                   formatdb_type='F'
               else:
                   self.__LOGGER.error("{0} is not yet supported".format(params['blast_program']))
                   raise KBaseGenomeUtilException("{0} is not yet supported".format(params['blast_program']))
               cmdstring="%s -i %s -p %s -o T" %(self.__INDEX_CMD, target_fn, formatdb_type)
               # TODO: replace it to subprocess.Popen
               tool_process = subprocess.Popen(cmdstring, stderr=subprocess.PIPE, shell=True)
               stdout, stderr = tool_process.communicate()
   
               if stdout is not None and len(stdout) > 0:
                   self.__LOGGER.info(stdout)
   
               if stderr is not None and len(stderr) > 0:
                   self.__LOGGER.error("Index error: " + stderr)
                   raise KBaseGenomeUtilException("Index error: " + stderr)
        
           else:
               try:
                   blast_indexes=ws_client.get_object_subset([{'name':params['blastindex_name'],
                                                             'workspace': params['ws_id'], 
                                                             'included':['handle', 'index_type']}])
               except:
                   self.__LOGGER.error("Couldn't find %s:%s from the workspace" %(params['ws_id'],params['blastindex_name']))
                   raise KBaseGenomeUtilException("Couldn't find %s:%s from the workspace" %(params['ws_id'],params['genome_ids'][0]))
                   
               if len(blast_indexes) < 1:
                   self.__LOGGER.error("Couldn't find %s:%s from the workspace" %(params['ws_id'],params['blastindex_name']))
                   raise KBaseGenomeUtilException("Couldn't find %s:%s from the workspace" %(params['ws_id'],params['genome_ids'][0]))
        
               
               # TODO: Add err handling
               zip_fn = blast_indexes[0]['data']['handle']['file_name']
               target_fn = "%s/%s" %(blast_dir, zip_fn[:-4]) # remove '.zip'
        
               if(self.__INDEX_TYPE[params['blast_program']]  == 'protein_db'):
                   target_fn += '_aa.fa'
                   if blast_indexes[0]['data']['index_type'] == 'none' or blast_indexes[0]['data']['index_type'] == "nucleotide":
                       self.__LOGGER.error("The index object does not contain amino acid sequence indexes")
                       raise KBaseGenomeUtilException("The index object does not contain amino acid sequence indexes")                    
               elif(self.__INDEX_TYPE[params['blast_program']]  == 'transcript_db'):
                   target_fn += '_nt.fa'
                   if blast_indexes[0]['data']['index_type'] == 'none' or blast_indexes[0]['data']['index_type'] == "protein":
                       self.__LOGGER.error("The index object does not contain nucleotide sequence indexes")
                       raise KBaseGenomeUtilException("The index object does not contain nucleotide sequence indexes")                    
               else:
                   self.__LOGGER.error("{0} is not yet supported".format(params['blast_program']))
                   raise KBaseGenomeUtilException("{0} is not yet supported".format(params['blast_program']))
        
               # TODO: Add err handling
               zip_fn = blast_indexes[0]['data']['handle']['file_name']
               #pprint(blast_indexes[0])
              
               self.__LOGGER.info("Downloading the genome index")
               #hs = HandleService(url=self.__HS_URL, token=user_token)
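               # Fetch the zipped index from Shock using the handle stored on the index
               # object, then unzip it into the BLAST working directory.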
               try:
                   script_util.download_file_from_shock(self.__LOGGER,
                                   shock_service_url= blast_indexes[0]['data']['handle']['url'],
                                   shock_id= blast_indexes[0]['data']['handle']['id'],
                                   filename= blast_indexes[0]['data']['handle']['file_name'],
                                   directory= '.',
                                   token = user_token)
               except Exception, e:
                   self.__LOGGER.error("Downloading error from shock: Please contact [email protected]")
                   raise KBaseGenomeUtilException("Downloading error from shock: Please contact [email protected]")
               try:
                   script_util.unzip_files(self.__LOGGER, zip_fn, blast_dir)
               except Exception, e:
                   self.__LOGGER.error("Unzip indexfile error: Please contact [email protected]")
                   raise KBaseGenomeUtilException("Unzip indexfile error: Please contact [email protected]")
    def index_genomes(self, ctx, params):
     # ctx is the context object
     # return variables are: returnVal
     #BEGIN index_genomes
     user_token=ctx['token']
     svc_token = Token(user_id=self.__SVC_USER, password=self.__SVC_PASS).token
     ws_client=Workspace(url=self.__WS_URL, token=user_token)
     hs = HandleService(url=self.__HS_URL, token=user_token)
     gs = {'elements' : {}}
     try:
         self.__LOGGER.info( "Preparing Target FA")
      
         blast_dir =self.__BLAST_DIR
         if os.path.exists(blast_dir):
             files=glob.glob("%s/*" % blast_dir)
             for f in files: os.remove(f)
         if not os.path.exists(blast_dir): os.makedirs(blast_dir)
       
      
            
         target_nt_fn = "%s/%s_nt.fa" %( blast_dir, params['blastindex_name'])
         target_aa_fn = "%s/%s_aa.fa" %( blast_dir, params['blastindex_name'])
      
         try:
           target_nt=open(target_nt_fn,'w')
           target_aa=open(target_aa_fn,'w')
         except:
           self.__LOGGER.error("Couldn't open file")
           raise KBaseGenomeUtilException("Backend awe client error: Couldn't open files")
      
         have_nt_seq = False
         have_aa_seq = False
      
      
      
         # Iterate one at a time to cope with main memory limit for euk genomes
         for genome_id in params['genome_ids']: 
      
             try:
                 obj_infos = ws_client.get_object_info_new({"objects": [{'name':genome_id, # replace `0' with loop
                                                            'workspace': params['ws_id']}]})
             except:
                 self.__LOGGER.error("Couldn't retrieve %s:%s from the workspace" %(params['ws_id'],genome_id))
                 raise KBaseGenomeUtilException("Couldn't retrieve %s:%s from the workspace" %(params['ws_id'],genome_id))
                  
      
             if len(obj_infos) < 1:
                 self.__LOGGER.error("Couldn't find %s:%s from the workspace" %(params['ws_id'],genome_id))
                 continue
                 #err_msg += "Workspace error: Couldn't find %s:%s from the workspace\n" %(params['ws_id'],genome_id)                
                 # we can continue due to multiple genomes
                 #raise Exception("Couldn't find %s:%s from the workspace" %(params['ws_id'],genome_id)) 
      
             ref_id = "{0}/{1}/{2}".format(obj_infos[0][6],obj_infos[0][0],obj_infos[0][4])
             gs['elements'][genome_id] = [ref_id]
            
             self.__LOGGER.info( "Downloading genome object from workspace {0}".format(ref_id))
            
             # TODO: make the following procedures to be loop for each genome_ids 
             try:
                 genome_list=ws_client.get_object_subset([{'name':genome_id, # replace `0' with loop
                                                           'workspace': params['ws_id'], 
                                                           'included':['features']}])
                 #genome_list=ws_client.get_objects([{'name':genome_id, # replace `0' with loop
                 #                                          'workspace': params['ws_id']}])
                 genome = genome_list[0]
             except Exception, e:
                 raise KBaseGenomeUtilException("Failed to download genome object itself even though we got the object information")
  
            
            
             self.__LOGGER.info( "Dumping seq for %s" % genome_id)
             # Dump genome sequences
             check_seq=0
             #extract protein sequences from the genome object
             try:
                 for gene in genome['data']['features']:
                       #>kb.g.1234.CDS.1234#At1g3333 amalase...
                       function = "NA"
                       aliases = "NA"
                       if 'function' in gene: 
                           function = gene['function']
                       if 'aliases' in gene: aliases = ",".join(gene['aliases'])
                       if 'protein_translation' in gene:
                             target_aa.write(">%s#%s#%s#%s\n%s\n" % (gene['id'], ref_id, aliases, function, gene['protein_translation']))
                             have_aa_seq = True
                       if 'dna_sequence' in gene:
                             target_nt.write(">%s#%s#%s#%s\n%s\n" % (gene['id'], ref_id, aliases, function, gene['dna_sequence']))
                             have_nt_seq = True
             except Exception as e:
                 raise KBaseGenomeUtilException("Failed to dump target sequence for genome : %s" % genome_id)
         try:
             target_nt.close()
             target_aa.close()
         except Exception as e:
             raise KBaseGenomeUtilException("Failed to close sequence files")
             
             
            
         if not have_nt_seq :
             self.__LOGGER.info("The genome objects do not contain any dna sequences!")
         if not have_aa_seq :
             self.__LOGGER.info("The genome objects do not contain any amino acid sequences!")
      
         index_type = 'none'
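          # Index the nucleotide FASTA; '-p F' marks the database as nucleotide
          # (the index command appears to be NCBI formatdb, judging by the -i/-p
          # flags, but that depends on how self.__INDEX_CMD is configured).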
            
         if have_nt_seq :
             try:
                 cmdstring="%s -i %s -p F" %(self.__INDEX_CMD, target_nt_fn)
                 # TODO: replace it to subprocess.Popen
                 tool_process = subprocess.Popen(cmdstring, stderr=subprocess.PIPE, shell=True)
                 stdout, stderr = tool_process.communicate()
                 
                 if stdout is not None and len(stdout) > 0:
                     self.__LOGGER.info(stdout)
                 
                 if stderr is not None and len(stderr) > 0:
                     self.__LOGGER.error("Indexing error: " + stderr)
                     raise KBaseGenomeUtilException("Indexing error: " + stderr)
             except Exception, e:
                 raise KBaseGenomeUtilException("Failed to run indexing program (%s) : %s " %(self.__INDEX_CMD, e))
                
             index_type = 'nucleotide'
    def get_promoter_for_gene(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN get_promoter_for_gene
        #code goes here
        dfu = DataFileUtil(self.callback_url)
        #objectRefs = {'object_refs':[params['genome_ref'],params['featureSet_ref']]}
        objectRefs = {'object_refs': [params['featureSet_ref']]}
        ws = Workspace('https://appdev.kbase.us/services/ws')
        ws_name = params['workspace_name']
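        # Pull feature locations and the assembly reference from the genome object,
        # and the element list from the FeatureSet.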
        subset = ws.get_object_subset([{
            'included':
            ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
            'ref':
            params['genome_ref']
        }])
        features = subset[0]['data']['features']
        aref = subset[0]['data']['assembly_ref']
        objects = dfu.get_objects(objectRefs)
        #genome = objects['data'][0]['data']
        #featureSet = objects['data'][1]['data']
        featureSet = objects['data'][0]['data']
        assembly_ref = {'ref': aref}
        #print assembly_ref
        #with open(self.shared_folder + '/genome.json','w') as f:
        #    json.dump(genome,f)
        #with open(self.shared_folder + '/featureSet.json','w') as f:
        #    json.dump(featureSet,f)
        #with open('/kb/module/work/asssembly.json','w') as f:
        #    json.dump(assembly,f)
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

        #pprint(fasta_file)
        #loop over featureSet
        #find matching feature in genome
        #get record, start, orientation, length
        #TODO: add some error checking logic to the bounds of the promoter
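        # For each FeatureSet element, find the matching genome feature and extract
        # the region upstream of its location: on the '+' strand the promoter_length
        # bases before the location coordinate, on the '-' strand the promoter_length
        # bases after it, reverse-complemented.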
        prom = ""
        featureFound = False
        for feature in featureSet['elements']:
            #print(feature)
            #print(featureSet['elements'][feature])
            featureFound = False
            for f in features:
                #print f['id']
                #print feature
                if f['id'] == feature:
                    attributes = f['location'][0]
                    featureFound = True
                    #print('found match ' + feature)
                    #print(f['location'])
                    break
            if featureFound:
                for record in SeqIO.parse(fasta_file['path'], 'fasta'):
                    #for record in SeqIO.parse('/kb/module/work/Gmax_189_genome_assembly.fa', 'fasta'):
                    #print(record.id)
                    #print(attributes[0])
                    if record.id == attributes[0]:
                        #print('adding to prom string')
                        #print(attributes[0])
                        if attributes[2] == '+':
                            #print('1')
                            #might need to offset by 1?
                            end = attributes[1]
                            start = end - params['promoter_length']
                            if start < 0:
                                start = 0
                            promoter = record.seq[start:end].upper()
                            #HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        elif attributes[2] == '-':
                            #print('2')
                            start = attributes[1]
                            end = start + params['promoter_length']
                            if end > len(record.seq) - 1:
                                end = len(record.seq) - 1
                            promoter = record.seq[start:end].upper()
                            complement = {
                                'A': 'T',
                                'C': 'G',
                                'G': 'C',
                                'T': 'A',
                                'N': 'N'
                            }
                            promoter = ''.join(
                                [complement[base] for base in promoter[::-1]])
                            #HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        else:
                            print('Error on orientation')
            else:
                print('Could not find feature ' + feature + ' in genome')
        promOutputPath = '/kb/module/work/tmp/promFile.fa'
        #print('prom string\n' + str(prom))
        with open(promOutputPath, 'w') as promFile:
            promFile.write(str(prom))

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
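        # Wrap the promoter sequences in a minimal HTML page, upload the report
        # directory to Shock as a zip, and register it as a KBaseReport extended
        # report.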
        html_output_dir = os.path.join(self.shared_folder,
                                       'output_html.' + str(timestamp))
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)
        html_file = 'promoter.html'
        output_html_file_path = os.path.join(html_output_dir, html_file)

        html_report_lines = '<html><body>'
        html_report_lines += '<pre>' + prom + '</pre>'
        html_report_lines += '</body></html>'

        with open(output_html_file_path, 'w', 0) as html_handle:
            html_handle.write(str(html_report_lines))

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': html_output_dir,
                #html_upload_ret = dfu.file_to_shock({'file_path': output_html_file_path,
                #'make_handle': 0})
                'make_handle': 0,
                'pack': 'zip'
            })
        except:
            raise ValueError('error uploading HTML file to shock')

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': None,
            'direct_html_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': html_file,
            'label': 'View'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #changing output to be path string
        #TODO: get rid of this html maybe and move into find_motifs
        output = promOutputPath

        #iterate over records in fasta
        #for record in SeqIO.parse(fasta_file['path'], 'fasta'):

        #objects list of Genome and featureSet

        #pprint(objects)
        #END get_promoter_for_gene

        # At some point might do deeper type checking...
        if not isinstance(output, basestring):
            raise ValueError('Method get_promoter_for_gene return value ' +
                             'output is not type basestring as required.')
        # return the results
        return [output]