Example #1
    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass
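
For context, a minimal sketch of the config dict this constructor expects; the URLs and token below are placeholders, not real endpoints:

# Hypothetical config for illustration; keys mirror those read in the constructor above.
config = {
    "workspace-url": "https://example.org/services/ws",
    "SDK_CALLBACK_URL": "http://localhost:5000",
    "srv-wiz-url": "https://example.org/services/service_wizard",
    "KB_AUTH_TOKEN": "replace-with-a-real-token",
    "shock-url": "https://example.org/services/shock-api",
    "scratch": "/kb/module/work/tmp",
}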
Example #2
def create_gtf_annotation_from_genome(logger, ws_client, hs_client, urls, ws_id,
                                      genome_ref, genome_id, fasta_file, directory, token):
    try:
        # get the GFF for the genome
        genome = GenomeFileUtil(urls['callback_url'])
        ret = genome.genome_to_gff({'genome_ref': genome_ref})
        file_path = ret['file_path']
        gtf_ext = ".gtf"
        if not file_path.endswith(gtf_ext):
            gtf_path = os.path.join(directory, genome_id + ".gtf")
            gtf_cmd = " -E {0} -T -o {1}".format(file_path, gtf_path)
            try:
                logger.info("Executing: gffread {0}".format(gtf_cmd))
                cmdline_output = runProgram(None, "gffread", gtf_cmd, None, directory)
            except Exception:
                raise Exception("Error converting the GFF file to GTF using gffread {0}: {1}"
                                .format(gtf_cmd, "".join(traceback.format_exc())))
        else:
            gtf_path = file_path
        if os.path.exists(gtf_path):
            annotation_handle = hs_client.upload(gtf_path)
            a_handle = {"handle": annotation_handle,
                        "size": os.path.getsize(gtf_path),
                        'genome_id': genome_ref}
        # Save the GFF/GTF annotation to the workspace
        res = ws_client.save_objects(
            {"workspace": ws_id,
             "objects": [{"type": "KBaseRNASeq.GFFAnnotation",
                          "data": a_handle,
                          "name": genome_id + "_GTF_Annotation",
                          "hidden": 1}]})
    except Exception:
        raise ValueError("Generating GTF file from Genome Annotation object failed: {}"
                         .format("".join(traceback.format_exc())))
    return gtf_path
Example #3
    def getGenomeInfo(self, genome_basename, lib_i=0):
        if hasattr(self.__class__, 'genomeInfo_list'):
            try:
                info = self.__class__.genomeInfo_list[lib_i]
                name = self.__class__.genomeName_list[lib_i]
                if info is not None:
                    if name != genome_basename:
                        self.__class__.genomeInfo_list[lib_i] = None
                        self.__class__.genomeName_list[lib_i] = None
                    else:
                        return info
            except IndexError:
                pass

        # 1) transform genbank to kbase genome object and upload to ws
        shared_dir = "/kb/module/work/tmp"
        genome_data_file = 'data/genomes/' + genome_basename + '.gbff'
        genome_file = os.path.join(shared_dir,
                                   os.path.basename(genome_data_file))
        shutil.copy(genome_data_file, genome_file)

        SERVICE_VER = 'release'
        #SERVICE_VER = 'dev'
        GFU = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                             token=self.getContext()['token'],
                             service_ver=SERVICE_VER)
        print("UPLOADING genome: " + genome_basename + " to WORKSPACE " +
              self.getWsName() + " ...")
        genome_upload_result = GFU.genbank_to_genome({
            'file': {
                'path': genome_file
            },
            'workspace_name':
            self.getWsName(),
            'genome_name':
            genome_basename
        })
        pprint(genome_upload_result)
        genome_ref = genome_upload_result['genome_ref']
        new_obj_info = self.getWsClient().get_object_info_new(
            {'objects': [{
                'ref': genome_ref
            }]})[0]

        # 2) store it
        if not hasattr(self.__class__, 'genomeInfo_list'):
            self.__class__.genomeInfo_list = []
            self.__class__.genomeName_list = []
        for i in range(lib_i + 1):
            try:
                assigned = self.__class__.genomeInfo_list[i]
            except IndexError:
                self.__class__.genomeInfo_list.append(None)
                self.__class__.genomeName_list.append(None)

        self.__class__.genomeInfo_list[lib_i] = new_obj_info
        self.__class__.genomeName_list[lib_i] = genome_basename
        return new_obj_info
Example #4
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.scratch = os.path.join(config['scratch'], 'import_GenBank_' + str(uuid.uuid4()))
     handler_utils._mkdir_p(self.scratch)
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
     self.uploader_utils = UploaderUtil(config)
Example #5
    def Xtest_modify_old_genome(self):
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.gfu = GenomeFileUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        old_genome = "30045/15/1"

        new_genome = "30045/14/1"
        genome_name = 'OldRhodo'
        genome_data_old = self.dfu.get_objects({"object_refs":
                                                [old_genome]})["data"][0]
        genome_data_new = self.dfu.get_objects({"object_refs":
                                                [new_genome]})["data"][0]

        sso_1 = {
            "id": "1",
            "evidence": [],
            "term_name": "1",
            "ontology_ref": "1",
            "term_lineage": []
        }

        sso_2 = {
            "id": "2",
            "evidence": [],
            "term_name": "2",
            "ontology_ref": "2",
            "term_lineage": []
        }

        sso_terms = {'SSO1': sso_1, 'SSO2': sso_2}

        print("ABOUT TO MODIFY OLD GENOME")
        for i, item in enumerate(genome_data_old['data']['features']):
            genome_data_old['data']['features'][i]['ontology_terms'] = {
                "SSO": sso_terms
            }

        print("ABOUT TO MODIFY NEW GENOME")
        for i, item in enumerate(genome_data_new['data']['features']):
            genome_data_new['data']['features'][i]['ontology_terms'] = {
                "SSO": sso_terms
            }

        print("ABOUT TO SAVE OLD GENOME")
        info = self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data_old["data"],
            "provenance": self.ctx.provenance()
        })["info"]

        print("ABOUT TO SAVE NEW GENOME")
        info = self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data_new["data"],
            "provenance": self.ctx.provenance()
        })["info"]
Example #6
 def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
     self.scratch_dir = scratch_dir
     self.rau = ReadsAlignmentUtils(callback_url)
     self.kbr = KBaseReport(callback_url)
     self.dfu = DataFileUtil(callback_url)
     self.gfu = GenomeFileUtil(callback_url)
     self.set_api = SetAPI(srv_wiz_url)
     self.ws = Workspace(workspace_url)
     self.valid_commands = ['bamqc', 'multi-bamqc']
Example #7
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.workspaceURL = config['workspace-url']
     self.token = os.environ['KB_AUTH_TOKEN']
     self.scratch = os.path.abspath(config['scratch'])
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url)
     #END_CONSTRUCTOR
     pass
Example #8
 def load_genbank_file(self, local_file, target_name):
     gfu = GenomeFileUtil(self.callback_url)
     result = gfu.genbank_to_genome({
         "file": {
             "path": local_file
         },
         "genome_name": target_name,
         "workspace_name": self.ws_name,
         "source": "RefSeq",
         "release": "NC_003098.1",
         "genetic_code": 11,
         "generate_ids_if_needed": 1,
         "type": "User upload"
     })
     return result.get('genome_ref')  # genbank_to_genome returns a dict
Example #9
 def loadGenome(self):
     if hasattr(self.__class__, 'genome_ref'):
         return self.__class__.genome_ref
     genbank_file_path = os.path.join(self.scratch, 'minimal.gbff')
     shutil.copy(os.path.join('data', 'minimal.gbff'), genbank_file_path)
     gfu = GenomeFileUtil(self.callback_url)
     genome_ref = gfu.genbank_to_genome({'file': {'path': genbank_file_path},
                                         'workspace_name': self.getWsName(),
                                         'genome_name': 'test_genome',
                                         'source': 'Ensembl',
                                         'generate_ids_if_needed': 1,
                                         'generate_missing_genes': 1
                                         })['genome_ref']
     self.__class__.genome_ref = genome_ref
     return genome_ref
Example #10
 def setUpClass(cls):
     token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('kb_plant_rast'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'user_id': user_id,
                     'provenance': [
                         {'service': 'kb_plant_rast',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL)
     cls.serviceImpl = kb_plant_rast(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.gfu = GenomeFileUtil(cls.callback_url)
     cls.genome = "Test_Genome"
     cls.prepare_data()
Example #11
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)

        for nameval in config.items('kb_cufflinks'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_cufflinks',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(url=cls.wsURL, token=token)
        cls.serviceImpl = kb_cufflinks(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = environ.get('SDK_CALLBACK_URL')
        cls.srv_wiz_url = cls.cfg['srv-wiz-url']

        # cls.wsName = 'cufflinks_test_' + user_id  # reuse existing workspace
        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_cufflinks_" + str(suffix)
        print('workspace_name: ' + cls.wsName)

        try:
            # reuse existing (previously torn down) workspace
            cls.wsClient.undelete_workspace({'workspace': cls.wsName})
            print('reusing old workspace...')
        except BaseException:
            try:
                # create if workspace does not exist
                cls.wsClient.create_workspace({'workspace': cls.wsName})
            except BaseException:
                # get workspace if it exists and was not previously deleted (previously
                # not torn down)
                ws_info = cls.wsClient.get_workspace_info({'workspace': cls.wsName})
                print("creating new workspace: " + str(ws_info))

        cls.dfu = DataFileUtil(cls.callback_url)

        cls.gfu = GenomeFileUtil(cls.callback_url)
        cls.ru = ReadsUtils(cls.callback_url)
        cls.rau = ReadsAlignmentUtils(cls.callback_url)
        cls.set_api = SetAPI(cls.srv_wiz_url, service_ver='dev')

        cls.cufflinks_runner = CufflinksUtils(cls.cfg)

        cls.prepare_data()
Example #12
 def setUpClass(cls):
     cls.token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('DifferentialExpressionUtils'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(cls.token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': cls.token,
                     'user_id': user_id,
                     'provenance': [
                         {'service': 'DifferentialExpressionUtils',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     cls.shockURL = cls.cfg['shock-url']
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL)
     cls.serviceImpl = DifferentialExpressionUtils(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.dfu = DataFileUtil(cls.callback_url)
     cls.gfu = GenomeFileUtil(cls.callback_url)
     suffix = int(time.time() * 1000)
     cls.wsName = "test_DifferentialExpressionUtils_" + str(suffix)
     cls.wsClient.create_workspace({'workspace': cls.wsName})
     cls.setupTestData()
Example #13
    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None
Example #14
def load_genbank_file(callback_url, ws_name, local_file, target_name):
    """
    Loads a Genbank (.gbk/.gbff/etc.) file into a workspace as a Genome object. This
    has the side effect of building an Assembly to contain the genome sequence.
    """
    gfu = GenomeFileUtil(callback_url)
    result = gfu.genbank_to_genome({
        "file": {
            "path": local_file
        },
        "genome_name": target_name,
        "workspace_name": ws_name,
        "source": "RefSeq",
        "type": "User upload",
        "generate_ids_if_needed": 1
    })
    return result.get('genome_ref')  # genbank_to_genome returns a dict
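
A hypothetical call to the helper above; the workspace name and file path are placeholders:

import os
# Assumes SDK_CALLBACK_URL is set in the environment, as in the other examples here.
ref = load_genbank_file(os.environ['SDK_CALLBACK_URL'], 'my_test_workspace',
                        '/kb/module/work/tmp/minimal.gbff', 'my_genome')
print(ref)  # e.g. '12345/6/7'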
Example #15
    def setUpClass(cls):
        cls.token = environ.get('KB_AUTH_TOKEN', None)
        cls.callbackURL = environ.get('SDK_CALLBACK_URL')
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('ExpressionUtils'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            cls.token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'ExpressionUtils',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.shockURL = cls.cfg['shock-url']
        cls.wsURL = cls.cfg['workspace-url']
        cls.service_wizard_url = cls.cfg['srv-wiz-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.ws = Workspace(cls.wsURL, token=cls.token)
        cls.hs = HandleService(url=cls.cfg['handle-service-url'],
                               token=cls.token)
        # create workspace
        wssuffix = int(time.time() * 1000)
        wsname = "test_expression_" + str(wssuffix)
        cls.wsinfo = cls.wsClient.create_workspace({'workspace': wsname})
        print('created workspace ' + cls.getWsName())

        cls.serviceImpl = ExpressionUtils(cls.cfg)
        cls.readUtils = ReadsUtils(cls.callbackURL)
        cls.dfu = DataFileUtil(cls.callbackURL, service_ver='dev')
        cls.dfu.ws_name_to_id(wsname)
        cls.assemblyUtil = AssemblyUtil(cls.callbackURL)
        cls.gfu = GenomeFileUtil(cls.callbackURL)
        cls.gaAPI = GenomeAnnotationAPI(cls.service_wizard_url)
        cls.rau = ReadsAlignmentUtils(cls.callbackURL)
        cls.scratch = cls.cfg['scratch']

        cls.staged = {}
        cls.nodes_to_delete = []
        cls.handles_to_delete = []
        cls.setupTestData()
Example #16
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_Msuite'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'kb_Msuite',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_Msuite(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_Msuite_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
        cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 service_ver='dev')
        cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])

        # stage an input and output directory
        """
        cls.input_dir = os.path.join(cls.scratch, 'input_1')
        cls.output_dir = os.path.join(cls.scratch, 'output_1')
        cls.all_seq_fasta = os.path.join(cls.scratch, 'all_seq.fna')
        shutil.copytree(os.path.join('data', 'example_out', 'input'), cls.input_dir)
        shutil.copytree(os.path.join('data', 'example_out', 'output'), cls.output_dir)
        shutil.copy(os.path.join('data', 'example_out', 'all_seq.fna'), cls.all_seq_fasta)
        """

        # prepare WS data
        cls.prepare_data()
Example #17
 def _get_genome_gtf_file(self, gnm_ref, gtf_file_dir):
     """
     Get data from genome object ref and return the GTF filename (with path)
     for STAR indexing and mapping.
     STAR uses the reference annotation to guide assembly and for creating alignment
     """
     log("Converting genome {0} to GFF file in folder {1}".format(gnm_ref, gtf_file_dir))
     gfu = GenomeFileUtil(self.callback_url)
     try:
         gfu_ret = gfu.genome_to_gff({self.PARAM_IN_GENOME: gnm_ref,
                                      'is_gtf': 1,
                                      'target_dir': gtf_file_dir
                                   })
     except ValueError as egfu:
         log('GFU getting GTF file raised error:\n')
         pprint(egfu)
         return None
     else:  # no exception raised
         return gfu_ret.get('file_path')
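
A sketch of how this helper might be called, assuming an instance (here star_utils, a made-up name) whose callback_url is already set; the genome ref and directory are placeholders:

gtf_file_dir = '/kb/module/work/tmp'
gtf_path = star_utils._get_genome_gtf_file('12345/6/7', gtf_file_dir)
if gtf_path is None:
    raise ValueError('GTF conversion failed; see the log above')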
Example #18
    def loadGenome(self, gbff_file):
        if hasattr(self.__class__, 'genome_ref'):
            return self.__class__.genome_ref
        gbff_file_name = os.path.basename(gbff_file)
        genome_file_path = os.path.join(self.scratch, gbff_file_name)
        shutil.copy(gbff_file, genome_file_path)

        gfu = GenomeFileUtil(self.callback_url)
        genome_ref = gfu.genbank_to_genome({
            'file': {
                'path': genome_file_path
            },
            'workspace_name':
            self.getWsName(),
            'genome_name':
            gbff_file_name.split('.')[0]
        })['genome_ref']
        self.__class__.genome_ref = genome_ref
        return genome_ref
Example #19
    def setUpClass(cls):
        cls.token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_ballgown'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            cls.token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'kb_ballgown',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.hs = HandleService(url=cls.cfg['handle-service-url'],
                               token=cls.token)
        cls.shockURL = cls.cfg['shock-url']
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
        cls.serviceImpl = kb_ballgown(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.gfu = GenomeFileUtil(cls.callback_url)
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.ru = ReadsUtils(cls.callback_url)
        cls.rau = ReadsAlignmentUtils(cls.callback_url, service_ver='dev')
        cls.eu = ExpressionUtils(cls.callback_url, service_ver='dev')
        cls.set_api = SetAPI(cls.callback_url)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_ballgown_" + str(suffix)
        #cls.wsName = "test_kb_ballgown_1004"
        cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.nodes_to_delete = []
        cls.handles_to_delete = []

        cls.prepare_data()
Example #20
 def test_annotate_genes(self):
     gbk_file = "/kb/module/test/data/kb_g.399.c.1.gbk.gz"
     temp_gbk = "/kb/module/work/tmp/kb_g.399.c.1.gbk.gz"
     shutil.copy(gbk_file, temp_gbk)
     genome_obj = "Genome.1"
     gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                          token=self.getContext()['token'])
     gfu.genbank_to_genome({
         'file': {
             'path': temp_gbk
         },
         'workspace_name': self.getWsName(),
         'genome_name': genome_obj
     })
     genome_ref = self.getWsName() + '/' + genome_obj
     self.getImpl().annotate_genes(
         self.getContext(), {
             'input_genome_ref': genome_ref,
             'output_workspace': self.getWsName(),
             'output_genome_name': genome_obj
         })
Example #21
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.srv_wiz_url = config['srv-wiz-url']
     self.scratch = config['scratch']
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url)
     self.rau = ReadsAlignmentUtils(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.eu = ExpressionUtils(self.callback_url, service_ver='dev')
     self.ws = Workspace(self.ws_url, token=self.token)
     self.set_client = SetAPI(self.srv_wiz_url)
Example #22
    def get_genome_data_files(self, genome_ref):
        genome_files = {"assembly": None, "gff": None}
        print('Fetching assembly or contig information from genome...')
        assembly_ref = self._get_assembly_ref(genome_ref)
        if len(assembly_ref) > 1:
            raise ValueError(
                'This genome, {}, appears to reference {} Assemblies or ContigSets, with these object references: {}'
                .format(genome_ref, len(assembly_ref), assembly_ref))
        elif len(assembly_ref) == 0:
            raise ValueError(
                'There was no Assembly or ContigSet found as a reference to this genome. Unable to build browser data.'
            )
        print('Done! Found valid assembly data.')

        print('Converting sequence data to FASTA file...')
        au = AssemblyUtil(self.callback_url)
        fasta_file = au.get_assembly_as_fasta({'ref': assembly_ref[0]})
        print('Done! FASTA file created: {}'.format(fasta_file))

        if "path" not in fasta_file:
            raise IOError(
                'FASTA file was not apparently generated from the given genome fasta_file. fasta_file object missing key "path": {}'
                .format(fasta_file))
        genome_files["assembly"] = fasta_file.get('path', None)

        print('Converting genome annotation data to gff file...')
        gfu = GenomeFileUtil(self.callback_url)
        gff_file = gfu.genome_to_gff({'genome_ref': genome_ref})
        print('Done! GFF file created: {}'.format(gff_file))
        if "file_path" not in gff_file:
            raise IOError(
                'GFF file was not apparently generated from the given genome. gff_file object missing key "file_path": {}'
                .format(gff_file))
        genome_files["gff"] = gff_file.get('file_path', None)

        return genome_files
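
For reference, a sketch of the dict returned by get_genome_data_files; the paths are illustrative and depend on the scratch directory:

# Illustrative return value only.
genome_files = {
    "assembly": "/kb/module/work/tmp/assembly.fa",
    "gff": "/kb/module/work/tmp/genome.gff",
}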
Example #23
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_QualiMap'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'kb_QualiMap',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.ws = Workspace(cls.wsURL)
        cls.serviceImpl = kb_QualiMap(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.srv_wiz_url = cls.cfg['srv-wiz-url']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.gfu = GenomeFileUtil(cls.callback_url)
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.ru = ReadsUtils(cls.callback_url)
        cls.rau = ReadsAlignmentUtils(cls.callback_url)
        cls.au = AssemblyUtil(cls.callback_url)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_qualimap_" + str(suffix)
        cls.ws.create_workspace({'workspace': cls.wsName})

        cls.prepare_data()
Example #24
 def __init__(self, config, services, logger=None):
     self.config = config
     self.logger = logger
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.scratch = os.path.join(config['scratch'],
                                 'cuffdiff_merge_' + str(uuid.uuid4()))
     self.ws_url = config['workspace-url']
     self.services = services
     self.ws_client = Workspace(self.services['workspace_service_url'])
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url)
     self.rau = ReadsAlignmentUtils(self.callback_url)
     self.eu = ExpressionUtils(self.callback_url)
     self.deu = DifferentialExpressionUtils(self.callback_url)
     self.cuffmerge_runner = CuffMerge(config, logger)
     self.num_threads = mp.cpu_count()
     handler_utils._mkdir_p(self.scratch)
Example #25
    def setUpClass(cls):
        cls.token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_uploadmethods'):
            cls.cfg[nameval[0]] = nameval[1]
        authServiceUrl = cls.cfg.get(
            'auth-service-url',
            "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        cls.user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            cls.token,
            'user_id':
            cls.user_id,
            'provenance': [{
                'service': 'kb_uploadmethods',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
        cls.serviceImpl = kb_uploadmethods(cls.cfg)
        cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
        cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 token=cls.token)
        cls.fba_tools = fba_tools(os.environ['SDK_CALLBACK_URL'],
                                  token=cls.token)
        cls.scratch = cls.cfg['scratch']
        cls.shockURL = cls.cfg['shock-url']

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_uploadmethods_phenotype_set" + str(suffix)
        cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.prepare_data()
Example #26
 def setUpClass(cls):
     token = environ.get('KB_AUTH_TOKEN', None)
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.ctx = {'token': token}
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('kb_cummerbund'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.scratch = cls.cfg['scratch']
     cls.wsURL = cls.cfg['ws_url']
     cls.gfu = GenomeFileUtil(cls.callback_url)
     cls.ru = ReadsUtils(cls.callback_url)
     cls.rau = ReadsAlignmentUtils(cls.callback_url, service_ver='dev')
     cls.eu = ExpressionUtils(cls.callback_url, service_ver='dev')
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.dfu = DataFileUtil(cls.callback_url, token=token)
     cls.serviceImpl = kb_cummerbund(cls.cfg)
     cls.prepare_data()
Example #27
 def get_gff_file(self, genome_ref):
     gfu = GenomeFileUtil(self.callback_url)
     gff_file = gfu.genome_to_gff({'genome_ref': genome_ref})
     return gff_file["file_path"]
Example #28
class ProkkaUtils:

    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]
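
A minimal usage sketch of the static helper above; the params dict is made up:

params = {"output_genome_name": "MyGenome"}
name = ProkkaUtils._get_input_value(params, "output_genome_name")  # -> "MyGenome"
# ProkkaUtils._get_input_value(params, "missing_key")  # would raise ValueError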

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if qualifier else None
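
And the qualifier helper in action, given the class above; values are illustrative:

ProkkaUtils._get_qualifier_value(["hypothetical protein"])  # -> "hypothetical protein"
ProkkaUtils._get_qualifier_value([])                        # -> None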

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4: -1].strip()
                sso_list = self.ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    self.ec_to_sso[ec] = sso_list
                sso_list.append(sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref
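
The ec_to_sso table built above maps EC numbers to lists of SSO term dicts; an illustrative, made-up entry (real entries carry the full term dict from the ontology object):

ec_to_sso_example = {
    "2.7.1.1": [
        {"id": "SSO:000009304", "name": "Hexokinase (EC 2.7.1.1)"},
    ],
}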

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly. 
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. 
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. 
             These bins can then be individually annotated as a single genome using Prokka. 
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. 
             Alternatively, you can try reducing the number of contigs using a filter app.")
             raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions
             """
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath:
        :return: The path to the fasta file with renamed contigs the number of contigs,
        the mapping from old ids to new ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old,
                                records)
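
A hedged usage sketch of the renaming helper; 'contigs.fna' is a placeholder FASTA path:

renamed = ProkkaUtils.create_renamed_assembly("contigs.fna")
print(renamed.filepath)         # contigs.fna_renamed.fna
print(renamed.contig_counter)   # number of records written
print(renamed.new_ids_to_old)   # e.g. {"contig_1": "NODE_1"}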

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The  directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix",
                           "mygenome", "--kingdom", kingdom]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir
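
For default parameters, the command list assembled above looks roughly like this; the output directory and FASTA path are placeholders:

# Sketch of the assembled command for an all-defaults call.
example_cmd = ["perl", "/kb/prokka/bin/prokka",
               "--outdir", "/kb/module/work/tmp/temp_<uuid>",
               "--prefix", "mygenome", "--kingdom", "Bacteria",
               "--gcode", "0",
               "/path/to/contigs.fna"]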

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir:
        :return: Sequences from the .faa .ffn files and the gff_filepath
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: Genome:features Genome:cdss  Genome:mrnas report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode("utf-8")).hexdigest()  # md5 needs bytes, not str
                    feature = {"id": fid, "location": location, "type": "gene",
                               "aliases": aliases, "md5": md5, "dna_sequence": dna,
                               "dna_sequence_length": len(dna),
                               }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                         "evidence": [evidence],
                                                         "term_name": sso_item["name"],
                                                         "ontology_ref": self.sso_ref,
                                                         "term_lineage": []}
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {"id": cds_id, "location": location, "md5": md5, "parent_gene": fid,
                               "parent_mrna": mrna_id, "function": (product if product else ""),
                               "ontology_terms": {}, "protein_translation": prot,
                               "protein_translation_length": prot_len, "aliases": aliases}
                        mrna = {"id": mrna_id, "location": location, "md5": md5,
                                "parent_gene": fid, "cds": cds_id}
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n"
        report += "Average protein length: " + str(int(sum(prot_lengths) /
                                                       float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)
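
An illustrative shape for one entry in the features list built above (all values made up):

example_feature = {
    "id": "PROKKA_00001",
    "location": [["contig_1", 1, "+", 300]],
    "type": "gene",
    "aliases": ["dnaA"],
    "md5": "<md5 of dna_sequence>",
    "dna_sequence": "ATG...",
    "dna_sequence_length": 300,
    "function": "Chromosomal replication initiator protein DnaA",
    "protein_translation": "M...",
    "protein_translation_length": 99,
    "cdss": ["PROKKA_00001_CDS"],
    "mrnas": ["PROKKA_00001_mRNA"],
}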

    def get_new_annotations(self, gff_filepath):
        """

        :param gff_filepath: A dictionary of ids with products and ec numbers
        :return:
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": id}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                             "evidence": [evidence],
                                                             "term_name": sso_item["name"],
                                                             "ontology_ref": self.sso_ref,
                                                             "term_lineage": []}

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(self.scratch,
                                                 "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index = len(genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            term_id = new_ontology[key]["id"]
            if term_id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][term_id].append(ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][term_id] = [ontology_event_index]
        return feature
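
The two helpers above handle two on-disk shapes for SSO terms; a hedged sketch of the difference (ids and indices are made up):

# Old-style genomes store the full term dict per SSO id.
old_style_terms = {"SSO": {"SSO:000009304": {"id": "SSO:000009304",
                                             "term_name": "Hexokinase"}}}
# New-style genomes store a list of ontology_event indices per SSO id.
new_style_terms = {"SSO": {"SSO:000009304": [0]}}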

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """

        :param annotation_args: genome_data, new_annotations from prokka, and the output_genome_name
        :type
        :return:
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {"current_functions": len(genome_data["data"]["features"]), "new_functions": 0,
                 "found_functions": 0, "new_ontologies": 0}

        function_report_filepath = os.path.join(self.scratch, "function_report")
        ontology_report_filepath = os.path.join(self.scratch, "ontology_report")
        onto_r = open(ontology_report_filepath, "w")
        func_r = open(function_report_filepath, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i]["function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [new_function]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        genome_data["data"]["features"][i] = self.new_genome_ontologies(
                            feature, new_ontology, ontology_event_index)
                    else:
                        genome_data["data"]["features"][i] = self.old_genome_ontologies(
                            feature, new_ontology)
            if current_function:
                func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n")
            else:
                func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n")

            onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        annotated_genome = namedtuple("annotated_genome",
                                      "genome_ref function_report_filepath ontology_report_filepath stats")

        return annotated_genome(genome_ref, function_report_filepath, ontology_report_filepath,
                                stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" + output_file_shock_id)
        return {"shock_id": output_file_shock_id,
                "name": os.path.basename(filepath),
                "label": os.path.basename(filepath),
                "description": message}

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [self.upload_file(genome.ontology_report_filepath),
                      self.upload_file(genome.function_report_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n"
                          ).format(genome_ref, stats["current_functions"], stats["new_functions"],
                                   stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[
                "genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(genome_data=genome_data,
                                                                     new_annotations=new_annotations,
                                                                     output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)
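
    # A minimal call sketch for re-annotation, assuming this class is already
    # constructed inside its SDK context (the object ref below is hypothetical):
    #
    #     result = self.annotate_genome({
    #         "object_ref": "1234/5/6",
    #         "output_genome_name": "my_genome.reannotated",
    #         "output_workspace": "my_workspace",
    #     })
    #     # result -> {"output_genome_ref": ..., "report_name": ..., "report_ref": ...}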

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps include to download the assembly as a fasta file,
        rename the contigs, run prokka against the contigs, parse the results, and finally,
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results
        annotated_assembly = self.parse_prokka_results(gff_filepath=prokka_results.gff_filepath,
                                                       cds_to_dna=prokka_results.cds_to_dna,
                                                       cds_to_prot=prokka_results.cds_to_prot,
                                                       new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be absent or set to None
        scientific_name = params.get('scientific_name') or 'Unknown'
        domain = params.get('kingdom') or 'Bacteria'
        gcode = params.get('gcode') or 0

        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}
Example No. 29
0
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
Example No. 30
0
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def import_genbank_from_staging(self, params):
        '''
          import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          genome_name - becomes the name of the object
          workspace_name - the name of the workspace it gets saved to.
          source - Source of the file typically something like RefSeq or Ensembl

          optional params:
          release - Release or version number of the data
              (for example, Ensembl has numbered releases of all their data: Release 31)
          generate_ids_if_needed - If the field used for feature IDs is missing,
              generate IDs (the default behavior is to raise an exception)
          genetic_code - Genetic code of the organism; overrides the genetic code
              determined from the taxon object
          type - Reference, Representative or User upload

          return:
          genome_ref: return object reference
        '''

        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        file = {
            'path': scratch_file_path
        }

        import_genbank_params = params
        import_genbank_params['file'] = file
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)

        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name', 'workspace_name', 'source']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')