Esempio n. 1
0
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_Msuite'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'kb_Msuite',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_Msuite(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_Msuite_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
        cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 service_ver='dev')
        cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])

        # stage an input and output directory
        """
        cls.input_dir = os.path.join(cls.scratch, 'input_1')
        cls.output_dir = os.path.join(cls.scratch, 'output_1')
        cls.all_seq_fasta = os.path.join(cls.scratch, 'all_seq.fna')
        shutil.copytree(os.path.join('data', 'example_out', 'input'), cls.input_dir)
        shutil.copytree(os.path.join('data', 'example_out', 'output'), cls.output_dir)
        shutil.copy(os.path.join('data', 'example_out', 'all_seq.fna'), cls.all_seq_fasta)
        """

        # prepare WS data
        cls.prepare_data()
Esempio n. 2
0
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('MetagenomeAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'MetagenomeAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = MetagenomeAPI(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_maxbin_" + str(suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

        # you could bypass creating objects for more rapid testing here
        # cls.binnedcontigs_ref_1 = '19621/2/1'
        # cls.assembly_ref_1 = '19621/1/1'
        # return

        # create some test data
        cls.au = AssemblyUtil(cls.callback_url)
        cls.mu = MetagenomeUtils(cls.callback_url)
        cls.dfu = DataFileUtil(cls.callback_url)

        # building Assembly
        assembly_filename = 'small_bin_contig_file.fasta'
        cls.assembly_fasta_file_path = os.path.join(cls.scratch, assembly_filename)
        shutil.copy(os.path.join("data", assembly_filename), cls.assembly_fasta_file_path)

        assembly_params = {
            'file': {'path': cls.assembly_fasta_file_path},
            'workspace_name': cls.wsName,
            'assembly_name': 'MyAssembly'
        }
        print(os.path.isfile(cls.assembly_fasta_file_path))

        cls.assembly_ref_1 = cls.au.save_assembly_from_fasta(assembly_params)
        print('Assembly1:' + cls.assembly_ref_1)

        # stage and build BinnedContigs data
        test_directory_name = 'test_maxbindata'
        cls.test_directory_path = os.path.join(cls.scratch, test_directory_name)
        os.makedirs(cls.test_directory_path)
        print(os.listdir(cls.test_directory_path))
        for item in os.listdir(os.path.join("data", "MaxBin_Result_Sample")):
            shutil.copy(os.path.join("data", "MaxBin_Result_Sample", item),
                        os.path.join(cls.test_directory_path, item))

        cls.binnedcontigs_ref_1 = cls.mu.file_to_binned_contigs({'file_directory': cls.test_directory_path,
                                                                 'assembly_ref': cls.assembly_ref_1,
                                                                 'binned_contig_name': 'MyBins',
                                                                 'workspace_name': cls.wsName
                                                                 })['binned_contig_obj_ref']
        print('BinnedContigs1:' + cls.binnedcontigs_ref_1)
Esempio n. 3
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'

        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(
            self.scratch,
            'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        # 2) based on type, download the files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3({'objects': [{
            'ref': input_ref
        }]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple
        obj_name = input_info[NAME_I]
        type_name = input_info[TYPE_I].split('-')[0]

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL,
                                    token=self.ctx['token'],
                                    service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI(
                url=self.serviceWizardURL,
                token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL,
                                        token=self.ctx['token'],
                                        service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # Standard Single Assembly
        #
        if type_name in [
                'KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet'
        ]:
            # create file data
            filename = os.path.join(input_dir,
                                    obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({
                'ref': input_ref,
                'filename': filename
            })
            if not os.path.isfile(filename):
                raise ValueError(
                    'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                )
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError(
                    'Assembly or ContigSet is empty in filename: ' +
                    str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1({
                    'ref':
                    input_ref,
                    'include_item_info':
                    1
                })
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' +
                                 input_ref + ')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new(
                        {'objects': [{
                            'ref': this_assembly_ref
                        }]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     this_assembly_ref + '): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({
                'input_ref': input_ref,
                'save_to_shock': 0
            })['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            # make sure fasta file isn't empty
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join(input_dir, fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path,
                                                       min_fasta_len):
                        raise ValueError(
                            'Binned Assembly is empty for fasta_path: ' +
                            str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2(
                        {'objects': [{
                            'ref': input_ref
                        }]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch ' + str(input_ref) +
                                     ' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] == None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError(
                            'genome_ref not found for genome_id: ' +
                            str(genome_id) + ' in genomeSet: ' +
                            str(input_ref))
                    else:
                        genomeSet_refs.append(
                            genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i, this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2(
                        {'objects': [{
                            'ref': this_input_ref
                        }]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError("unable to fetch genome: " +
                                     this_input_ref)

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError(msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj[
                        'assembly_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING assembly_ref: " + str(
                                genome_obj['assembly_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj[
                        'contigset_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING contigset_ref: " + str(
                                genome_obj['contigset_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir,
                                        this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({
                    'ref': assembly_ref,
                    'filename': filename
                })
                if not os.path.isfile(filename):
                    raise ValueError(
                        'Error generating fasta file from an Assembly or ContigSet with AssemblyUtil'
                    )
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError(
                        'Assembly or ContigSet is empty in filename: ' +
                        str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError(
                'Cannot stage fasta file input directory from type: ' +
                type_name)

        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {
            'input_dir': input_dir,
            'folder_suffix': suffix,
            'all_seq_fasta': all_seq_fasta
        }
Esempio n. 4
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)


        # 2) based on type, download the files
        ws = Workspace(self.ws_url)
        input_info = ws.get_object_info3({'objects': [{'ref': input_ref}]})['infos'][0]
        # 0 obj_id objid - the numerical id of the object.
        # 1 obj_name name - the name of the object.
        # 2 type_string type - the type of the object.
        # 3 timestamp save_date - the save date of the object.
        # 4 obj_ver ver - the version of the object.
        # 5 username saved_by - the user that saved or copied the object.
        # 6 ws_id wsid - the workspace containing the object.
        # 7 ws_name workspace - the workspace containing the object.
        # 8 string chsum - the md5 checksum of the object.
        # 9 int size - the size of the object in bytes.
        # 10 usermeta meta - arbitrary user-supplied metadata about
        #     the object.
        obj_name = input_info[1]
        type_name = input_info[2].split('-')[0]
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            au = AssemblyUtil(self.callback_url)
            os.makedirs(input_dir)
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            au.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            pass
        elif type_name == 'KBaseMetagenomes.BinnedContigs':
            # download the bins as fasta and set the input folder name
            au = MetagenomeUtils(self.callback_url)
            bin_file_dir = au.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
        elif type_name == 'KBaseGenomes.Genome':
            raise ValueError('Cannot yet stage fasta file input directory from KBaseGenomes.Genome')
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}