Example #1
# NOTE: import paths assume the standard KBase SDK 'installed_clients' layout.
import os
from shutil import copyfile

from installed_clients.AssemblyUtilClient import AssemblyUtil
from installed_clients.DataFileUtilClient import DataFileUtil
from installed_clients.MetagenomeUtilsClient import MetagenomeUtils
from installed_clients.WorkspaceClient import Workspace


def load_fastas(config: dict, scratch: str, upa: str):
    '''
    Download the assembly data behind the given object reference (UPA)
    and return a list of (fasta_path, upa) tuples.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        raise ValueError('Input genome/metagenome reference has unhandled type: ' + obj_type)

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref')
            or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
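
A minimal call sketch for the function above; the URLs, scratch path, and UPA are illustrative placeholders, not values from a real deployment:

# Hypothetical invocation; real values come from the SDK runtime environment.
config = {
    'callback_url': 'http://localhost:9999',
    'workspace-url': 'https://kbase.us/services/ws',
}
for fasta_path, source_upa in load_fastas(config, '/kb/module/work/tmp', '123/4/5'):
    print(source_upa, '->', fasta_path)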
Example #2
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.shock_url = config['shock-url']
     self.dfu = DataFileUtil(self.callback_url)
     self.ru = ReadsUtils(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.mgu = MetagenomeUtils(self.callback_url)
Example #3
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_Msuite'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_Msuite',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = kb_Msuite(cls.cfg)
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.scratch = cls.cfg['scratch']
        cls.suffix = int(time.time() * 1000)
        #cls.scratch = cls.cfg['scratch']+'_'+str(suffix)
        #cls.cfg['scratch'] = cls.scratch
        #if not os.path.exists(cls.scratch):
        #    os.mkdir(cls.scratch)
        cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

        cls.wsName = "test_kb_Msuite_" + str(cls.suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
        cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'], service_ver='dev')
        cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])

        # stage an input and output directory
        """
        cls.input_dir = os.path.join(cls.scratch, 'input_1')
        cls.output_dir = os.path.join(cls.scratch, 'output_1')
        cls.all_seq_fasta = os.path.join(cls.scratch, 'all_seq.fna')
        shutil.copytree(os.path.join('data', 'example_out', 'input'), cls.input_dir)
        shutil.copytree(os.path.join('data', 'example_out', 'output'), cls.output_dir)
        shutil.copy(os.path.join('data', 'example_out', 'all_seq.fna'), cls.all_seq_fasta)
        """

        # prepare WS data
        cls.prepare_data()
Example #4
    def save_binned_contigs(self, params, assembly_ref, filtered_bins_dir):
        try:
            mgu = MetagenomeUtils(self.callback_url)
        except Exception as e:
            raise ValueError("unable to connect with MetagenomeUtils: " + str(e))

        filtered_binned_contig_obj_name = params.get(
            'output_filtered_binnedcontigs_obj_name')
        generate_binned_contig_param = {
            'file_directory': filtered_bins_dir,
            'assembly_ref': assembly_ref,
            'binned_contig_name': filtered_binned_contig_obj_name,
            'workspace_name': params.get('workspace_name')
        }
        filtered_binned_contig_obj_ref = mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        return {
            'obj_name': filtered_binned_contig_obj_name,
            'obj_ref': filtered_binned_contig_obj_ref
        }
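
A hedged sketch of a call to this method; the object and workspace names are placeholders and `checkm` stands in for the owning instance:

params = {
    'output_filtered_binnedcontigs_obj_name': 'filtered_bins',  # placeholder
    'workspace_name': 'my_workspace',                           # placeholder
}
result = checkm.save_binned_contigs(params, assembly_ref='1/2/3',
                                    filtered_bins_dir='/kb/module/work/tmp/filtered')
# result -> {'obj_name': 'filtered_bins', 'obj_ref': <new BinnedContigs ref>}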
Example #5
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        test_time_stamp = int(time.time() * 1000)

        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_Msuite'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_Msuite',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(cls.wsURL)
        cls.serviceImpl = kb_Msuite(cls.cfg)
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.scratch = cls.cfg['scratch']
        cls.appdir = cls.cfg['appdir']

        cls.test_data_dir = os.path.join(cls.scratch, 'test_data')
        cls.suffix = test_time_stamp
        cls.checkm_runner = CheckMUtil(cls.cfg, cls.ctx)

        cls.wsName = "test_kb_Msuite_" + str(cls.suffix)
        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.gfu = GenomeFileUtil(os.environ['SDK_CALLBACK_URL'],
                                 service_ver='dev')
        cls.mu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])
        cls.setAPI = SetAPI(url=cls.cfg['srv-wiz-url'], token=cls.ctx['token'])
        cls.kr = KBaseReport(os.environ['SDK_CALLBACK_URL'])

        cls.data_loaded = False
Example #6
    def __init__(self, callback_url: str, workspace_url: str, user_token: str):
        '''
        Create the client set.

        :param callback_url: The url of the callback server.
        :param workspace_url: The url of the KBase workspace server.
        :param user_token: The user's token.
        '''
        # TODO check inputs aren't None or empty string
        self._dfu = DataFileUtil(callback_url, token=user_token)
        self._au = AssemblyUtil(callback_url, token=user_token)
        self._mgu = MetagenomeUtils(callback_url, token=user_token)
        self._report = KBaseReport(callback_url, token=user_token)
        self._ws = Workspace(workspace_url, token=user_token)
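
A hypothetical construction of this client set; the class name `ClientSet` and all values shown are placeholders:

import os

clients = ClientSet(
    callback_url='http://localhost:9999',          # assumed callback server
    workspace_url='https://kbase.us/services/ws',  # assumed workspace server
    user_token=os.environ['KB_AUTH_TOKEN'],        # never hard-code tokens
)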
Example #7
    def setUpClass(cls):
        cls.token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_das_tool'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': cls.token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_das_tool',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(cls.wsURL)
        cls.serviceImpl = kb_das_tool(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_das_tool_" + str(suffix)

        cls.ws_info = cls.wsClient.create_workspace({'workspace': cls.wsName})  # noqa
        cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
        cls.ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'], token=cls.token)
        cls.au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
        cls.mgu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'],
                                  token=cls.token)
        cls.das_tool_runner = DASToolUtil(cls.cfg)
        cls.prepare_data()
Example #8
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR

        callback_url = os.environ['SDK_CALLBACK_URL']
        workspace_url = config['workspace-url']
        shared_folder = config['scratch']
        
        reset_globals()
        app.update({
            'shared_folder': shared_folder,
            'ws': Workspace(workspace_url),
            'dfu': DataFileUtil(callback_url),
            'mgu': MetagenomeUtils(callback_url, service_ver='dev'),
            'au': AssemblyUtil(callback_url),
            'kbr': KBaseReport(callback_url),
        })

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass
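
For context, `app` and `reset_globals` are module-level helpers not shown in this excerpt; a minimal stand-in consistent with the constructor above might be:

# Hypothetical module-level registry; the real module may differ.
app = {}

def reset_globals():
    """Clear the shared client/config registry between runs."""
    app.clear()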
Example #9
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, AssemblySet, Genome, GenomeSet, or BinnedContigs object

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
        ws = Workspace(self.ws_url)

        # 1) generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)


        # 2) based on type, download the files
        obj_name = self.get_data_obj_name(input_ref)
        type_name = self.get_data_obj_type(input_ref)

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI(url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI(url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))


        # Standard Single Assembly
        #
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            # create file data
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1({'ref': input_ref, 'include_item_info': 1})
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new({'objects': [{'ref': this_assembly_ref}]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            # make sure each bin's fasta file isn't empty
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join(input_dir, fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                        raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    genome_ref = genomeSet_object['elements'][genome_id].get('ref')
                    if not genome_ref:
                        raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref))
                    genomeSet_refs.append(genome_ref)

            # genome obj data
            for i,this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception as e:
                    raise ValueError("unable to fetch genome: " + this_input_ref + ': ' + str(e))

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError(msg)
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] is not None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] is not None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                    print(msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i, assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
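
Following the docstring's own example, a call might look like this (the UPA '124/15/1' is illustrative):

staged = self.stage_input('124/15/1', 'fna')
print(staged['input_dir'])      # scratch folder holding one fasta per input
print(staged['all_seq_fasta'])  # concatenated fasta across all inputs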
Example #10
 def __init__(self, callback_url, scratch, wrkspc, token):
     self.ws = wrkspc
     self.scratch = scratch
     self.callback_url = callback_url
     self.mgu = MetagenomeUtils(callback_url, token=token)
     self.fasta_dict = {}
Example #11
class TypeToFasta:
    def __init__(self, callback_url, scratch, wrkspc, token):
        self.ws = wrkspc
        self.scratch = scratch
        self.callback_url = callback_url
        self.mgu = MetagenomeUtils(callback_url, token=token)
        self.fasta_dict = {}

    def log(self, message, prefix_newline=False):
        print(('\n' if prefix_newline else '') + str(_time.time()) + ': ' +
              message)

    def add_to_dict(self, key, val):
        if key in self.fasta_dict:
            # if the key already exists, append to its 'parent_refs' field
            if 'parent_refs' in self.fasta_dict[key]:
                self.fasta_dict[key]['parent_refs'] += val['parent_refs']
        else:
            self.fasta_dict[key] = val

    def genome_obj_to_fasta(self, ref, obj_type):

        # Initiate needed objects
        atf = AssemblyToFasta(self.callback_url, self.scratch)
        upas = []

        if 'KBaseSets.GenomeSet' in obj_type:
            obj_data = self.ws.get_objects2({'objects': [{
                "ref": ref
            }]})['data'][0]
            upas = [gsi['ref'] for gsi in obj_data['data']['items']]
        elif 'KBaseSearch.GenomeSet' in obj_type:
            obj_data = self.ws.get_objects2({'objects': [{
                "ref": ref
            }]})['data'][0]
            upas = [
                gse['ref'] for gse in obj_data['data']['elements'].values()
            ]
        elif "KBaseGenomes.Genome" in obj_type:
            upas = [ref]

        if upas:
            for genome_upa in upas:
                # Get genome object assembly_ref or contigset_ref through subsetting object
                genome_data = self.ws.get_objects2({'objects': [
                    {"ref": genome_upa,
                     'included': ['/assembly_ref/', '/contigset_ref/']}
                ]})['data'][0]['data']

                # If the genome object contains an assembly_ref or contigset_ref, genome_data
                # will be a populated dictionary; if not, an empty dictionary is returned.
                if genome_data:
                    # Get assembly_upa and fasta
                    assembly_upa = genome_upa + ';' + \
                                   str(genome_data.get('assembly_ref') or genome_data.get('contigset_ref'))

                    faf = atf.assembly_as_fasta({'ref': assembly_upa})
                    # Input data into object dict
                    self.add_to_dict(
                        assembly_upa, {
                            'paths': [faf['path']],
                            'type': obj_type,
                            'parent_refs': [ref]
                        })

                else:
                    raise TypeError(
                        "KBase object type %s does not contain an assembly reference or contig reference."
                        % obj_type)

    def assembly_obj_to_fasta(self,
                              ref,
                              obj_type,
                              input_ref=None,
                              input_type=None):
        # Initiate needed objects
        atf = AssemblyToFasta(self.callback_url, self.scratch)
        obj = {"ref": ref}

        if "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
            # Get fasta
            faf = atf.assembly_as_fasta(obj)
            if input_ref and input_type:
                self.add_to_dict(
                    input_ref, {
                        'paths': [faf['path']],
                        'type': input_type,
                        'parent_refs': [input_ref, ref]
                    })
            else:
                self.add_to_dict(ref, {
                    'paths': [faf['path']],
                    'type': obj_type,
                    'parent_refs': [ref]
                })

        elif "KBaseSets.AssemblySet" in obj_type:
            # Get assembly set object
            obj_data = self.ws.get_objects2({'objects': [obj]})['data'][0]
            for item_upa in obj_data['data']['items']:
                # Get fasta
                faf = atf.assembly_as_fasta({"ref": item_upa['ref']})
                # Input data into object dict
                self.add_to_dict(item_upa['ref'], {
                    'paths': [faf['path']],
                    'type': obj_type,
                    'parent_refs': [ref]
                })

    def metagenome_obj_to_fasta(self, ref, obj_type):

        if 'KBaseMetagenomes.BinnedContigs' in obj_type:
            fasta_paths = []
            try:
                # Binned_contigs_to_file saves fasta file to a directory in scratch.
                # Path: scratch/binned_contig_files_EXTENSION/Bin#.fasta
                bin_file_dir = self.mgu.binned_contigs_to_file(
                    {'input_ref': ref, 'save_to_shock': 0})['bin_file_directory']
                for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
                    for fasta_file in filenames:
                        # For fasta file in the binned contigs directory, copy fasta directly to scratch
                        # New path: scratch/Bin#.fasta
                        fasta_path = os.path.join(self.scratch, fasta_file)
                        copyfile(os.path.join(bin_file_dir, fasta_file),
                                 fasta_path)
                        fasta_paths.append(fasta_path)
                # Input data into object dict
                self.add_to_dict(ref, {'paths': fasta_paths, 'type': obj_type})

            # Catch MetagenomeUtil Error
            except _MGUError as mgue:
                self.log('Logging exception loading binned contigs to file.')
                self.log(str(mgue))
                raise

        if 'KBaseMetagenomes.AnnotatedMetagenomeAssembly' in obj_type:
            ret = self.ws.get_objects2(
                {'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref']
                }]})['data'][0]
            assembly_ref = ret['data']['assembly_ref']
            assembly_obj_type = self.ws.get_object_info3(
                {'objects': [{
                    'ref': assembly_ref
                }]})['infos'][0][2]
            self.assembly_obj_to_fasta(assembly_ref,
                                       assembly_obj_type,
                                       input_ref=ref,
                                       input_type=obj_type)

    def type_to_fasta(self, ref_lst):
        """type_to_fasta takes in a list of KBase objects references. The object type of each reference
        is checked in functions: assembly_obj_to_fasta, metagenome_obj_to_fasta, and genome_obj_to_fasta. Depending
        on the type of KBase object input a fasta file is made through one of the functions mentioned above
        and a fasta object dictionary is created with structure: {ref: {'path' : fasta_paths, 'type': object type} }

        for objects of type AssemblySet and GenomeSet a parent ref key-value pair is added such that the structure is:
        {ref: {'path' : fasta_paths, 'type': object type, 'parent_refs': [ref]} }

        for objects of type KBaseMetagenomes.BinnedContigs a unique fasta path is made for each bin in binnedContigs
        Thus the output structure is: {ref: {'paths' : [fasta_contigbin1, fasta_contigbin2], 'type': object type} }

        where the key 'paths' points to an array of fasta paths for each contig bin in ascending order. """

        # Get type info for each ref in ref_lst
        for ref in ref_lst:

            # Get KBase object type with get_object_info3
            obj_info = self.ws.get_object_info3({"objects": [{"ref": ref}]})
            obj_type = obj_info["infos"][0][2]
            # Put object in object specific fasta dictionary by type
            self.genome_obj_to_fasta(ref, obj_type)
            self.assembly_obj_to_fasta(ref, obj_type)
            self.metagenome_obj_to_fasta(ref, obj_type)
            # Append all individual object dictionaries to complete fasta dictionary for reference list

        return self.fasta_dict
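
A hedged usage sketch; the constructor arguments and the two refs are placeholders:

ttf = TypeToFasta(callback_url, scratch, workspace_client, token)
fasta_dict = ttf.type_to_fasta(['1/2/3', '4/5/6'])
for ref, info in fasta_dict.items():
    print(ref, info['type'], info['paths'])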
Example #12
class MaxBinUtil:
    MAXBIN_TOOLKIT_PATH = '/kb/deployment/bin/MaxBin'

    def _validate_run_maxbin_params(self, params):
        """
        _validate_run_maxbin_params:
                validates params passed to run_maxbin method

        """
        log('Start validating run_maxbin params')

        # check for required parameters
        for p in [
                'assembly_ref', 'binned_contig_name', 'workspace_name',
                'reads_list'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _stage_reads_list_file(self, reads_list):
        """
        _stage_reads_list_file: download fastq file associated to reads to scratch area
                          and write result_file_path to file
        """

        log('Processing reads object list: {}'.format(reads_list))

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        result_file = os.path.join(result_directory, 'reads_list_file.txt')

        result_file_path = []

        reads = self.ru.download_reads({
            'read_libraries': reads_list,
            'interleaved': 'true'
        })['files']

        for read_obj in reads_list:
            files = reads[read_obj]['files']
            result_file_path.append(files['fwd'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])

        log('Saving reads file path(s) to: {}'.format(result_file))
        with open(result_file, 'w') as file_handler:
            for item in result_file_path:
                file_handler.write("{}\n".format(item))

        return result_file

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from Genome Assembly object
        """

        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        return contig_file

    def _generate_command(self, params):
        """
        _generate_command: generate run_MaxBin.pl params
        """

        command = self.MAXBIN_TOOLKIT_PATH + '/run_MaxBin.pl '

        command += '-contig {} -out {} '.format(params.get('contig_file_path'),
                                                params.get('out_header'))

        if params.get('abund_list_file'):
            command += '-abund_list {} '.format(params.get('abund_list_file'))

        if params.get('reads_list_file'):
            command += '-reads_list {} '.format(params.get('reads_list_file'))

        if params.get('thread'):
            command += '-thread {} '.format(params.get('thread'))

        if params.get('prob_threshold'):
            command += '-prob_threshold {} '.format(
                params.get('prob_threshold'))

        if params.get('markerset'):
            command += '-markerset {} '.format(params.get('markerset'))

        if params.get('min_contig_length'):
            command += '-min_contig_length {} '.format(
                params.get('min_contig_length'))

        if params.get('plotmarker'):
            command += '-plotmarker '

        if params.get('reassembly'):
            command += '-reassembly '

        log('Generated run_MaxBin command: {}'.format(command))

        return command

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'maxbin_result.zip')
        report_file = None

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.fasta')
                            or file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), file)
                    if file.endswith('.marker.pdf'):
                        report_file = os.path.join(root, file)

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by MaxBin2 App'
        })

        if report_file:
            output_files.append({
                'path': report_file,
                'name': os.path.basename(report_file),
                'label': os.path.basename(report_file),
                'description': 'Visualization of the marker by MaxBin2 App'
            })

        return output_files

    def _generate_html_report(self, result_directory, assembly_ref,
                              binned_contig_obj_ref, header):
        """
        _generate_html_report: generate html summary report
        """

        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        Overview_Content = ''

        (binned_contig_count, input_contig_count, too_short_count,
         total_bins_count, total_binned_contig_len, total_contig_length,
         total_short_contig_len) = self._generate_overview_info(
             assembly_ref, binned_contig_obj_ref, result_directory)
        Overview_Content += '<p>Bins: {}</p>'.format(total_bins_count)
        Overview_Content += '<p>Input Contigs: {}</p>'.format(
            input_contig_count)
        Overview_Content += '<p>Binned Contigs: {} ({:.1%})</p>'.format(
            binned_contig_count,
            binned_contig_count / float(input_contig_count))
        Overview_Content += '<p>Unbinned Contigs: {} ({:.1%})</p>'.format(
            input_contig_count - binned_contig_count,
            1 - binned_contig_count / float(input_contig_count))
        Overview_Content += '<p>Contigs Too Short: {} ({:.1%})</p>'.format(
            too_short_count, too_short_count / float(input_contig_count))
        Overview_Content += '<p>Summed Length of Binned Contigs: {} ({:.1%})</p>'.format(
            total_binned_contig_len,
            total_binned_contig_len / float(total_contig_length))
        Overview_Content += '<p>Summed Length of Unbinned Contigs: {} ({:.1%})</p>'.format(
            total_contig_length - total_binned_contig_len,
            1 - total_binned_contig_len / float(total_contig_length))
        Overview_Content += '<p>Summed Length of Short Contigs: {} ({:.1%})</p>'.format(
            total_short_contig_len,
            total_short_contig_len / float(total_contig_length))

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path': result_file_path,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for MaxBin2 App'
        })
        return html_report

    def _generate_overview_info(self, assembly_ref, binned_contig_obj_ref,
                                result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig
        """

        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects(
            {'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')

        total_contig_length = 0
        for contig_id, contig in assembly.get('data').get('contigs').items():
            total_contig_length += int(contig.get('length'))

        binned_contig_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_binned_contig_len = binned_contig.get('data').get(
            'total_contig_len')
        total_bins_count = len(total_bins)
        for contig_bin in total_bins:
            binned_contig_count += len(contig_bin.get('contigs'))

        too_short_count = 0
        total_short_contig_len = 0
        result_files = os.listdir(result_directory)
        for file_name in result_files:
            if file_name.endswith('.tooshort'):
                for record in SeqIO.parse(
                        os.path.join(result_directory, file_name), "fasta"):
                    total_short_contig_len += len(str(record.seq))
                    too_short_count += 1

        return (binned_contig_count, input_contig_count, too_short_count,
                total_bins_count, total_binned_contig_len,
                total_contig_length, total_short_contig_len)

    def _generate_report(self, binned_contig_obj_ref, result_directory,
                         params):
        """
        generate_report: generate summary report

        """
        log('Generating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, params.get('assembly_ref'),
            binned_contig_obj_ref, params.get('out_header'))

        created_objects = []
        created_objects.append({
            "ref": binned_contig_obj_ref,
            "description": "BinnedContigs from MaxBin2"
        })

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': created_objects,
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 266,
            'report_object_name': 'kb_maxbin_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def run_maxbin(self, params):
        """
        run_maxbin: run_MaxBin.pl app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_list: list of reads object (PairedEndLibrary/SingleEndLibrary)
                        upon which MaxBin will be run

        optional params:
            thread: number of threads; default 1
            reassembly: specify this option if you want to reassemble the bins.
                        note that at least one reads file needs to be designated.
            prob_threshold: minimum probability for EM algorithm; default 0.8
            markerset: choose between 107 marker genes by default or 40 marker genes
            min_contig_length: minimum contig length; default 1000
            plotmarker: specify this option if you want to plot the markers in each contig

        ref: http://downloads.jbei.org/data/microbial_communities/MaxBin/README.txt
        """
        log('--->\nrunning MaxBinUtil.run_maxbin\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_maxbin_params(params)
        params['out_header'] = 'Bin'

        contig_file = self._get_contig_file(params.get('assembly_ref'))
        params['contig_file_path'] = contig_file

        reads_list_file = self._stage_reads_list_file(params.get('reads_list'))
        params['reads_list_file'] = reads_list_file

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        command = self._generate_command(params)

        cwd = os.getcwd()
        log('changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)
        self._run_command(command)
        os.chdir(cwd)
        log('changing working dir to {}'.format(cwd))

        log('Saved result files to: {}'.format(result_directory))
        log('Generated files:\n{}'.format('\n'.join(
            os.listdir(result_directory))))

        generate_binned_contig_param = {
            'file_directory': result_directory,
            'assembly_ref': params.get('assembly_ref'),
            'binned_contig_name': params.get('binned_contig_name'),
            'workspace_name': params.get('workspace_name')
        }
        binned_contig_obj_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        reportVal = self._generate_report(binned_contig_obj_ref,
                                          result_directory, params)

        returnVal = {
            'result_directory': result_directory,
            'binned_contig_obj_ref': binned_contig_obj_ref
        }

        returnVal.update(reportVal)

        return returnVal
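
A parameter sketch grounded in the run_maxbin docstring above; all refs and names are placeholders, and `config` must carry the keys read by __init__:

params = {
    'assembly_ref': '1/2/3',             # metagenome assembly object reference
    'binned_contig_name': 'maxbin_bins',
    'workspace_name': 'my_workspace',
    'reads_list': ['4/5/6'],             # PairedEnd/SingleEndLibrary refs
    'thread': 4,                         # optional; default 1
    'prob_threshold': 0.8,               # optional; default 0.8
}
result = MaxBinUtil(config).run_maxbin(params)
print(result['binned_contig_obj_ref'])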
Example #13
def get_mgu():
    mgu = MetagenomeUtils(os.environ['SDK_CALLBACK_URL'])
    return mgu
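
SDK_CALLBACK_URL is injected into the job container's environment by the KBase SDK runner, so the factory takes no arguments:

mgu = get_mgu()  # raises KeyError if run outside an SDK job container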
Example #14
    def test_fractionate_contigs_ASSEMBLY_BINNEDCONTIGS_08(self):
        method = 'fractionate_contigs_pos_filter_ASSEMBLY_BINNEDCONTIGS_08'

        print("\n\nRUNNING: test_" + method + "()")
        print("==========================================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        try:
            mguClient = MetagenomeUtils(self.callback_url,
                                        token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate mguClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        base_1 = 'assembly_1plus2'
        base_2 = 'assembly'
        dir_2 = 'binned_contigs'
        type_1 = 'Assembly'
        type_2 = 'BinnedContigs'

        ass_file_1_fa = base_1 + '.fa.gz'
        ass_path_1_fa = os.path.join(self.scratch, ass_file_1_fa)
        dir_2_path = os.path.join(self.scratch, dir_2)
        shutil.copy(os.path.join("data", ass_file_1_fa), ass_path_1_fa)
        shutil.copytree(os.path.join("data", dir_2), dir_2_path)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_1_fa},
            'workspace_name': self.getWsName(),
            'assembly_name': base_1 + '.' + type_1
        })
        binned_contigs_ref_2 = mguClient.file_to_binned_contigs({
            'file_directory': dir_2_path,
            'workspace_name': self.getWsName(),
            'assembly_ref': ass_ref_1,
            'binned_contig_name': base_2 + '.' + type_2
        })['binned_contig_obj_ref']

        # run method
        base_output_name = method + '_output'
        fractionate_mode = 'neg'
        params = {
            'workspace_name': self.getWsName(),
            'input_assembly_ref': ass_ref_1,
            'input_pos_filter_obj_refs': [binned_contigs_ref_2],
            'fractionate_mode': fractionate_mode,
            'output_name': ('test_fractionated' + '-' + base_1 + '.' + type_1 +
                            '-' + 'binned_contigs_2a2b' + '-' + fractionate_mode)
        }
        result = self.getImpl().run_fractionate_contigs(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
Example #15
 def __init__(self, callback_url, scratch, ws_url):
     self.ws_url = ws_url
     self.callback_url = callback_url
     self.scratch = scratch
     self.dfu = DataFileUtil(callback_url)
     self.mgu = MetagenomeUtils(callback_url)
Example #16
def load_fastas(config, scratch, upa):
    '''
    Returns a dict mapping file-safe UPA ids to the corresponding
    AssemblyUtil.get_assembly_as_fasta result for the given object reference.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    upa = '{}/{}/{}'.format(obj_data['info'][6], obj_data['info'][0], obj_data['info'][4])
    obj_type = obj_data['info'][2]

    id_to_assy_info = {}
    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        # file_output = os.path.join(scratch, "input_fasta.fa")
        faf = au.get_assembly_as_fasta({
            "ref": upa,
            'filename': upa_to_path(scratch, upa)
        })
        return {file_safe_upa(upa): faf}
    elif "KBaseSets.AssemblySet" in obj_type:
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({
                "ref": upa + ';' + item_upa['ref'],
                'filename': upa_to_path(scratch, item_upa['ref'])
            })
            id_to_assy_info[file_safe_upa(item_upa['ref'])] = faf
        return id_to_assy_info
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        return handle_binned_contigs(upa, mgu, scratch)

    for genome_upa in upas:
        # this could be sped up by batching the get_objects call
        # does assy file util not take bulk calls?
        # maybe doesn't matter since Shock doesn't handle bulk calls
        if upa != genome_upa:  # for single genomes, upa and genome_upa will be the same
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get(
            'assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({
            'ref': assembly_upa,
            'filename': upa_to_path(scratch, target_upa)
        })
        id_to_assy_info[file_safe_upa(target_upa)] = faf

    return id_to_assy_info
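
Unlike Example #1, this variant returns a dict keyed by file-safe UPA strings; upa_to_path, file_safe_upa, and handle_binned_contigs are helpers defined elsewhere in the module. A hedged call sketch with placeholder values:

id_to_assy = load_fastas(config, '/kb/module/work/tmp', '123/4/5')
for upa_key, faf in id_to_assy.items():
    print(upa_key, faf['path'])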
Example #17
class VirSorterUtils:
    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        # check the type of the object, e.g. KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            assembly_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            assembly_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': assembly_ref})['path']

    def run_VirSorter(self, params):

        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db', 'no_c']  # keep_db maps to --keep-db

        for bool_arg in bool_args:
            # checked boxes arrive from the narrative UI serialized as 1
            if params[bool_arg] == 1:
                if bool_arg == 'keep_db':
                    bool_arg = 'keep-db'

                command += f' --{bool_arg}'

        self._run_command(command)

        report = self._generate_report(
            params)  # Basically, do everything that's after the tool runs

        return report

    def _run_command(self, command):
        """

        :param command:
        :return:
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]

        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith('## '):
                        # remaining '## ' lines mark category headers (the column header was consumed above)
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        except Exception:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} is not a file. existing files {files}."
            )

        df = pd.DataFrame().from_dict(data, orient='index')
        df.index.name = columns[0]
        df.reset_index(inplace=True)

        html = df.to_html(index=False,
                          classes='my_class table-striped" id = "my_id')

        # Need to write this HTML out to a file below
        direct_html = html_template.substitute(
            html_table=html, affi_contigs_shock_id=affi_contigs_shock_id)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(
                ' style="text-align: right;"', '').replace(
                    'thead>', 'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            # insert the footer immediately after the '</tbody>' tag
            cut = start_pos + len('</tbody>')
            final_html = direct_html[:cut] + '\n' + new_text + direct_html[cut:]

        return final_html

    def get_assembly_contig_ids(self, assembly_ref):
        """get contig ids from assembly_ref"""
        contigs = self.ws.get_objects2(
            {'objects': [{
                'ref': assembly_ref,
                'included': ['contigs']
            }]})['data'][0]['data']['contigs']
        return contigs.keys()

    def _generate_report(self, params):
        """
        Collect VIRSorter outputs, save Assembly/BinnedContigs objects, and
        build the KBase report (everything that happens after the tool runs).

        :param params: dict of app parameters
        :return: dict with report name/ref, result directory and BinnedContigs ref
        """

        # Get URL
        self.dfu = dfu(params['SDK_CALLBACK_URL'])

        # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location
        virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out')

        print(
            f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}'
        )

        # Replacing individual download files with BinnedContigs

        # kb_deseq adds output files, then builds report files and sends all of them to the workspace
        output_files = []  # Appended list of dicts containing attributes

        # Collect all the files needed to report to end-user
        # Get all predicted viral sequences
        pred_fnas = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir,
                                   'VIRSorter_global-phage-signal.csv')

        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))

        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences; directory contents:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )

        if os.path.exists(glob_signal):

            print(f'Identified the global phage signal: {glob_signal}')

            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1

            if lines == 0:
                print('But it is EMPTY!')

        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )

        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })

        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir,
                                       'VIRSorter_predicted_viral_fna.tar.gz')
        # Compress to minimize disk usage
        with tarfile.open(pred_fna_tgz_fp, 'w:gz') as pred_fna_tgz_fh:
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna,
                                    arcname=os.path.basename(pred_fna))
        output_files.append({
            'path':
            pred_fna_tgz_fp,
            'name':
            os.path.basename(pred_fna_tgz_fp),
            'label':
            os.path.basename(pred_fna_tgz_fp),
            'description':
            'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir,
                                      'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path':
            pred_gb_tgz_fp,
            'name':
            os.path.basename(pred_gb_tgz_fp),
            'label':
            os.path.basename(pred_gb_tgz_fp),
            'description':
            'Genbank-formatted sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create a BinnedContig object, each "bin" must be written as a separate file in its own directory
        binned_contig_output_dir = os.path.join(self.scratch,
                                                str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:

            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []

                # Need stats for summary file
                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                with open(category_fp, 'r') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)

                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_',
                                                      '').replace(
                                                          '-circular',
                                                          '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                        record.id = record.id.rsplit('_', 1)[0]

                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                #       fail in some circumstances.
                                #       A more complete check would be to make sure there is a 1:1
                                #       mapping of contig id's in the assembly object as compared to
                                #       the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break

                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # Skip empty files

                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))
                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)

                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh,
                                    'fasta')

                    result = self.au.save_assembly_from_fasta({
                        'file': {
                            'path': dest_fp
                        },
                        'workspace_name':
                        params['workspace_name'],
                        'assembly_name':
                        'VirSorter-Category-{}'.format(category)
                    })

                    created_objects.append({
                        "ref":
                        result,
                        "description":
                        "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but 1st, a little metadata
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref':
            params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(html_fp),
            'label':
            os.path.basename(html_fp),
            'description':
            'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message':
            'Here are the results from your VIRSorter run. Above, you\'ll find a report with '
            'all the identified (putative) viral genomes, and below, links to the report as '
            'well as files generated.',
            'workspace_name':
            params['workspace_name'],
            'html_links':
            html_report,
            'direct_html_link_index':
            0,
            'report_object_name':
            'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links':
            output_files,
            'objects_created':
            created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """
        :param path:
        :return:
        """

        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
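
A note on the _parse_summary logic above: the pyparsing footer trick is terse,
so here is a minimal, self-contained sketch of the same technique (copying a
table's <thead> into a <tfoot> so the header repeats below the body). The HTML
string and variable names are made up for illustration.

from pyparsing import Literal, SkipTo

html = ('<table><thead><tr><th>Contig_id</th></tr></thead>'
        '<tbody><tr><td>contig_1</td></tr></tbody></table>')

# Match '<thead>' plus everything up to (but not including) '</thead>'
header = Literal('<thead>') + SkipTo(Literal('</thead>'))

footer = ''
for tokens, start, end in header.scanString(html):
    # tokens holds '<thead>' and the skipped text; swap the opening tag to a
    # tfoot and close it
    footer = ''.join(tokens).replace('thead>', 'tfoot>') + '</tfoot>'

# Splice the footer in right after '</tbody>' (plain str.index suffices here)
cut = html.index('</tbody>') + len('</tbody>')
print(html[:cut] + footer + html[cut:])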
Example #18
0
class DASToolUtil:
    DASTOOL_THREADS = 2
    BINNER_RESULT_DIRECTORY = 'das_tool_output_dir'
    BINNER_BIN_RESULT_DIR = 'das_tool_output_dir_DASTool_bins'

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def validate_run_das_tool_params(self, params):
        """
        validate_run_das_tool_params:
                validates params passed to run_das_tool method

        """
        log('Start validating run_kb_das_tool params')

        # check for required parameters
        for p in [
                'assembly_ref', 'input_binned_contig_names',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def mkdir_p(self, path):
        """
        mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def run_command(self, command):
        """
        run_command: run command and print result
        """
        #os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(
                exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    def get_contig_file(self, assembly_ref):
        """
        get_contig_file: get contig file from GenomeAssembly object
        """

        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self.get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split(
            '.fa')[0] + "_clean.fa"

        command = '/bin/bash reformat.sh in={} out={} addunderscore'.format(
            assembly, assembly_clean)

        log('running reformat command: {}'.format(command))
        out, err = self.run_command(command)

        return assembly_clean

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self.mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'das_tool_result.zip')
        report_file = None

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:

            # grab all files we want to zip
            for dirname, subdirs, files in os.walk(result_directory):
                for file in files:
                    if (file.endswith('.sam') or file.endswith('.bam')
                            or file.endswith('.bai')
                            or file.endswith('.summary')):
                        continue
                    if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)
                if (dirname.endswith(self.BINNER_BIN_RESULT_DIR)):
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        full = os.path.join(dirname, file)
                        zip_file.write(full, os.path.join(baseDir, file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'Files generated by kb_das_tool App'
        })

        return output_files

    def generate_html_report(self, result_directory, assembly_ref,
                             binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report
        """

        log('Start generating html report')
        #html_report = list()

        output_directory = os.path.join(self.scratch,
                                        'html_dir_' + str(uuid.uuid4()))
        self.mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count,
         total_bins_count) = self.generate_overview_info(
             assembly_ref, binned_contig_obj_ref, result_directory)

        # get pdfs
        pdf_filename_l = [
            f for f in os.listdir(self.BINNER_RESULT_DIRECTORY)
            if f.endswith('.pdf')
        ]
        assert len(pdf_filename_l) == 2

        Overview_Content += '<p>Binned contigs: {}</p>'.format(
            binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(
            input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(
            total_bins_count)
        for pdf_filename in pdf_filename_l:
            Overview_Content += '\n<embed src="{}" width="1000px" height="700px">'.format(
                pdf_filename)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                report_template = report_template.replace(
                    'Summary_Table_Content', Summary_Table_Content)
                result_file.write(report_template)

        # copy pdfs into html dir
        for pdf_filename in pdf_filename_l:
            shutil.copyfile(
                os.path.join(self.BINNER_RESULT_DIRECTORY, pdf_filename),
                os.path.join(output_directory, pdf_filename))

        # save html dir to shock
        def dir_to_shock(dir_path, name, description):
            '''
            For regular directories or html directories

            name - for regular directories: the name of the flat (zip) file returned to ui
                   for html directories: the name of the html file
            '''
            dfu_fileToShock_ret = self.dfu.file_to_shock({
                'file_path': dir_path,
                'make_handle': 0,
                'pack': 'zip',
            })

            dir_shockInfo = {
                'shock_id': dfu_fileToShock_ret['shock_id'],
                'name': name,
                'description': description
            }

            return dir_shockInfo

        html_shockInfo = dir_to_shock(output_directory, 'report.html',
                                      'Report html for DAS tool')
        """
        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_concoct App'})

        return html_report
        """

        return [html_shockInfo]

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref,
                               result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig
        """

        # get assembly and binned_contig objects that already have some data populated in them
        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects(
            {'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        bins_directory = os.path.join(self.scratch, result_directory,
                                      self.BINNER_BIN_RESULT_DIR)
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        binned_contig_count = 0
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, params):
        """
        generate_report: generate summary report

        """
        log('Generating report')
        params['result_directory'] = self.BINNER_RESULT_DIRECTORY

        output_files = self.generate_output_file_list(
            params['result_directory'])

        output_html_files = self.generate_html_report(
            params['result_directory'], params['assembly_ref'],
            binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 500,
            'report_object_name': 'kb_das_tool_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def rename_and_standardize_bin_names(self):
        """
        rename_and_standardize_bin_names: rename result bins to a standardized scheme
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        i = 0
        path_to_result_bins = os.path.join(self.scratch,
                                           self.BINNER_RESULT_DIRECTORY,
                                           "das_tool_output_dir_DASTool_bins")
        path_to_das_tool_key = os.path.abspath(
            path_to_result_bins) + '/das_tool_name_key.tsv'
        with open(path_to_das_tool_key, 'w+') as f:
            f.write("Original.Bin.Name\tRenamed.Bin.Name\n")
            for dirname, subdirs, files in os.walk(path_to_result_bins):
                for file in files:
                    if file.endswith('.fa'):
                        i += 1
                        os.rename(
                            os.path.abspath(path_to_result_bins) + '/' + file,
                            os.path.abspath(path_to_result_bins) + '/bin.' +
                            str(i).zfill(3) +
                            '.fasta')  # need to change to 4 digits
                        f.write(file + '\tbin.' + str(i).zfill(3) + '.fasta\n')

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        make_binned_contig_summary_file_for_binning_apps: write the binned contig summary file
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        result_directory = task_params['result_directory']
        path_to_result_bins = '{}/{}/'.format(
            result_directory, task_params['bin_result_directory'])
        path_to_summary_file = path_to_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(
                            path_to_result_bins, file)
                        bbstats_output_file = os.path.join(
                            self.scratch, self.BINNER_RESULT_DIRECTORY,
                            genome_bin_fna_file).split(
                                '.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(
                            task_params, genome_bin_fna_file,
                            bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(
                            genome_bin_fna_file.split("/")[-1],
                            bbstats_output['contig_bp'],
                            bbstats_output['gc_avg']))
        log('Finished make_binned_contig_summary_file_for_binning_apps function')


    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file,
                                       bbstats_output_file):
        """
        generate_stats_for_genome_bins: run bbtools stats.sh and parse its output
        """
        log("running generate_stats_for_genome_bins on {}".format(
            genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch,
                                           self.BINNER_RESULT_DIRECTORY,
                                           genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(
            genome_bin_fna_file, bbstats_output_file)
        self.run_command(command)
        with open(bbstats_output_file, 'r') as bbstats_fh:
            bbstats_output = bbstats_fh.readlines()[1]
        fields = bbstats_output.split('\t')
        (n_scaffolds, n_contigs, scaf_bp, contig_bp, gap_pct, scaf_N50,
         scaf_L50, ctg_N50, ctg_L50, scaf_N90, scaf_L90, ctg_N90, ctg_L90,
         scaf_max, ctg_max, scaf_n_gt50K, scaf_pct_gt50K) = fields[:17]
        # stats.sh format=3 reports GC as a fraction, so convert to percent
        gc_avg = float(fields[17]) * 100
        gc_std = float(fields[18]) * 100

        log('Generated generate_stats_for_genome_bins command: {}'.format(
            command))

        return {
            'n_scaffolds': n_scaffolds,
            'n_contigs': n_contigs,
            'scaf_bp': scaf_bp,
            'contig_bp': contig_bp,
            'gap_pct': gap_pct,
            'scaf_N50': scaf_N50,
            'scaf_L50': scaf_L50,
            'ctg_N50': ctg_N50,
            'ctg_L50': ctg_L50,
            'scaf_N90': scaf_N90,
            'scaf_L90': scaf_L90,
            'ctg_N90': ctg_N90,
            'ctg_L90': ctg_L90,
            'scaf_max': scaf_max,
            'ctg_max': ctg_max,
            'scaf_n_gt50K': scaf_n_gt50K,
            'scaf_pct_gt50K': scaf_pct_gt50K,
            'gc_avg': gc_avg,
            'gc_std': gc_std
        }

    def generate_das_tool_input_files_and_commands_from_binned_contigs(
            self, params):
        #params['binned_contig_list_file'] = binned_contig_list_file
        binned_contig_names = params['input_binned_contig_names']
        trimmed_binned_contig_name_list = []
        contig_to_bin_file_name_list = []
        for input_ref in binned_contig_names:
            # next line needed for testing
            # binned_contig = self.dfu.get_objects({'object_refs': [input_ref['binned_contig_obj_ref']]})['data'][0]

            # next line needed in production only
            binned_contig = self.dfu.get_objects({'object_refs':
                                                  [input_ref]})['data'][0]
            binned_contig_name = binned_contig.get('info')[1]
            binned_contig_data = binned_contig.get('data')
            bins = binned_contig_data.get('bins')
            trimmed_binned_contig_name = binned_contig_name.split(
                ".BinnedContig")[0]
            trimmed_binned_contig_name_list.append(trimmed_binned_contig_name)
            contig_to_bin_file_name = "{}_contigs_to_bins.tsv".format(
                trimmed_binned_contig_name)
            contig_to_bin_file_name_list.append(contig_to_bin_file_name)

            with open(contig_to_bin_file_name, "w+") as f:
                for bin in bins:
                    bin_id = bin.get('bid')
                    trimmed_bin_id = bin_id.split(".fasta")[0]
                    contigs = bin.get('contigs')
                    for contig_id, contig_value in contigs.items():
                        f.write("{}\t{}.{}\n".format(
                            contig_id, trimmed_binned_contig_name,
                            trimmed_bin_id))
        #contig_to_bin_file_name_list = self.BINNER_RESULT_DIRECTORY + contig_to_bin_file_name
        # temp = str(self.BINNER_RESULT_DIRECTORY) + '/'
        # contig_to_bin_file_name_list = [temp + s for s in contig_to_bin_file_name_list]

        return (trimmed_binned_contig_name_list, contig_to_bin_file_name_list)
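
    # For illustration (hypothetical names): each *_contigs_to_bins.tsv file
    # written above maps contig ids to "<binned contig name>.<bin id>", one
    # tab-separated pair per line, e.g. "contig_17<TAB>my_bins.bin.001".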

    def generate_das_tool_command(self, params,
                                  trimmed_binned_contig_name_list,
                                  contig_to_bin_file_name_list):
        """
        generate_das_tool_command: generate the DAS_Tool command line
        """

        print("\n\nRunning generate_das_tool_command")

        command = 'DAS_Tool '

        command += '-i {} '.format(contig_to_bin_file_name_list)
        command += '-l {} '.format(trimmed_binned_contig_name_list)
        command += '-c {} '.format(params.get('contig_file_path'))
        command += '-o {} '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--search_engine {} '.format(params.get('search_engine'))
        command += '--score_threshold {} '.format(
            params.get('score_threshold'))
        command += '--duplicate_penalty {} '.format(
            params.get('duplicate_penalty'))
        command += '--megabin_penalty {} '.format(
            params.get('megabin_penalty'))
        command += '--write_bin_evals {} '.format(
            params.get('write_bin_evals'))
        command += '--create_plots {} '.format(params.get('create_plots'))
        command += '--write_bins 1 '
        command += '--write_unbinned 0 '
        command += '-t {}'.format(self.DASTOOL_THREADS)

        log('Generated das_tool command: {}'.format(command))

        return command
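
    # For illustration (hypothetical file and object names), the assembled
    # command resembles:
    #   DAS_Tool -i binsA_contigs_to_bins.tsv,binsB_contigs_to_bins.tsv \
    #       -l binsA,binsB -c assembly_clean.fa -o das_tool_output_dir \
    #       --search_engine diamond --score_threshold 0.5 \
    #       --duplicate_penalty 0.6 --megabin_penalty 0.5 \
    #       --write_bin_evals 1 --create_plots 1 --write_bins 1 \
    #       --write_unbinned 0 -t 2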

    def run_das_tool(self, params):
        """
        run_das_tool: DAS_Tool app

        required params:
            assembly_ref: Metagenome assembly object reference
            input_binned_contig_names: list of BinnedContig objects
            output_binned_contig_name: output BinnedContig object name
            workspace_name: the name of the workspace it gets saved to.

        optional params:
            search_engine: default diamond
            score_threshold: default 0.5
            duplicate_penalty: default 0.6
            megabin_penalty: default 0.5
            write_bin_evals: default 1
            create_plots: default 1
            write_bins: default 1
            write_unbinned: default 0

        ref: https://github.com/cmks/DAS_Tool
        """
        log('--->\nrunning DASToolUtil.run_das_tool\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_run_das_tool_params(params)

        print("\n\nFinished running validate_run_das_tool_params")
        #
        contig_file = self.get_contig_file(params.get('assembly_ref'))
        params['contig_file_path'] = contig_file

        result_directory = os.path.join(self.scratch,
                                        self.BINNER_RESULT_DIRECTORY)
        params['result_directory'] = result_directory
        self.mkdir_p(result_directory)

        cwd = os.getcwd()
        log('Changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        (
            trimmed_binned_contig_name_list, contig_to_bin_file_name_list
        ) = self.generate_das_tool_input_files_and_commands_from_binned_contigs(
            params)
        trimmed_binned_contig_name_list = ','.join(
            trimmed_binned_contig_name_list)
        contig_to_bin_file_name_list = ','.join(contig_to_bin_file_name_list)

        log(os.listdir(result_directory))
        log("trimmed_binned_contig_name_list {}".format(
            trimmed_binned_contig_name_list))
        log("contig_to_bin_file_name_list {}".format(
            contig_to_bin_file_name_list))

        # binned_contig_to_file_params = {
        #     'input_ref': input_ref['binned_contig_obj_ref'],
        #     'save_to_shock': 1,
        #     'bin_file_directory': '{}/bin_set_{}/'.format(result_directory, i),
        #     'workspace_name': params.get('workspace_name'),
        # }
        #
        # self.mgu.binned_contigs_to_file(binned_contig_to_file_params) # returns "binned_contig_obj_ref" of type "obj_ref" (An X/Y/Z style reference)

        #shutil.copytree(bin_file_directory, os.path.join(result_directory, bin_file_directory))
        #print('\n\n\n result: {}'.format(self.mgu.binned_contigs_to_file(binned_contig_to_file_params)))

        # run DAS_Tool
        command = self.generate_das_tool_command(
            params, trimmed_binned_contig_name_list,
            contig_to_bin_file_name_list)
        log('\nWorking dir is {}'.format(result_directory))
        log('\nWorking dir is {}'.format(os.getcwd()))
        log('Changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)
        self.run_command(command)

        os.chdir(self.scratch)

        task_params = {}
        task_params['result_directory'] = self.scratch
        task_params['bin_result_directory'] = os.path.join(
            self.BINNER_RESULT_DIRECTORY, "das_tool_output_dir_DASTool_bins")

        # check to make sure bins were generated, otherwise no need to run the rest
        if not os.path.exists(task_params['bin_result_directory']):
            log('DAS_Tool did not succeed in generating a set of bins using the input bins and parameters - skipping the creation of a new BinnedContig object.'
                )
            log('Note: this result is sometimes expected using the DAS-Tool workflow; it is possible that DAS-Tool cannot optimize the input binned contigs.'
                )
            log('KBase is aware of this error!')
            log('Currently KBase manages this run instance as an error because KBase is expecting an output set of binned contigs.'
                )
            raise ValueError(
                'No bins generated - this is one of the expected results when DAS-Tool cannot optimize the input bins, and not necessarily an error. KBase is aware of the issue where DAS-Tool runs successfully but does not produce any output set of optimized bins.'
            )
        else:
            self.rename_and_standardize_bin_names()
            self.make_binned_contig_summary_file_for_binning_apps(task_params)

            generate_binned_contig_param = {
                'file_directory':
                os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY,
                             "das_tool_output_dir_DASTool_bins"),
                'assembly_ref':
                params.get('assembly_ref'),
                'binned_contig_name':
                params.get('output_binned_contig_name'),
                'workspace_name':
                params.get('workspace_name')
            }

            binned_contig_obj_ref = self.mgu.file_to_binned_contigs(
                generate_binned_contig_param).get('binned_contig_obj_ref')

            reportVal = self.generate_report(binned_contig_obj_ref, params)

            returnVal = {
                'result_directory':
                os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY),
                'binned_contig_obj_ref':
                binned_contig_obj_ref
            }

            returnVal.update(reportVal)

        return returnVal
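
A note on generate_stats_for_genome_bins above: `stats.sh format=3` output is
parsed by position, which is easy to get wrong. Below is a minimal,
self-contained sketch of the same parsing under the assumptions the code above
makes (a two-line, tab-separated output with one header row and one data row,
and GC reported as a fraction); the field list mirrors the dict returned above,
and the file name is hypothetical.

FIELDS = ['n_scaffolds', 'n_contigs', 'scaf_bp', 'contig_bp', 'gap_pct',
          'scaf_N50', 'scaf_L50', 'ctg_N50', 'ctg_L50', 'scaf_N90',
          'scaf_L90', 'ctg_N90', 'ctg_L90', 'scaf_max', 'ctg_max',
          'scaf_n_gt50K', 'scaf_pct_gt50K', 'gc_avg', 'gc_std']

def parse_bbstats(path):
    with open(path) as fh:
        # skip the header row; keep the single tab-separated data row
        data_row = fh.readlines()[1].rstrip('\n').split('\t')
    stats = dict(zip(FIELDS, data_row))
    # convert the GC fraction to a percentage, as the code above does
    stats['gc_avg'] = float(stats['gc_avg']) * 100
    stats['gc_std'] = float(stats['gc_std']) * 100
    return stats

# stats = parse_bbstats('bin.001.bbstatsout')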
Example #19
0
 def __init__(self, config):
     self.scratch = os.path.abspath(config['scratch'])
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.mgu = MetagenomeUtils(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.ws = Workspace(config['workspace-url'], token=config['token'])
Example #20
0
class CocacolaUtil:
    CONCOCT_BASE_PATH = '/kb/deployment/bin/CONCOCT'
    COCACOLA_BASE_PATH = '/kb/module/lib/kb_cocacola/bin/COCACOLA-python'
    BINNER_RESULT_DIRECTORY = 'cocacola_output_dir'
    BINNER_BIN_RESULT_DIR = 'final_bins'
    MAPPING_THREADS = 16
    BBMAP_MEM = '30g'

    def __init__(self, config):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.mgu = MetagenomeUtils(self.callback_url)

    def _validate_run_cocacola_params(self, task_params):
        """
        _validate_run_cocacola_params:
                validates params passed to run_cocacola method
        """
        log('Start validating run_cocacola params')

        # check for required parameters
        for p in ['assembly_ref', 'binned_contig_name', 'workspace_name', 'reads_list', 'read_mapping_tool']:
            if p not in task_params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """
        os.chdir(self.scratch)
        log('Start executing command:\n{}'.format(command))
        log('Command is running from:\n{}'.format(self.scratch))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, stderr = pipe.communicate()
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\n'.format(exitCode))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nStderr:\n{}'.format(exitCode, output, stderr)
            raise ValueError(error_msg)
        return (output, stderr)

    # this function has been customized to return read_type variable (interleaved vs single-end library)
    def stage_reads_list_file(self, reads_list):
        """
        stage_reads_list_file: download fastq file associated to reads to scratch area
                          and return result_file_path
        """

        log('Processing reads object list: {}'.format(reads_list))

        result_file_path = []
        read_type = []

        # getting from workspace and writing to scratch. The 'reads' dictionary now has file paths to scratch.
        reads = self.ru.download_reads({'read_libraries': reads_list, 'interleaved': None})['files']

        # reads_list is the list of workspace references (e.g. 12804/1/1).
        # "reads" is a hash of hashes keyed by reference (read_obj here);
        # "files" is the secondary key, and the tertiary keys include "fwd",
        # "rev" and "type".
        for read_obj in reads_list:
            files = reads[read_obj]['files']    # 'files' is dictionary where 'fwd' is key of file path on scratch.
            result_file_path.append(files['fwd'])
            read_type.append(files['type'])
            if 'rev' in files and files['rev'] is not None:
                result_file_path.append(files['rev'])

        return result_file_path, read_type
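
    # For illustration, the download_reads return value used above roughly
    # looks like (hypothetical ref and paths):
    #   {'12804/1/1': {'files': {'fwd': '/scratch/reads.fastq', 'rev': None,
    #                            'type': 'interleaved', ...}}}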

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from GenomeAssembly object
        """
        contig_file = self.au.get_assembly_as_fasta({'ref': assembly_ref}).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path': contig_file})['file_path']

        return contig_file

    def retrieve_and_clean_assembly(self, task_params):
        if os.path.exists(task_params['contig_file_path']):
            assembly = task_params['contig_file_path']
            print("FOUND ASSEMBLY ON LOCAL SCRATCH")
        else:
            # we are on njsw so lets copy it over to scratch
            assembly = self._get_contig_file(task_params['assembly_ref'])

        # remove spaces from fasta headers because that breaks bedtools
        assembly_clean = os.path.abspath(assembly).split('.fa')[0] + "_clean.fa"

        command = '/bin/bash reformat.sh in={} out={} addunderscore overwrite=true'.format(assembly, assembly_clean)

        log('running reformat command: {}'.format(command))
        out, err = self._run_command(command)

        return assembly_clean

    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """ removes all contigs less than the min_contig_length provided """
        filtered_fasta_file_path = os.path.abspath(fasta_file_path).split('.fa')[0] + "_filtered.fa"

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path
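
    # For illustration (hypothetical path and threshold):
    #   filtered = self.filter_contigs_by_length('/scratch/assembly.fa', 2000)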

    def generate_stats_for_genome_bins(self, task_params, genome_bin_fna_file, bbstats_output_file):
        """
        generate_stats_for_genome_bins: run bbtools stats.sh and parse its output
        """
        log("running generate_stats_for_genome_bins on {}".format(genome_bin_fna_file))
        genome_bin_fna_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY, genome_bin_fna_file)
        command = '/bin/bash stats.sh in={} format=3 > {}'.format(genome_bin_fna_file, bbstats_output_file)
        self._run_command(command)
        with open(bbstats_output_file, 'r') as bbstats_fh:
            bbstats_output = bbstats_fh.readlines()[1]
        fields = bbstats_output.split('\t')
        (n_scaffolds, n_contigs, scaf_bp, contig_bp, gap_pct, scaf_N50,
         scaf_L50, ctg_N50, ctg_L50, scaf_N90, scaf_L90, ctg_N90, ctg_L90,
         scaf_max, ctg_max, scaf_n_gt50K, scaf_pct_gt50K) = fields[:17]
        # stats.sh format=3 reports GC as a fraction, so convert to percent
        gc_avg = float(fields[17]) * 100
        gc_std = float(fields[18]) * 100

        log('Generated generate_stats_for_genome_bins command: {}'.format(command))

        return {'n_scaffolds': n_scaffolds,
                'n_contigs': n_contigs,
                'scaf_bp': scaf_bp,
                'contig_bp': contig_bp,
                'gap_pct': gap_pct,
                'scaf_N50': scaf_N50,
                'scaf_L50': scaf_L50,
                'ctg_N50': ctg_N50,
                'ctg_L50': ctg_L50,
                'scaf_N90': scaf_N90,
                'scaf_L90': scaf_L90,
                'ctg_N90': ctg_N90,
                'ctg_L90': ctg_L90,
                'scaf_max': scaf_max,
                'ctg_max': ctg_max,
                'scaf_n_gt50K': scaf_n_gt50K,
                'scaf_pct_gt50K': scaf_pct_gt50K,
                'gc_avg': gc_avg,
                'gc_std': gc_std
                }

    def deinterlace_raw_reads(self, fastq):
        fastq_forward = fastq.split('.fastq')[0] + "_forward.fastq"
        fastq_reverse = fastq.split('.fastq')[0] + "_reverse.fastq"
        command = 'reformat.sh in={} out1={} out2={} overwrite=true'.format(fastq, fastq_forward, fastq_reverse)
        self._run_command(command)
        return (fastq_forward, fastq_reverse)

    def run_read_mapping_interleaved_pairs_mode(self, task_params, assembly_clean, fastq, sam):
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in interleaved mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=true mappedonly nodisk overwrite'
        elif task_params['read_mapping_tool'] == 'bwa':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} '.format(fastq_forward)
            command += '{} > '.format(fastq_reverse)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            (fastq_forward, fastq_reverse) = self.deinterlace_raw_reads(fastq)
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-1 {} '.format(fastq_forward)
            command += '-2 {} '.format(fastq_reverse)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def run_read_mapping_unpaired_mode(self, task_params, assembly_clean, fastq, sam):
        read_mapping_tool = task_params['read_mapping_tool']
        log("running {} mapping in single-end (unpaired) mode.".format(read_mapping_tool))
        if task_params['read_mapping_tool'] == 'bbmap':
            command = 'bbmap.sh -Xmx{} '.format(self.BBMAP_MEM)
            command += 'threads={} '.format(self.MAPPING_THREADS)
            command += 'ref={} '.format(assembly_clean)
            command += 'in={} '.format(fastq)
            command += 'out={} '.format(sam)
            command += 'fast interleaved=false mappedonly nodisk overwrite'
            # BBMap is deterministic without the deterministic flag if using single-ended reads
        elif task_params['read_mapping_tool'] == 'bwa':
            command = 'bwa index {} && '.format(assembly_clean)
            command += 'bwa mem -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_default':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'bowtie2_very_sensitive':
            bt2index = os.path.basename(assembly_clean) + '.bt2'
            command = 'bowtie2-build -f {} '.format(assembly_clean)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '{} && '.format(bt2index)
            command += 'bowtie2 --very-sensitive -x {} '.format(bt2index)
            command += '-U {} '.format(fastq)
            command += '--threads {} '.format(self.MAPPING_THREADS)
            command += '-S {}'.format(sam)
        elif task_params['read_mapping_tool'] == 'minimap2':
            command = 'minimap2 -ax sr -t {} '.format(self.MAPPING_THREADS)
            command += '{} '.format(assembly_clean)
            command += '{} > '.format(fastq)
            command += '{}'.format(sam)
        elif task_params['read_mapping_tool'] == 'hisat2':
            ht2index = os.path.basename(assembly_clean) + '.ht2'
            command = 'hisat2-build {} '.format(assembly_clean)
            command += '{} && '.format(ht2index)
            command += 'hisat2 -x {} '.format(ht2index)
            command += '-U {} '.format(fastq)
            command += '-S {} '.format(sam)
            command += '--threads {}'.format(self.MAPPING_THREADS)
        log('running alignment command: {}'.format(command))
        out, err = self._run_command(command)

    def convert_sam_to_sorted_and_indexed_bam(self, sam):
        # create bam files from sam files
        sorted_bam = os.path.abspath(sam).split('.sam')[0] + "_sorted.bam"

        command = 'samtools view -F 0x04 -uS {} | '.format(sam)
        command += 'samtools sort - -o {}'.format(sorted_bam)

        log('running samtools command to generate sorted bam: {}'.format(command))
        self._run_command(command)

        # verify we got bams
        if not os.path.exists(sorted_bam):
            log('Failed to find bam file\n{}'.format(sorted_bam))
            sys.exit(1)
        elif(os.stat(sorted_bam).st_size == 0):
            log('Bam file is empty\n{}'.format(sorted_bam))
            sys.exit(1)

        # index the bam file
        command = 'samtools index {}'.format(sorted_bam)

        log('running samtools command to index sorted bam: {}'.format(command))
        self._run_command(command)

        return sorted_bam
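
    # A note on the samtools pipeline in convert_sam_to_sorted_and_indexed_bam:
    # `view -F 0x04` drops unmapped reads, `-u` streams uncompressed BAM into
    # `sort`, and `index` writes the .bai beside the sorted BAM. Illustrative
    # call (hypothetical path, not from a real run):
    #   sorted_bam = self.convert_sam_to_sorted_and_indexed_bam('reads.sam')
    #   # -> 'reads_sorted.bam' plus 'reads_sorted.bam.bai' on disk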

    def generate_alignment_bams(self, task_params, assembly_clean):
        """
            This function runs the selected read mapper and creates the
            sorted and indexed bam files from sam files using samtools.
        """

        reads_list = task_params['reads_list']

        (read_scratch_path, read_type) = self.stage_reads_list_file(reads_list)

        sorted_bam_file_list = []

        # list of reads files; there can be one or more. Reads are assumed to be
        # either unpaired or interleaved; forward and reverse reads supplied as
        # separate (non-interleaved) files are not handled.

        for i in range(len(read_scratch_path)):
            fastq = read_scratch_path[i]
            fastq_type = read_type[i]

            sam = os.path.basename(fastq).split('.fastq')[0] + ".sam"
            sam = os.path.join(self.BINNER_RESULT_DIRECTORY, sam)

            if fastq_type == 'interleaved':  # TODO: needs test coverage
                log("Running interleaved read mapping mode")
                self.run_read_mapping_interleaved_pairs_mode(task_params, assembly_clean, fastq, sam)
            else:  # running read mapping in single-end mode
                log("Running unpaired read mapping mode")
                self.run_read_mapping_unpaired_mode(task_params, assembly_clean, fastq, sam)

            sorted_bam = self.convert_sam_to_sorted_and_indexed_bam(sam)

            sorted_bam_file_list.append(sorted_bam)

        return sorted_bam_file_list
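
    # Sketch of the generate_alignment_bams flow (values are hypothetical):
    #   task_params = {'reads_list': [reads_upa], 'read_mapping_tool': 'minimap2', ...}
    #   bams = self.generate_alignment_bams(task_params, 'assembly_clean.fa')
    #   # -> ['<result_dir>/reads_sorted.bam', ...], one sorted+indexed BAM per reads file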

    def generate_make_coverage_table_command(self, task_params, sorted_bam_file_list):
        # create the depth file for this bam
        # NOTE: sorted_bam_file_list is currently unused; the bam comes from task_params
        min_contig_length = task_params['min_contig_length']
        sorted_bam = task_params['sorted_bam']

        depth_file_path = os.path.join(self.scratch, 'cocacola_depth.txt')
        command = '/kb/module/lib/kb_cocacola/bin/jgi_summarize_bam_contig_depths '
        command += '--outputDepth {} '.format(depth_file_path)
        command += '--minContigLength {} '.format(min_contig_length)
        command += '--minContigDepth 1 {}'.format(sorted_bam)

        log('running summarize_bam_contig_depths command: {}'.format(command))
        self._run_command(command)

        return depth_file_path

    def generate_cocacola_cut_up_fasta_command(self, task_params):
        """
        generate_command: cocacola cut_up_fasta
        """
        contig_file_path = task_params['contig_file_path']
        contig_split_size = task_params['contig_split_size']
        contig_split_overlap = task_params['contig_split_overlap']

        log("\n\nRunning generate_cocacola_cut_up_fasta_command")

        command = 'python {}/scripts/cut_up_fasta.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '-c {} '.format(contig_split_size)
        command += '-o {} '.format(contig_split_overlap)
        command += '--merge_last -b temp.bed > {}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola_cut_up_fasta command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_input_table_from_bam(self, task_params):
        """
        generate_command: cocacola generate input table
        """
        log("\n\nRunning generate_cocacola_input_table_from_bam")
        command = 'python {}/scripts/gen_input_table.py '.format(self.CONCOCT_BASE_PATH)

        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/*_sorted.bam > '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{}/coverage_table.tsv'.format(self.BINNER_RESULT_DIRECTORY)
        log('Generated cocacola input table from bam command: {}'.format(command))
        # count the contigs in the split fasta; fasta_to_features.py needs this count
        calc_contigs = 0
        with open('{}/split_contigs.fa'.format(self.BINNER_RESULT_DIRECTORY)) as split_contigs:
            for line in split_contigs:
                if line.startswith(">"):
                    calc_contigs += 1
        task_params['calc_contigs'] = calc_contigs
        self._run_command(command)

    def generate_cocacola_kmer_composition_table(self, task_params):
        """
        generate_command: cocacola generate kmer composition table
        """
        log("\n\nRunning generate_cocacola_kmer_composition_table")
        calc_contigs = task_params['calc_contigs']
        kmer_size = task_params['kmer_size']
        command = 'python {}/scripts/fasta_to_features.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '{} '.format(calc_contigs)
        command += '{} '.format(kmer_size)
        command += '{}/split_contigs_kmer_{}.csv'.format(self.BINNER_RESULT_DIRECTORY, kmer_size)
        log('Generated cocacola kmer composition table command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_command(self, task_params):
        """
        generate_command: cocacola
        """

        min_contig_length = task_params['min_contig_length']
        kmer_size = task_params['kmer_size']

        log("\n\nRunning generate_cocacola_command")
        command = 'python {}/cocacola.py '.format(self.COCACOLA_BASE_PATH)
        command += '--contig_file {}/split_contigs.fa '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--abundance_profiles {}/coverage_table.tsv '.format(self.BINNER_RESULT_DIRECTORY)
        command += '--composition_profiles {}/split_contigs_kmer_{}.csv '.format(self.BINNER_RESULT_DIRECTORY,
                                                                                 kmer_size)
        command += '--output {}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                           min_contig_length)

        log('Generated cocacola command: {}'.format(command))

        self._run_command(command)

    def add_header_to_post_clustering_file(self, task_params):
        min_contig_length = task_params['min_contig_length']
        header = "contig_id,cluster_id"
        with open('{}/cocacola_output_clusters_min{}_headers.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                         min_contig_length), 'w') as outfile:
            outfile.write(header)
            with open('{}/cocacola_output_clusters_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY,
                                                                     min_contig_length), 'r') as datafile:
                for line in datafile:
                    outfile.write(line)

    def generate_cocacola_post_clustering_merging_command(self, task_params):
        """
        generate_command: cocacola post cluster merging
        """
        min_contig_length = task_params['min_contig_length']
        log("\n\nRunning generate_cocacola_post_clustering_merging_command")

        command = 'python {}/scripts/merge_cutup_clustering.py '.format(self.CONCOCT_BASE_PATH)
        command += '{}/cocacola_output_clusters_min{}_headers.csv > '.format(self.BINNER_RESULT_DIRECTORY,
                                                                             min_contig_length)
        command += '{}/clustering_merged_min{}.csv'.format(self.BINNER_RESULT_DIRECTORY, min_contig_length)
        log('Generated generate_cocacola_post_clustering_merging command: {}'.format(command))

        self._run_command(command)

    def generate_cocacola_extract_fasta_bins_command(self, task_params):
        """
        generate_command: cocacola extract_fasta_bins
        """
        log("\n\nRunning generate_cocacola_extract_fasta_bins_command")

        contig_file_path = task_params['contig_file_path']
        min_contig_length = task_params['min_contig_length']

        bin_result_directory = self.BINNER_RESULT_DIRECTORY + '/' + self.BINNER_BIN_RESULT_DIR
        self._mkdir_p(bin_result_directory)
        command = 'python {}/scripts/extract_fasta_bins.py '.format(self.CONCOCT_BASE_PATH)
        command += '{} '.format(contig_file_path)
        command += '{}/clustering_merged_min{}.csv '.format(self.BINNER_RESULT_DIRECTORY, min_contig_length)
        command += '--output_path {}/{}'.format(self.BINNER_RESULT_DIRECTORY, self.BINNER_BIN_RESULT_DIR)
        log('Generated generate_cocacola_extract_fasta_bins_command command: {}'.format(command))

        self._run_command(command)

    def rename_and_standardize_bin_names(self, task_params):
        """
        generate_command: generate renamed bins
        """
        log("\n\nRunning rename_and_standardize_bin_names")
        path_to_cocacola_result_bins = os.path.abspath(self.BINNER_RESULT_DIRECTORY) + \
            '/' + self.BINNER_BIN_RESULT_DIR + '/'
        for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
            for file in files:
                if file.endswith('.fa'):
                    os.rename(os.path.abspath(path_to_cocacola_result_bins) + '/' +
                              file, os.path.abspath(path_to_cocacola_result_bins) + '/bin.' +
                              file.split('.fa')[0].zfill(3) + '.fasta')  # need to change to 4 digits

    def make_binned_contig_summary_file_for_binning_apps(self, task_params):
        """
        generate_command: generate binned contig summary command
        """
        log("\n\nRunning make_binned_contig_summary_file_for_binning_apps")
        path_to_cocacola_result = os.path.abspath(self.BINNER_RESULT_DIRECTORY)
        path_to_cocacola_result_bins = '{}/{}/'.format(path_to_cocacola_result, self.BINNER_BIN_RESULT_DIR)
        path_to_summary_file = path_to_cocacola_result_bins + 'binned_contig.summary'
        with open(path_to_summary_file, 'w+') as f:
            f.write("Bin name\tCompleteness\tGenome size\tGC content\n")
            for dirname, subdirs, files in os.walk(path_to_cocacola_result_bins):
                for file in files:
                    if file.endswith('.fasta'):
                        genome_bin_fna_file = os.path.join(self.BINNER_BIN_RESULT_DIR, file)
                        bbstats_output_file = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY,
                                                           genome_bin_fna_file).split('.fasta')[0] + ".bbstatsout"
                        bbstats_output = self.generate_stats_for_genome_bins(task_params,
                                                                             genome_bin_fna_file,
                                                                             bbstats_output_file)
                        f.write('{}\t0\t{}\t{}\n'.format(genome_bin_fna_file.split("/")[-1],
                                                         bbstats_output['contig_bp'],
                                                         bbstats_output['gc_avg']))
        log('Finished make_binned_contig_summary_file_for_binning_apps function')

    def generate_output_file_list(self, result_directory):
        """
        generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cocacola_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:

            for dirname, subdirs, files in os.walk(result_directory):
                if dirname.endswith(self.BINNER_BIN_RESULT_DIR):
                    # keep the bin fastas, nested under their own folder in the zip
                    baseDir = os.path.basename(dirname)
                    for file in files:
                        zip_file.write(os.path.join(dirname, file),
                                       os.path.join(baseDir, file))
                    continue
                for file in files:
                    # skip bulky intermediate alignment files and the summary
                    if file.endswith(('.sam', '.bam', '.bai', '.summary')):
                        continue
                    zip_file.write(os.path.join(dirname, file), file)

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'Files generated by kb_cocacola App'})

        return output_files

    def generate_html_report(self, result_directory, assembly_ref, binned_contig_obj_ref):
        """
        generate_html_report: generate html summary report
        """

        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        # get summary data from existing assembly object and bins_objects
        Summary_Table_Content = ''
        Overview_Content = ''
        (binned_contig_count, input_contig_count, total_bins_count) = \
            self.generate_overview_info(assembly_ref, binned_contig_obj_ref, result_directory)

        Overview_Content += '<p>Binned contigs: {}</p>'.format(binned_contig_count)
        Overview_Content += '<p>Input contigs: {}</p>'.format(input_contig_count)
        Overview_Content += '<p>Number of bins: {}</p>'.format(total_bins_count)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          Overview_Content)
                report_template = report_template.replace('Summary_Table_Content',
                                                          Summary_Table_Content)
                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for kb_cocacola App'})
        return html_report

    def generate_overview_info(self, assembly_ref, binned_contig_obj_ref, result_directory):
        """
        _generate_overview_info: generate overview information from assembly and binnedcontig
        """

        # get assembly and binned_contig objects that already have some data populated in them
        assembly = self.dfu.get_objects({'object_refs': [assembly_ref]})['data'][0]
        binned_contig = self.dfu.get_objects({'object_refs': [binned_contig_obj_ref]})['data'][0]

        input_contig_count = assembly.get('data').get('num_contigs')
        binned_contig_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        for contig_bin in total_bins:
            binned_contig_count += len(contig_bin.get('contigs'))

        return (binned_contig_count, input_contig_count, total_bins_count)

    def generate_report(self, binned_contig_obj_ref, task_params):
        """
        generate_report: generate summary report
        """
        log('Generating report')

        result_directory = os.path.join(self.scratch, "cocacola_output_dir")

        task_params['result_directory'] = result_directory

        output_files = self.generate_output_file_list(task_params['result_directory'])

        output_html_files = self.generate_html_report(task_params['result_directory'],
                                                      task_params['assembly_ref'],
                                                      binned_contig_obj_ref)

        report_params = {
            'message': '',
            'workspace_name': task_params['workspace_name'],
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 266,
            'report_object_name': 'kb_cocacola_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def create_dict_from_depth_file(self, depth_file_path):
        # keep contig order (required by metabat2)
        depth_file_dict = {}
        with open(depth_file_path, 'r') as f:
            header = f.readline().rstrip().split("\t")
            for line in f:
                # deal with cases where the fastq name has spaces; assume the
                # first non-whitespace word is unique and use it as the ID
                vals = line.rstrip().split("\t")
                if ' ' in vals[0]:
                    ID = vals[0].split()[0]
                else:
                    ID = vals[0]
                depth_file_dict[ID] = vals[1:]
            depth_file_dict['header'] = header
        return depth_file_dict
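
    # For orientation, jgi_summarize_bam_contig_depths emits a tab-separated table
    # whose first columns are contigName, contigLen and totalAvgDepth, followed by
    # per-BAM depth (and variance) columns. An illustrative (made-up) row:
    #   contig_1    12001    5.3    5.3    1.2
    # create_dict_from_depth_file would then return something like
    #   {'contig_1': ['12001', '5.3', '5.3', '1.2'], 'header': [...]}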

    def run_cocacola(self, task_params):
        """
        run_cocacola: cocacola app

        required params:
            assembly_ref: Metagenome assembly object reference
            binned_contig_name: BinnedContig object name and output file header
            workspace_name: the name of the workspace it gets saved to.
            reads_list: list of reads objects (PairedEndLibrary/SingleEndLibrary)
            upon which COCACOLA will be run

        optional params:
            min_contig_length: minimum contig length; default 1000

            ref: https://github.com/BinPro/CONCOCT/blob/develop/README.md
        """
        log('--->\nrunning CocacolaUtil.run_cocacola\n' +
            'task_params:\n{}'.format(json.dumps(task_params, indent=1)))

        self._validate_run_cocacola_params(task_params)

        # get assembly
        contig_file = self._get_contig_file(task_params['assembly_ref'])
        task_params['contig_file_path'] = contig_file

        # clean the assembly file so that there are no spaces in the fasta headers
        assembly_clean = self.retrieve_and_clean_assembly(task_params)

        assembly_clean_temp = self.filter_contigs_by_length(assembly_clean, task_params['min_contig_length'])

        task_params['contig_file_path'] = assembly_clean_temp
        assembly_clean = assembly_clean_temp  # need to clean this up, ugly redundant variable usage

        # get reads
        (reads_list_file, read_type) = self.stage_reads_list_file(task_params['reads_list'])
        task_params['read_type'] = read_type
        task_params['reads_list_file'] = reads_list_file

        # prep result directory
        result_directory = os.path.join(self.scratch, self.BINNER_RESULT_DIRECTORY)
        self._mkdir_p(result_directory)

        cwd = os.getcwd()
        log('changing working dir to {}'.format(result_directory))
        os.chdir(result_directory)

        # run alignments, and update input contigs to use the clean file
        # this function has an internal loop to generate a sorted bam file for each input read file
        self.generate_alignment_bams(task_params, assembly_clean)

        # not used right now
        # depth_file_path = self.generate_make_coverage_table_command(task_params, sorted_bam_file_list)
        # depth_dict = self.create_dict_from_depth_file(depth_file_path)

        # run cocacola prep, cut up fasta input
        self.generate_cocacola_cut_up_fasta_command(task_params)

        # run cocacola prep, generate coverage tables from bam
        self.generate_cocacola_input_table_from_bam(task_params)

        # run cocacola prep, generate kmer table
        self.generate_cocacola_kmer_composition_table(task_params)

        # run cocacola prep and cocacola
        self.generate_cocacola_command(task_params)

        # run command to add header to output file
        self.add_header_to_post_clustering_file(task_params)

        # run cocacola post cluster merging command
        self.generate_cocacola_post_clustering_merging_command(task_params)

        # run extract bins command
        self.generate_cocacola_extract_fasta_bins_command(task_params)

        # run fasta renaming
        self.rename_and_standardize_bin_names(task_params)

        # make binned contig summary file
        self.make_binned_contig_summary_file_for_binning_apps(task_params)

        # file handling and management
        os.chdir(cwd)
        log('changing working dir to {}'.format(cwd))

        log('Saved result files to: {}'.format(result_directory))
        log('Generated files:\n{}'.format('\n'.join(os.listdir(result_directory))))

        # make new BinnedContig object and upload to KBase
        generate_binned_contig_param = {
            'file_directory': os.path.join(result_directory, self.BINNER_BIN_RESULT_DIR),
            'assembly_ref': task_params['assembly_ref'],
            'binned_contig_name': task_params['binned_contig_name'],
            'workspace_name': task_params['workspace_name']
        }

        binned_contig_obj_ref = \
            self.mgu.file_to_binned_contigs(generate_binned_contig_param).get('binned_contig_obj_ref')

        # generate report
        reportVal = self.generate_report(binned_contig_obj_ref, task_params)
        returnVal = {
            'result_directory': result_directory,
            'binned_contig_obj_ref': binned_contig_obj_ref
        }
        returnVal.update(reportVal)

        return returnVal
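
A minimal driver sketch for the binner above. The constructor is assumed to follow
the usual KBase SDK utility pattern (a config dict carrying the callback URL and
scratch path), and every object reference below is a placeholder, not a real UPA:

import os

config = {'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
          'scratch': '/kb/module/work/tmp',
          'shock-url': 'https://kbase.us/services/shock-api'}
cocacola = CocacolaUtil(config)  # assumed constructor shape
task_params = {'assembly_ref': '123/4/5',     # placeholder assembly UPA
               'binned_contig_name': 'my_bins',
               'workspace_name': 'my_workspace',
               'reads_list': ['123/6/7'],     # placeholder reads UPA
               'read_mapping_tool': 'minimap2',
               'min_contig_length': 1000,
               'contig_split_size': 10000,
               'contig_split_overlap': 0,
               'kmer_size': 4}
result = cocacola.run_cocacola(task_params)
print(result['binned_contig_obj_ref'])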
Example No. 21
0
def process_kbase_objects(host_ref, virus_ref, shared_folder, callback,
                          workspace, token):
    """
    Convert KBase object(s) into usable files for VirMatcher
    :param host_ref: Putative host / microbial genomes, given as a KBase '#/#/#' object reference
    :param virus_ref: Viral genomes, given as a KBase '#/#/#' object reference
    :param shared_folder: KBase job node's "working" directory, where the actual files live
    :param callback:
    :param workspace: Workspace service URL (a URL, not a workspace name, is expected here)
    :param token: Job token
    :return:
    """

    dfu = DataFileUtil(callback, token=token)

    ws = Workspace(workspace, token=token)

    mgu = MetagenomeUtils(callback, token=token)

    au = AssemblyUtil(callback, token=token)

    # Need to determine KBase type in order to know how to properly proceed
    host_type = ws.get_object_info3({'objects': [{
        'ref': host_ref
    }]})['infos'][0][2].split('-')[0]
    virus_type = ws.get_object_info3({'objects': [{
        'ref': virus_ref
    }]})['infos'][0][2].split('-')[0]

    logging.info(f'Potential hosts identified as: {host_type}')
    logging.info(f'Viruses identified as: {virus_type}')

    # Create new directory to house virus and host files
    host_dir = Path(shared_folder) / 'host_files'
    if not host_dir.exists():
        os.mkdir(host_dir)

    host_count = 0

    if host_type == 'KBaseGenomeAnnotations.Assembly':  # No info about individual genomes, so treat each as organism
        host_fps = au.get_assembly_as_fasta(
            {'ref':
             host_ref})['path']  # Consists of dict: path + assembly_name

        logging.info(
            f'Identified {host_type}. Each sequence will be treated as a separate organism.'
        )

        records = SeqIO.parse(host_fps, 'fasta')

        for record in records:
            host_count += 1
            tmp_fp = host_dir / f'{record.id}.fasta'  # TODO Illegal filenames?
            SeqIO.write([record], tmp_fp, 'fasta')

    elif host_type == 'KBaseGenomes.Genome':  # the versionless type string is singular
        genome_data = ws.get_objects2({'objects': [{
            'ref': host_ref
        }]})['data'][0]['data']
        assembly_ref = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        # fetch the genome's assembly via a ref path and treat it as one organism
        host_fp = au.get_assembly_as_fasta({'ref': f'{host_ref};{assembly_ref}'})['path']
        shutil.copyfile(host_fp, host_dir / (Path(host_fp).stem + '.fasta'))
        host_count += 1

    # elif host_type == 'KBaseSets.GenomeSet'

    elif host_type == 'KBaseSets.AssemblySet':
        obj_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]

        for subobj in obj_data['data']['items']:
            host_fp = au.get_assembly_as_fasta({'ref': subobj['ref']})['path']

            if os.path.splitext(host_fp)[-1] != '.fasta':
                # Ensure the extension is always .fasta (splitext keeps the dot)
                target_fn = os.path.splitext(
                    os.path.basename(host_fp))[0].strip('_') + '.fasta'
            else:
                target_fn = os.path.basename(host_fp).strip('_')

            shutil.copyfile(host_fp, host_dir / target_fn)
            host_count += 1

    elif host_type == 'KBaseMetagenomes.BinnedContigs':  # This is what we want!
        host_kbase_dir = mgu.binned_contigs_to_file({
            'input_ref': host_ref,
            'save_to_shock': 0
        })['bin_file_directory']  # Dict of bin_file_dir and shock_id

        for (dirpath, dirnames, fns) in os.walk(
                host_kbase_dir):  # dirnames = all folders under dirpath
            for fn in fns:
                src = Path(dirpath) / fn  # copy from the name that exists on disk
                if os.path.splitext(fn)[-1] != '.fasta':
                    fn = os.path.splitext(fn)[0] + '.fasta'  # normalize the extension
                shutil.copy(src, host_dir / fn)
                host_count += 1

    else:
        raise ValueError(f'{host_type} is not supported.')

    logging.info(f'{host_count} potential host genomes were identified.')

    virus_count = 0

    if virus_type == 'KBaseGenomeAnnotations.Assembly':
        virus_fps = au.get_assembly_as_fasta({'ref': virus_ref})['path']

        records = SeqIO.parse(virus_fps, 'fasta')
        virus_count = len(list(records))

        # for record in records:
        #     virus_count += 1
        # tmp_fp = virus_dir / f'{record.id}.fasta'
        # SeqIO.write([record], tmp_fp, 'fasta')

    else:
        raise ValueError(f'{virus_type} is not supported.')

    logging.info(f'{virus_count} potential viral genomes were identified.')

    # TODO Do we even need any of this data? We don't care about what the sequences are called

    # host_data = dfu.get_objects({'object_refs': [host_ref]})['data'][0]
    # virus_data = dfu.get_objects({'object_refs': [virus_ref]})['data'][0]

    return host_dir, virus_fps
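
A hedged usage sketch for process_kbase_objects; the refs are placeholders, and the
workspace argument is the Workspace service URL rather than a workspace name:

import os

host_dir, virus_fps = process_kbase_objects(
    host_ref='123/4/5',                  # placeholder BinnedContigs UPA
    virus_ref='123/8/9',                 # placeholder Assembly UPA
    shared_folder='/kb/module/work/tmp',
    callback=os.environ['SDK_CALLBACK_URL'],
    workspace='https://kbase.us/services/ws',
    token=os.environ['KB_AUTH_TOKEN'])
# host_dir holds one .fasta per putative host genome; virus_fps is the viral fasta path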
Example No. 22
0
class TypeToFasta:

    def __init__(self, callback_url, scratch, ws_url):
        self.ws_url = ws_url
        self.callback_url = callback_url
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.mgu = MetagenomeUtils(callback_url)

    def type_to_fasta(self, ctx, ref_lst):

        fasta_dict = dict()
        fasta_array = []
        atf = AssemblyToFasta(self.callback_url, self.scratch)

        # Get type info for each ref in ref_lst
        for idx, ref in enumerate(ref_lst):

            upas = []
            obj = {"ref": ref}
            obj_info = self.ws_url.get_object_info3({"objects": [obj]})
            obj_type = obj_info["infos"][0][2]

            # From type info get object
            if 'KBaseSets.GenomeSet' in obj_type:
                obj_data = self.dfu.get_objects({"object_refs": [ref]})['data'][0]
                upas = [gsi['ref'] for gsi in obj_data['data']['items']]
            elif 'KBaseSearch.GenomeSet' in obj_type:
                obj_data = self.dfu.get_objects({"object_refs": [ref]})['data'][0]
                upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
            elif "KBaseGenomes.Genome" in obj_type:
                upas = [ref]

            elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
                faf = [atf.assembly_as_fasta(ctx, obj)]
                fasta_array.extend([faf[0]['path'], ref])

            elif "KBaseSets.AssemblySet" in obj_type:
                fasta_paths = []

                obj_data = self.dfu.get_objects({"object_refs": [ref]})['data'][0]

                for item_upa in obj_data['data']['items']:
                    faf = [atf.assembly_as_fasta(ctx, {"ref": item_upa['ref']})]
                    fasta_paths.extend([faf[0]['path'], item_upa['ref']])
                    fasta_array = fasta_paths

            elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
                bin_file_dir = self.mgu.binned_contigs_to_file({'input_ref': ref,
                                                                'save_to_shock': 0})['bin_file_directory']
                for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
                    for fasta_file in filenames:
                        fasta_path = os.path.join(self.scratch, fasta_file)
                        copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                        # extend the shared array so earlier results are preserved
                        fasta_array.extend([fasta_path, ref])
                    break

            if upas:
                for genome_upa in upas:
                    genome_data = self.ws_url.get_objects2({'objects': [{"ref": genome_upa}]})['data'][0]['data']
                    assembly_upa = genome_upa + ';' + str(genome_data.get('contigset_ref') or genome_data.get('assembly_ref'))
                    faf = atf.assembly_as_fasta(ctx, {'ref': assembly_upa})
                    fasta_array.extend([faf['path'], assembly_upa])

        # return a dictionary with a flat [path, upa, path, upa, ...] list under 'FASTA'
        fasta_dict["FASTA"] = fasta_array

        return fasta_dict
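
Finally, a short sketch of how TypeToFasta might be driven. Despite its ws_url name,
the third constructor argument is used as a Workspace client (get_object_info3 is
called on it), so a client instance is passed; ctx, token, and the refs are placeholders:

ws_client = Workspace('https://kbase.us/services/ws', token=token)
ttf = TypeToFasta(callback_url, '/kb/module/work/tmp', ws_client)
fasta_dict = ttf.type_to_fasta(ctx, ['123/4/5', '123/6/7'])
# fasta_dict['FASTA'] alternates fasta path and source UPA: [path1, upa1, path2, upa2, ...]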