Code example #1
File: save.py    Project: man4ish/MotifUtils
class MotifSaver:
    def __init__(self, callback, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def saveMotifSet(self, motifset, params):
        if isinstance(motifset, list):
            logging.info('Saving multiple motifset objects...')
            # TODO: accept lists of constructed motif set object
            # TODO: check if list is a save_objects list or list of motifsets process accordingly
            # TODO: accept list of object names
            obj = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['ws_name']),
                'objects': [{
                    'type': 'KBaseGeneRegulation.MotifSet',
                    'data': motifset[0],
                    'name': str(uuid.uuid4())
                }]
            })[0]

            return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])
        elif isinstance(motifset, dict):
            logging.info('Saving a single motifset object...')
            # TODO: accept object name
            obj = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['ws_name']),
                'objects': [{
                    'type': 'KBaseGeneRegulation.MotifSet',
                    'data': motifset,
                    'name': str(uuid.uuid4())
                }]
            })[0]

            return str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4])
        else:
            raise ValueError(
                'Input to motif saver should be either:\n'
                '1. a list of constructed KBaseGeneRegulation.MotifSet objects (dictionaries)\n'
                '2. a single KBaseGeneRegulation.MotifSet object (dictionary)')
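The str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4]) expression above builds a workspace reference from the object-info tuple returned by DataFileUtil.save_objects. As a reference sketch (the index layout below follows the standard KBase Workspace object_info ordering and is not part of the original file):

# object_info tuple returned per saved object by dfu.save_objects:
#   [0] object id   [1] object name   [2] type string   [3] save date
#   [4] version     [5] saved by      [6] workspace id  [7] workspace name
#   [8] checksum    [9] size          [10] metadata
# so str(obj[6]) + "/" + str(obj[0]) + "/" + str(obj[4]) yields "ws_id/obj_id/version".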
Code example #2
    def test_AssemblySet_input(self):

        # Initiate empty data dictionaries and get data_util
        dfu = DataFileUtil(self.callback_url)
        assembly_dict = dict()
        assembly_set_dict = dict()
        dfu_dict = dict()
        dfu_dict_2 = dict()
        # Get workspace id and name
        wsName = self.getWsName()
        ws_id = dfu.ws_name_to_id(wsName)

        # FASTA to assembly object
        Fasta_assembly_dict = {
            "path": "/kb/module/work/tmp/NC_021490.fasta",
            "assembly_name": "test_assembly"
        }
        params = {
            "file": Fasta_assembly_dict,
            "workspace_name": wsName,
            "assembly_name": "test_assembly"
        }
        ref = self.getImpl().save_assembly_from_fasta(self.ctx, params)

        # Create assembly data dictionaries
        assembly_dict.update({"label": "assemblySetTest", "ref": ref[0]})
        assembly_set_dict.update({
            "description": " ",
            "items": [assembly_dict]
        })
        # Create DataFileUtil dictionaries
        dfu_dict.update({
            "type": "KBaseSets.AssemblySet",
            "data": assembly_set_dict,
            "name": "Assembly_Test"
        })
        dfu_dict_2.update({'id': ws_id, 'objects': [dfu_dict]})

        # Create assembly set object
        assembly_set_obj = dfu.save_objects(dfu_dict_2)
        assembly_set_ref = [
            str(assembly_set_obj[0][6]) + '/' + str(assembly_set_obj[0][0]) +
            '/' + str(assembly_set_obj[0][4])
        ]

        # Get FASTA
        ret = self.getImpl().get_fastas(self.callback_url, assembly_set_ref)
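The test above assembles the save_objects argument through several dict.update calls; the equivalent single literal, shown here only as a readability sketch built from the same values used in the test:

dfu.save_objects({
    'id': ws_id,
    'objects': [{
        'type': 'KBaseSets.AssemblySet',
        'data': {
            'description': ' ',
            'items': [{'label': 'assemblySetTest', 'ref': ref[0]}]
        },
        'name': 'Assembly_Test'
    }]
})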
Code example #3
    def UploadFromMdscan(self, callback_url, params):
        """
          :param params: instance of type "UploadmfmdInParams" -> structure:
             parameter "path" of String, parameter "ws_name" of String,
             parameter "obj_name" of String
          :returns: instance of type "UploadOutput" -> structure: parameter
             "obj_ref" of String
          """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFromMdscan
        print('Extracting motifs')
        motifList = self.parse_mdscan_output(params['path'])
        print(motifList)

        MSO = motifList  # MotifSet data parsed from the MDscan output

        dfu = DataFileUtil(callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #exit("test")
        #END UploadFromMdscan

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFromMdscan return value ' +
                             'output is not type dict as required.')

        # return the results
        return [output]
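A hedged usage sketch for UploadFromMdscan; impl and callback_url are assumed to be in scope, and the path, workspace, and object names are placeholders:

# Hypothetical values for illustration only.
params = {
    'path': '/kb/module/work/tmp/mdscan_output.txt',  # placeholder MDscan output path
    'ws_name': 'my_workspace',                        # placeholder workspace name
    'obj_name': 'mdscan_motifs'                       # placeholder object name
}
output = impl.UploadFromMdscan(callback_url, params)[0]
print(output['obj_ref'])  # "ws_id/obj_id/version" reference to the saved MotifSet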
Code example #4
File: roary_report.py    Project: psdehal/Roary
def upload_pangenome(cb_url, scratch, Pangenome, workspace_name,
                     pangenome_name):
    """
    params:
        cb_url         : callback url
        scratch        : scratch folder path
        Pangenome      : KBaseGenomes.Pangenome-like object (dict)
        workspace_name : workspace name or numeric workspace id
        pangenome_name : Pangenome display name
    Returns:
        pangenome_ref  : Pangenome workspace reference
        pangenome_info : object_info tuple for the saved Pangenome
    """
    dfu = DataFileUtil(cb_url)
    meta = {}
    hidden = 0

    # dump pangenome to scratch for upload
    # data_path = os.path.join(scratch, pangenome_name + '.json')
    # json.dump(pangenome, open(data_path, 'w'))

    if isinstance(workspace_name, int) or workspace_name.isdigit():
        workspace_id = workspace_name
    else:
        workspace_id = dfu.ws_name_to_id(workspace_name)

    save_params = {
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseGenomes.Pangenome',
            'data': Pangenome,
            'name': pangenome_name,
            'meta': meta,
            'hidden': hidden
        }]
    }

    info = dfu.save_objects(save_params)[0]

    ref = "{}/{}/{}".format(info[6], info[0], info[4])
    print("Pangenome saved to {}".format(ref))

    return {'pangenome_ref': ref, 'pangenome_info': info}
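A minimal call sketch for upload_pangenome; the pangenome dict, callback URL, and names are placeholders that would normally come from the surrounding Roary pipeline:

# Placeholder inputs for illustration only.
result = upload_pangenome(
    cb_url=callback_url,               # SDK callback URL (assumed in scope)
    scratch='/kb/module/work/tmp',     # scratch folder
    Pangenome=pangenome_dict,          # KBaseGenomes.Pangenome-like dict (assumed built upstream)
    workspace_name='my_workspace',     # workspace name or numeric id
    pangenome_name='roary_pangenome')
print(result['pangenome_ref'])         # "ws_id/obj_id/version"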
Code example #5
    def test_metagenome_binned_input(self):

        # Setup
        path = "data/binnedContigs.json"
        ws_path = '/kb/module/work/tmp'
        assembly_path = "data/CCESR16_SPAdes.assembly.fa"
        shutil.copy2(path, ws_path)
        shutil.copy2(assembly_path, ws_path)
        dfu = DataFileUtil(self.callback_url)
        wsName = self.getWsName()
        ws_id = dfu.ws_name_to_id(wsName)

        # FASTA to assembly object
        Fasta_assembly_dict = {
            "path": '/kb/module/work/tmp/CCESR16_SPAdes.assembly.fa',
            "assembly_name": "meta_assembly"
        }
        assembly_params = {
            "file": Fasta_assembly_dict,
            "workspace_name": wsName,
            "assembly_name": "test_assembly"
        }
        meta_assembly_ref = self.getImpl().save_assembly_from_fasta(
            self.ctx, assembly_params)[0]

        # Load the binned contigs JSON, attach the assembly ref, and build the save_objects input
        meta_data = json.load(open(path))
        meta_data['assembly_ref'] = meta_assembly_ref
        meta_dict = [{
            'name': 'Meta_test',
            'type': 'KBaseMetagenomes.BinnedContigs',
            'data': meta_data
        }]

        # Create BinnedContigs object in workspace with save_objects
        binned_obj = dfu.save_objects({'id': ws_id, 'objects': meta_dict})

        binned_obj_info = binned_obj[0]
        binned_obj_ref = str(binned_obj_info[6]) + '/' + str(
            binned_obj_info[0]) + '/' + str(binned_obj_info[4])

        # Get FASTA
        ret = self.getImpl().get_fastas(self.callback_url, [binned_obj_ref])
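The tests above rebuild the "ws_id/obj_id/version" string by hand each time; a small helper along these lines (not part of the original tests) would factor out that repetition:

def obj_info_to_ref(info):
    """Build a workspace reference from a save_objects object_info tuple."""
    return "{}/{}/{}".format(info[6], info[0], info[4])  # ws_id/obj_id/version

binned_obj_ref = obj_info_to_ref(binned_obj[0])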
Code example #6
class PDBUtil:

    def _validate_import_pdb_file_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        # check for required parameters
        for p in ['structure_name', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file(
                {'shock_id': params['input_shock_id'],
                 'file_path': self.scratch}).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file(
                        {'staging_file_subdir_path': params.get('input_staging_file_path')}
                        ).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params.get('workspace_name'), params.get('structure_name')

    def _file_to_data(self, file_path):
        """Do the PDB conversion"""
        pdb1 = file_path
        structure = parser.get_structure("test", pdb1)
        model = structure[0]
        chain_no = 0
        res_no = 0
        atom_no = 0
        pp_list = []
        pp_no = 0
        # Count chains across all models; residue and atom counts use the model
        # left in `model` after the loop (typically the only model in the file).
        for model in structure:
            for chain in model:
                chain_no += 1
        for residue in model.get_residues():
            if PDB.is_aa(residue):
                res_no += 1
            for atom in residue.get_atoms():
                atom_no += 1

        # Build the protein sequence from the peptides assembled by the polypeptide builder
        for pp in ppb.build_peptides(structure):
            pp_no += 1
            my_seq = pp.get_sequence()
            pp_list += str(my_seq)
        seq = ''.join(pp_list)
        return {
            'name': os.path.basename(file_path),
            'num_chains': chain_no,
            'num_residues': res_no,
            'num_atoms': atom_no,
            'protein': {
                'id': os.path.basename(file_path),
                'sequence': seq,
                'md5': hashlib.md5(seq.encode()).hexdigest()
            },
        }

    def _get_pdb_shock_id(self, obj_ref):
        """Return the shock id for the PDB file"""
        obj_data = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]['data']
        return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['handle']['hid']

        return shock_id

    def _generate_html_report(self, header_str, table_str):
        #TODO: make this work with the PDB viewer

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'templates', 'viewer_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('//HEADER_STR', header_str)
                report_template = report_template.replace('//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Search Matrix App'})

        return html_report

    def _generate_report(self, pdb_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """
        # included as an example. Replace with your own implementation
        # output_html_files = self._generate_html_report(header_str, table_str)

        report_params = {'message': 'You uploaded a PDB file!',
                         #'html_links': output_html_files,
                         #'direct_html_link_index': 0,
                         'objects_created': [{'ref': pdb_obj_ref,
                                              'description': 'Imported PDB'}],
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_pdb_from_staging_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])

    def import_model_pdb_file(self, params):

        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path)
        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        logging.info(data)

        info = self.dfu.save_objects({
            'id': workspace_id,
            'objects': [
                {'type': 'KBaseStructure.ModelProteinStructure',
                 'name': pdb_name,
                 'data': data}]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'structure_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def export_pdb(self, params):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def structure_to_pdb_file(self, params):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id': shock_id,
            'file_path': params['destination_dir'],
            'unpack': 'uncompress'
        })['file_path']

        return {'file_path': file_path}
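A hedged sketch of driving PDBUtil end to end; the config keys mirror those read in __init__, while callback_url, token, handle_service_url, the PDB path, and the object names are placeholders assumed to be supplied by the caller:

# Placeholder configuration and parameters for illustration only.
config = {
    'SDK_CALLBACK_URL': callback_url,
    'scratch': '/kb/module/work/tmp',
    'KB_AUTH_TOKEN': token,
    'handle-service-url': handle_service_url
}
pdb_util = PDBUtil(config)
result = pdb_util.import_model_pdb_file({
    'input_file_path': '/kb/module/work/tmp/example.pdb',  # placeholder PDB file
    'workspace_name': 'my_workspace',
    'structure_name': 'my_structure',
    'description': 'example import'
})
print(result['structure_obj_ref'], result['report_name'], result['report_ref'])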
Code example #7
File: plant_fbaImpl.py    Project: kbaseapps/plant_fba
class plant_fba:
    '''
    Module Name:
    plant_fba

    Module Description:
    A KBase module: plant_fba
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.1.1"
    GIT_URL = "[email protected]:kbaseapps/plant_fba.git"
    GIT_COMMIT_HASH = "6f0b5af5a458c5158b9f0007399653a256edcd14"

    #BEGIN_CLASS_HEADER

    def convert_search_role(self, role):

        searchrole = role

        #Remove spaces
        searchrole = searchrole.strip()
        searchrole = searchrole.replace(' ', '')

        #Make all lowercase
        searchrole = searchrole.lower()

        #Remove EC and parentheses
        searchrole = re.sub(r'\(ec[\d-]+\.[\d-]\.[\d-]\.[\d-]\)', '',
                            searchrole)

        return searchrole
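
    # Sketch (not in the original source): with the normalization above,
    #   convert_search_role("Hexokinase (EC 2.7.1.1)")
    # strips spaces, lowercases, and removes the "(ec...)" suffix, returning
    # "hexokinase".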

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.shared_folder = config['scratch']
        self.config = config
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def integrate_abundances_with_metabolism(self, ctx, input_params):
        """
        :param input_params: instance of type "IntegrateAbundancesParams"
           (@optional input_columns) -> structure: parameter "input_ws" of
           String, parameter "input_expression_matrix" of String, parameter
           "input_fbamodel" of String, parameter "input_columns" of String,
           parameter "output_reaction_matrix" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output_report
        #BEGIN integrate_abundances_with_metabolism

        app = IntegrateAppImpl(self.config, ctx, input_params)
        output_report = app.integrate_abundances_with_metabolism()

        #END integrate_abundances_with_metabolism

        # At some point might do deeper type checking...
        if not isinstance(output_report, dict):
            raise ValueError(
                'Method integrate_abundances_with_metabolism return value ' +
                'output_report is not type dict as required.')
        # return the results
        return [output_report]

    def reconstruct_plant_metabolism(self, ctx, input_params):
        """
        :param input_params: instance of type "ReconstructMetabolismParams"
           -> structure: parameter "input_ws" of String, parameter
           "input_genome" of String, parameter "output_ws" of String,
           parameter "output_fbamodel" of String, parameter "template" of
           String, parameter "template_ws" of String
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output_report
        #BEGIN reconstruct_plant_metabolism

        #Compile biochemistry information
        abbrev_cpt_dict = dict()
        cpt_name_dict = dict()
        with open('/kb/module/data/compartments.txt') as fh:
            for line in fh.readlines():
                line = line.strip('\r\n')
                array = line.split('\t')
                abbrev_cpt_dict[array[3]] = array[0]
                cpt_name_dict[array[0]] = array[2]

        # Fetch and parse biochemistry data
        with open(
                os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry",
                             "reactions.json")) as msd_rxn_fh:
            MSD_reactions = json.load(msd_rxn_fh)
        MSD_reactions_dict = dict()
        for entry in MSD_reactions:
            MSD_reactions_dict[entry['id']] = entry

        with open(
                os.path.join("/kb/module/ModelSEEDDatabase", "Biochemistry",
                             "compounds.json")) as msd_rxn_fh:
            MSD_compounds = json.load(msd_rxn_fh)
        MSD_compounds_dict = dict()
        for entry in MSD_compounds:
            MSD_compounds_dict[entry['id']] = entry

        # Retrieve Template, and compile indexes of roles and complexes
        if ('template_ws' not in input_params
                or input_params['template_ws'] == ''):
            input_params['template_ws'] = 'NewKBaseModelTemplates'

        if ('template' not in input_params or input_params['template'] == ''):
            input_params['template'] = 'PlantModelTemplate'

        template_ref = input_params['template_ws'] + '/' + input_params[
            'template']
        template_obj = self.dfu.get_objects({'object_refs':
                                             [template_ref]})['data'][0]

        searchroles_dict = dict()
        roles_dict = dict()
        for role in template_obj['data']['roles']:
            searchrole = self.convert_search_role(role['name'])
            searchroles_dict[searchrole] = role['id']
            roles_dict[role['id']] = role

        complex_dict = dict()
        for cpx in template_obj['data']['complexes']:
            complex_dict[cpx['id']] = cpx

        #Retrieve Genome annotation as dict
        role_cpt_ftr_dict = dict()
        genome_ref = input_params['input_ws'] + '/' + input_params[
            'input_genome']
        genome_obj = self.dfu.get_objects({'object_refs':
                                           [genome_ref]})['data'][0]
        for feature in genome_obj['data']['features']:
            if ('functions' in feature and len(feature['functions']) > 0):
                for function_comment in feature['functions']:

                    #Split for comments and retrieve compartments
                    function_cpt_list = function_comment.split("#")
                    for i in range(len(function_cpt_list)):
                        function_cpt_list[i] = function_cpt_list[i].strip()

                    function = function_cpt_list.pop(0)
                    roles = re.split(r"\s*;\s+|\s+[\@\/]\s+", function)
                    for role in roles:

                        searchrole = self.convert_search_role(role)
                        if (searchrole not in searchroles_dict):
                            continue

                        role_id = searchroles_dict[searchrole]

                        if (role_id not in role_cpt_ftr_dict):
                            role_cpt_ftr_dict[role_id] = dict()

                        #Defaults to cytosol
                        if (len(function_cpt_list) == 0):
                            function_cpt_list.append('cytosol')

                        for cpt in function_cpt_list:
                            abbrev_cpt = cpt
                            if (cpt not in abbrev_cpt_dict):
                                print(
                                    "No compartmental abbreviation found for "
                                    + cpt)
                            else:
                                abbrev_cpt = abbrev_cpt_dict[cpt]

                            if (abbrev_cpt not in role_cpt_ftr_dict[role_id]):
                                role_cpt_ftr_dict[role_id][abbrev_cpt] = dict()

                            role_cpt_ftr_dict[role_id][abbrev_cpt][
                                feature['id']] = 1

        #Default dictionaries for objects needed for a model reaction
        default_mdlcpt_dict = {
            'id': 'u0',
            'label': 'unknown',
            'pH': 7,
            'potential': 0,
            'compartmentIndex': 0,
            'compartment_ref': '~//'
        }

        default_mdlcpd_dict = {
            'id': '',
            'charge': 0,
            'formula': '',
            'name': '',
            'compound_ref': '',
            'modelcompartment_ref': '~/modelcompartments/id/u0'
        }

        default_mdlrxn_dict = {
            'id': '',
            'direction': '',
            'protons': 0,
            'name': '',
            'reaction_ref': '',
            'probability': 0,
            'modelcompartment_ref': '',
            'modelReactionReagents': [],
            'modelReactionProteins': []
        }

        #Lookup dictionaries for compartments and compounds, to avoid duplicating them
        mdlcpts_dict = dict()
        mdlcpds_dict = dict()

        #Reaction complexes for the generated table
        rxncplxs_dict = dict()

        #Create New, but Empty Plant Reconstruction
        new_model_obj = {
            'id': input_params['output_fbamodel'],
            'type': "GenomeScale",
            'source': "KBase",
            'source_id': "PlantSEED_v2",
            'template_ref': template_ref,
            'genome_ref': genome_ref,
            'name': input_params['output_fbamodel'],
            'modelreactions': [],
            'modelcompounds': [],
            'modelcompartments': [],
            'biomasses': [],
            'gapgens': [],
            'gapfillings': []
        }

        for template_rxn in template_obj['data']['reactions']:
            if (template_rxn['type'] == 'gapfilling'):
                continue

            template_rxn_cpt = template_rxn['templatecompartment_ref'].split(
                '/')[-1]

            proteins_list = list()
            prots_str_list = list()
            #complex_ref and source are optional fields
            default_protein_dict = {
                'note': template_rxn['type'],
                'complex_ref': '',
                'modelReactionProteinSubunits': []
            }
            for cpx_ref in template_rxn['templatecomplex_refs']:
                cpx_id = cpx_ref.split('/')[-1]
                model_complex_ref = "~/template/complexes/id/" + cpx_id

                new_protein_dict = copy.deepcopy(default_protein_dict)
                new_protein_dict['complex_ref'] = model_complex_ref

                complex_present = False
                subunits_list = list()
                default_subunit_dict = {
                    'role': '',
                    'triggering': 0,
                    'optionalSubunit': 0,
                    'note': '',
                    'feature_refs': []
                }
                matched_role_dict = dict()

                for cpxrole in complex_dict[cpx_id]['complexroles']:
                    role_id = cpxrole['templaterole_ref'].split('/')[-1]

                    if (role_id in role_cpt_ftr_dict):

                        for role_cpt in role_cpt_ftr_dict[role_id]:
                            role_cpt_present = False
                            if (template_rxn_cpt == role_cpt
                                    and cpxrole['triggering'] == 1):
                                complex_present = True
                                role_cpt_present = True

                            if (role_cpt_present == True):
                                new_subunit_dict = copy.deepcopy(
                                    default_subunit_dict)
                                new_subunit_dict['triggering'] = cpxrole[
                                    'triggering']
                                new_subunit_dict['optionalSubunit'] = cpxrole[
                                    'optional_role']
                                new_subunit_dict['role'] = roles_dict[role_id][
                                    'name']

                                if (len(roles_dict[role_id]['features']) > 0):
                                    new_subunit_dict[
                                        'note'] = 'Features characterized and annotated'
                                else:
                                    #This never happens as of Fall 2019
                                    print("Warning: " +
                                          roles_dict[role_id]['name'] +
                                          " is apparently uncharacterized!")
                                    new_subunit_dict[
                                        'note'] = 'Features uncharacterized but annotated'
                                    pass

                                for ftr in role_cpt_ftr_dict[role_id][
                                        role_cpt]:
                                    feature_ref = "~/genome/features/id/" + ftr
                                    new_subunit_dict['feature_refs'].append(
                                        feature_ref)

                                matched_role_dict[role_id] = 1
                                subunits_list.append(new_subunit_dict)

                    if (role_id not in role_cpt_ftr_dict
                            and template_rxn['type'] == 'universal'):
                        #This should still be added, with zero features to indicate the universality of the role in plant primary metabolism
                        new_subunit_dict = copy.deepcopy(default_subunit_dict)
                        new_subunit_dict['triggering'] = cpxrole['triggering']
                        new_subunit_dict['optionalSubunit'] = cpxrole[
                            'optional_role']
                        new_subunit_dict['role'] = roles_dict[role_id]['name']

                        #Un-necessary, but explicitly stated
                        new_subunit_dict['feature_refs'] = []

                        if (len(roles_dict[role_id]['features']) == 0):
                            new_subunit_dict[
                                'note'] = 'Features uncharacterized and unannotated'
                        else:
                            #As of Fall 2019, this includes two reactions
                            new_subunit_dict[
                                'note'] = "Features characterized but unannotated"
                            print("Missing annotation: ", cpx_id, role_id,
                                  roles_dict[role_id])

                        matched_role_dict[role_id] = 1
                        subunits_list.append(new_subunit_dict)

                if (complex_present == True):
                    #Check to see if members of a detected protein complex are missing
                    #and add them if so, to round off the complex
                    #This will only happen to a complex that is conditional (see above)
                    for cpxrole in complex_dict[cpx_id]['complexroles']:
                        role_id = cpxrole['templaterole_ref'].split('/')[-1]

                        if (role_id not in matched_role_dict):
                            print("Gapfilling complex: ", cpx_id,
                                  roles_dict[role_id])
                            new_subunit_dict = copy.deepcopy(
                                default_subunit_dict)
                            new_subunit_dict['triggering'] = cpxrole[
                                'triggering']
                            new_subunit_dict['optionalSubunit'] = cpxrole[
                                'optional_role']
                            new_subunit_dict[
                                'note'] = "Complex-based-gapfilling"
                            subunits_list.append(new_subunit_dict)

                if (len(subunits_list) > 0):
                    new_protein_dict[
                        'modelReactionProteinSubunits'] = subunits_list

                    #Store features and subunits as complex string for table
                    subs_str_list = list()
                    for subunit in subunits_list:
                        ftrs_str_list = list()
                        for ftr_ref in subunit['feature_refs']:
                            ftr = ftr_ref.split('/')[-1]
                            ftrs_str_list.append(ftr)
                        ftr_str = "(" + ", ".join(ftrs_str_list) + ")"
                        subs_str_list.append(ftr_str)
                    sub_str = "[" + ", ".join(subs_str_list) + "]"
                    prots_str_list.append(sub_str)

                proteins_list.append(new_protein_dict)

            prot_str = ", ".join(prots_str_list)

            #This is important, we need to use role-based annotation to determine whether
            #a reaction should even be added to the model
            if (template_rxn['type'] == 'conditional'
                    and len(proteins_list) == 0):
                continue

            #If the check passes, then, here, we instantiate the actual reaction that goes into the model
            new_mdlrxn_id = template_rxn['id'] + '0'
            new_mdlcpt_id = template_rxn_cpt + '0'
            base_rxn_id = template_rxn['id'].split('_')[0]

            #For table
            rxncplxs_dict[new_mdlrxn_id] = prot_str

            new_mdlrxn_dict = copy.deepcopy(default_mdlrxn_dict)
            new_mdlrxn_dict['id'] = new_mdlrxn_id

            new_mdlrxn_dict['name'] = MSD_reactions_dict[base_rxn_id][
                'abbreviation']
            if (MSD_reactions_dict[base_rxn_id]['abbreviation'] == ""):
                new_mdlrxn_dict['name'] = base_rxn_id

            new_mdlrxn_dict['direction'] = template_rxn['direction']
            new_mdlrxn_dict[
                'reaction_ref'] = '~/template/reactions/id/' + template_rxn[
                    'id']
            new_mdlrxn_dict[
                'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id

            #Here we check and instantiate a new modelcompartment
            if (new_mdlcpt_id not in mdlcpts_dict):
                new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict)
                new_mdlcpt_dict['id'] = new_mdlcpt_id
                new_mdlcpt_dict['label'] = cpt_name_dict[template_rxn_cpt]
                new_mdlcpt_dict[
                    'compartment_ref'] = '~/template/compartments/id/' + template_rxn_cpt
                mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict

            #Add Proteins as previously determined
            new_mdlrxn_dict['modelReactionProteins'] = proteins_list

            #Add Reagents
            for template_rgt in template_rxn['templateReactionReagents']:
                template_rgt_cpd_cpt_id = template_rgt[
                    'templatecompcompound_ref'].split('/')[-1]
                (template_rgt_cpd,
                 template_rgt_cpt) = template_rgt_cpd_cpt_id.split('_')

                #Check and add new model compartment
                new_mdlcpt_id = template_rgt_cpt + '0'
                if (new_mdlcpt_id not in mdlcpts_dict):
                    new_mdlcpt_dict = copy.deepcopy(default_mdlcpt_dict)
                    new_mdlcpt_dict['id'] = new_mdlcpt_id
                    new_mdlcpt_dict['label'] = cpt_name_dict[template_rgt_cpt]
                    new_mdlcpt_dict[
                        'compartment_ref'] = '~/template/compartments/id/' + template_rgt_cpt
                    mdlcpts_dict[new_mdlcpt_id] = new_mdlcpt_dict

                #Add new model compounds
                new_mdlcpd_id = template_rgt_cpd_cpt_id + '0'
                base_cpd_id = template_rgt_cpd_cpt_id.split('_')[0]

                if (new_mdlcpd_id not in mdlcpds_dict):
                    new_mdlcpd_dict = copy.deepcopy(default_mdlcpd_dict)
                    new_mdlcpd_dict['id'] = new_mdlcpd_id
                    new_mdlcpd_dict['name'] = MSD_compounds_dict[base_cpd_id][
                        'name']

                    new_mdlcpd_dict['charge'] = float(
                        MSD_compounds_dict[base_cpd_id]['charge'])
                    new_mdlcpd_dict['formula'] = MSD_compounds_dict[
                        base_cpd_id]['formula']
                    if(MSD_compounds_dict[base_cpd_id]['formula'] == "" or \
                           MSD_compounds_dict[base_cpd_id]['formula'] is None):
                        print("Formula: ", base_cpd_id,
                              MSD_compounds_dict[base_cpd_id])
                        new_mdlcpd_dict['formula'] = ""

                    new_mdlcpd_dict[
                        'compound_ref'] = '~/template/compounds/id/' + template_rgt_cpd
                    new_mdlcpd_dict[
                        'modelcompartment_ref'] = '~/modelcompartments/id/' + new_mdlcpt_id
                    mdlcpds_dict[new_mdlcpd_id] = new_mdlcpd_dict

                new_rgt_dict = {
                    'coefficient': template_rgt['coefficient'],
                    'modelcompound_ref': '~/modelcompounds/id/' + new_mdlcpd_id
                }

                new_mdlrxn_dict['modelReactionReagents'].append(new_rgt_dict)

            new_model_obj['modelreactions'].append(new_mdlrxn_dict)

        #Having populated the list of reactions (biomass to come), add all compartments and compounds
        for cpt_id in mdlcpts_dict:
            new_model_obj['modelcompartments'].append(mdlcpts_dict[cpt_id])

        #Last, but key modelcompound is the biomass, need to add it explicitly
        biocpd_id = "cpd11416"
        mdlbiocpd_dict = copy.deepcopy(default_mdlcpd_dict)
        mdlbiocpd_dict['id'] = biocpd_id + '_c0'
        mdlbiocpd_dict['name'] = 'Biomass'
        mdlbiocpd_dict['compound_ref'] = "~/template/compounds/id/" + biocpd_id
        mdlbiocpd_dict['modelcompartment_ref'] = "~/modelcompartments/id/c0"
        mdlcpds_dict[mdlbiocpd_dict['id']] = mdlbiocpd_dict

        for cpd_id in mdlcpds_dict:
            new_model_obj['modelcompounds'].append(mdlcpds_dict[cpd_id])

        default_biomass_dict = {
            'id': 'bio1',
            'name': 'Plant leaf biomass',
            'other': 1,
            'dna': 0,
            'rna': 0,
            'protein': 0,
            'cellwall': 0,
            'lipid': 0,
            'cofactor': 0,
            'energy': 0,
            'biomasscompounds': []
        }

        default_biocpd_dict = {'modelcompound_ref': '', 'coefficient': 0}

        for template_biomass in template_obj['data']['biomasses']:
            new_template_biomass = copy.deepcopy(default_biomass_dict)
            new_template_biomass['id'] = template_biomass['id']
            new_template_biomass['name'] = template_biomass['name']

            for entry in [
                    'dna', 'rna', 'protein', 'cellwall', 'lipid', 'cofactor',
                    'energy', 'other'
            ]:
                new_template_biomass[entry] = template_biomass[entry]

            for template_cpd in template_biomass['templateBiomassComponents']:
                new_biocpd_dict = copy.deepcopy(default_biocpd_dict)
                mdlcpd_id = template_cpd['templatecompcompound_ref'].split(
                    '/')[-1] + '0'
                if (mdlcpd_id not in mdlcpds_dict):
                    print("Missing: ", template_cpd)
                    continue
                new_biocpd_dict[
                    'modelcompound_ref'] = '~/modelcompounds/id/' + mdlcpd_id
                new_biocpd_dict['coefficient'] = template_cpd['coefficient']
                new_template_biomass['biomasscompounds'].append(
                    new_biocpd_dict)

            new_model_obj['biomasses'].append(new_template_biomass)

        print("Saving metabolic reconstruction")
        model_ws_object = {
            'type': 'KBaseFBA.FBAModel',
            'name': input_params['output_fbamodel'],
            'data': new_model_obj
        }

        if ('output_ws' not in input_params
                or input_params['output_ws'] == ''):
            input_params['output_ws'] = input_params['input_ws']

        ws_id = self.dfu.ws_name_to_id(input_params['output_ws'])
        saved_model_list = self.dfu.save_objects({
            'id': ws_id,
            'objects': [model_ws_object]
        })[0]

        #Compose report string
        html_string = "<html><head><title>Reconstruct Plant Metabolism Report</title></head><body>"
        html_string += "<h2>Reconstruct Plant Metabolism Report</h2>"
        html_string += "<p>The \"Reconstruct Plant Metabolism\" app has finished running, "
        html_string += "reconstructing the primary metabolism from the "
        html_string += "enzymatic annotations in " + input_params[
            'input_genome'] + "</p>"
        html_string += "<p>Below we present the table of compartmentalized reactions in the metabolic reconstruction, "
        html_string += "it is similar to what you can see in the FBAModel viewer widget that appears "
        html_string += "below the report, but it has some additional information. Each row in the table is unique "
        html_string += "to each combination of reaction and compartment.</p>"
        html_string += "<p><ul>"
        html_string += "<li><b>Subsystems and Classes:</b> The table contains the metabolic subsystems and "
        html_string += "the general class of metabolism they fall into.</li>"
        html_string += "<li><b>Metabolic functions and EC numbers:</b> The table contains the original enzymatic "
        html_string += "annotation ('Roles') and their EC numbers that were associated with each biochemical reaction.</li>"
        html_string += "<li><b>Complexes:</b> The table contains the genes that were annotated with the metabolic functions. "
        html_string += "These genes that are associated with each reaction can be seen in the FBAModel viewer widget, but here "
        html_string += " one can see how they may be organized into protein complexes. Each set of parentheses '()' "
        html_string += "represents a single protein subunit (which may be the entire enzyme, or part of a large enzymatic "
        html_string += "complex). Each set of square brackets '[]' represents an entire enzyme, regardless of how many "
        html_string += "subunits it consists of. Each reaction may be catalyzed by different enzymes, each in turn composed "
        html_string += "of different subunits. The complexes reflect how the enzymes were curated in <i>Arabidopsis thaliana</i> "
        html_string += " so if any complex is shown to be empty, this means that the enzymatic annotation was not propagated "
        html_string += "from the original Arabidopsis gene. The original Arabidopsis curation also included protein localization "
        html_string += "so if a reaction has empty complexes in some compartments as opposed to others, this is an indication "
        html_string += "that annotation was only propagated for some localized Arabidopsis enzymes, and not others."
        html_string += "</ul></p>"

        # Fetch PlantSEED Data
        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        reactions_data = plantseed.fetch_reactions(PS_Roles)

        table = GenerateTableImpl()
        table_html_string = table.generate_table(reactions_data,
                                                 complexes=rxncplxs_dict)

        with open(
                os.path.join(
                    '/kb/module/data', 'app_report_templates',
                    'integrate_abundances_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and insert html Title
        report_template_string = report_template_string.replace(
            '*TITLE*', input_params['output_fbamodel'])

        # Insert html table
        table_report_string = report_template_string.replace(
            '*TABLES*', html_string + table_html_string)

        #Make folder for report files
        uuid_string = str(uuid.uuid4())
        report_file_path = os.path.join(self.shared_folder, uuid_string)
        os.mkdir(report_file_path)

        #Write html files
        with open(os.path.join(report_file_path, "index.html"),
                  'w') as index_file:
            index_file.write(table_report_string)

        #Cache it in shock as an archive
        upload_info = self.dfu.file_to_shock({
            'file_path': report_file_path,
            'pack': 'zip'
        })

        #Prepare report parameters
        report_params = {
            'direct_html_link_index': 0,  # refers to an index of 'html_links'
            'workspace_name': input_params['input_ws'],
            'report_object_name': 'plant_fba_' + uuid_string,
            'objects_created': [],
            'html_links': []
        }

        #Html Link object
        html_link = {
            'shock_id': upload_info['shock_id'],
            'name': 'index.html',
            'label': 'html files',
            'description': 'HTML files'
        }
        report_params['html_links'].append(html_link)

        #Objects created object
        saved_model_ref = "{}/{}/{}".format(saved_model_list[6],
                                            saved_model_list[0],
                                            saved_model_list[4])
        saved_model_desc = "FBAModel: " + input_params['output_fbamodel']
        report_params['objects_created'].append({
            'ref': saved_model_ref,
            'description': saved_model_desc
        })

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        report_client_output = kbase_report_client.create_extended_report(
            report_params)

        output_report = dict()
        output_report['report_name'] = report_client_output['name']
        output_report['report_ref'] = report_client_output['ref']

        #END reconstruct_plant_metabolism

        # At some point might do deeper type checking...
        if not isinstance(output_report, dict):
            raise ValueError(
                'Method reconstruct_plant_metabolism return value ' +
                'output_report is not type dict as required.')
        # return the results
        return [output_report]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
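The annotation-parsing block in reconstruct_plant_metabolism splits each feature function on "#" to separate the role text from optional compartment comments, defaulting to the cytosol when none are given. A standalone sketch of that convention (the annotation string itself is a made-up example):

# Hypothetical annotation following the "function # compartment" convention parsed above.
function_comment = "Hexokinase (EC 2.7.1.1) # chloroplast # cytosol"
parts = [p.strip() for p in function_comment.split("#")]
function = parts.pop(0)               # "Hexokinase (EC 2.7.1.1)"
compartments = parts or ["cytosol"]   # remaining entries, defaulting to cytosol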
Code example #8
class VariationUtil:
    '''
    Module Name:
    VariationUtil

    Module Description:
    A KBase module: VariationUtil
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.4"
    GIT_URL = ""
    GIT_COMMIT_HASH = "2a4c2dbc058b702811c967997e7100c834e755d4"

    #BEGIN_CLASS_HEADER

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        # TODO: Make sure we need to define config just once
        # TODO: Change the code to match this style
        self.config = config
        self.config['SDK_CALLBACK_URL'] = os.environ['SDK_CALLBACK_URL']
        self.config['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.config['ws_url'] = config['workspace-url']

        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shared_folder = config['scratch']
        self.hr = htmlreportutils()
        self.ws_url = config['workspace-url']
        self.wsc = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.shock_url = config['shock-url']
        self.sw_url = config['srv-wiz-url']
        #END_CONSTRUCTOR
        pass

    def save_variation_from_vcf(self, ctx, params):
        """
        Save a variation (and trait?) object to KBase given a reference genome, object output name,
        Variant Call Format (VCF) file, and sample attribute file.
        :param params: instance of type "save_variation_input" (## funcdef
           save_variation_from_vcf ## required input params:
           genome_or_assembly_ref: KBaseGenomes.Genome or
           KBaseGenomeAnnotations.Assembly object reference *** variation
           input data *** vcf_staging_file_path: path to location data
           associated with samples variation_object_name: output name for
           KBase variation object *** sample input data ***
           sample_attribute_ref: x/y/z reference to kbase sample attribute
           optional params: NA output report: report_name report_ref HTML
           visualization: Manhattan plot *** Visualization *** plot_maf:
           generate histogram of minor allele frequencies plot_hwe: generate
           histogram of Hardy-Weinberg Equilibrium p-values) -> structure:
           parameter "workspace_name" of String, parameter
           "genome_or_assembly_ref" of type "obj_ref" (An X/Y/Z style
           reference), parameter "vcf_staging_file_path" of type "filepath"
           (KBase file path to staging files), parameter
           "variation_object_name" of String, parameter
           "sample_attribute_ref" of type "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "save_variation_output" -> structure:
           parameter "variation_ref" of String, parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: report
        #BEGIN save_variation_from_vcf

        # Get workspace id
        ws_id = self.dfu.ws_name_to_id(params['workspace_name'])

        genome_ref = None
        assembly_ref = None

        # 1) Find whether the input is a genome or assembly
        #    and get genome_ref and assembly_ref

        genome_or_assembly_ref = params['genome_or_assembly_ref']
        obj_type = self.wsc.get_object_info3(
            {'objects': [{
                'ref': genome_or_assembly_ref
            }]})['infos'][0][2]
        if ('KBaseGenomes.Genome' in obj_type):
            genome_ref = genome_or_assembly_ref
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])
            assembly_ref = subset[0]['data']['assembly_ref']
        elif ('KBaseGenomeAnnotations.Assembly' in obj_type):
            assembly_ref = genome_or_assembly_ref
        else:
            raise ValueError(obj_type +
                             ' is not the right input for this method. ' +
                             'Valid input include KBaseGenomes.Genome or ' +
                             'KBaseGenomeAnnotations.Assembly ')

        # 2)  Validate VCF, compress, and build VCF index
        logging.info("Validating VCF, Compressing VCF and Indexing VCF")
        VCFUtilsConfig = {"scratch": self.scratch}
        VCFUtilsParams = {
            'vcf_staging_file_path': params['vcf_staging_file_path']
        }
        VCU = VCFUtils(VCFUtilsConfig)
        vcf_compressed, vcf_index, vcf_strain_ids = VCU.validate_compress_and_index_vcf(
            VCFUtilsParams)

        if vcf_index is not None:
            logging.info("vcf compressed :" + str(vcf_compressed))
            logging.info("vcf index :" + str(vcf_index))
            logging.info("vcf strain ids :" + str(vcf_strain_ids))
        else:
            raise ValueError(
                "No result obtained after compression and indexing step")

        # Get strain info
        # TODO: Remove hard coded stuff
        StrainInfoConfig = self.config
        StrainInfoParams = {
            "ws_id": ws_id,
            "vcf_strain_ids": vcf_strain_ids,
            "sample_set_ref": params["sample_set_ref"],
            "sample_attribute_name": params["sample_attribute_name"]
        }
        si = StrainInfo(StrainInfoConfig)
        sample_attribute_ref, strains = si.sample_strain_info(StrainInfoParams)
        print(sample_attribute_ref)
        print(strains)

        # 3) Create json for variation object. In a following step genomic_indexes will be
        # added to this json before it is saved as Variation object

        VCFToVariationConfig = {"ws_url": self.ws_url, "scratch": self.scratch}
        VCFToVariationParams = {
            "vcf_compressed": vcf_compressed,
            "vcf_index": vcf_index,
            "assembly_ref": assembly_ref
        }
        if genome_ref is not None:
            VCFToVariationParams['genome_ref'] = genome_ref

        vtv = VCFToVariation(VCFToVariationConfig)
        variation_object_data = vtv.generate_variation_object_data(
            VCFToVariationParams)
        # Append sample information
        if sample_attribute_ref:
            variation_object_data['sample_attribute_ref'] = sample_attribute_ref
        else:
            raise ValueError('sample attribute ref not found')
        if strains:
            variation_object_data['strains'] = strains
        else:
            raise ValueError('strains not found')
        if 'sample_set_ref' in params:
            variation_object_data['sample_set_ref'] = params['sample_set_ref']
        else:
            raise ValueError('sample_set_ref not found in params')

        # 4)
        JbrowseConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
            "sw_url": self.sw_url,
            "shock_url": self.shock_url
        }
        JbrowseParams = {
            "vcf_path": vcf_compressed,
            "assembly_ref": assembly_ref,
            "binsize": 10000,
            "vcf_shock_id": variation_object_data['vcf_handle']['id'],
            "vcf_index_shock_id":
            variation_object_data['vcf_index_handle']['id']
        }
        if genome_ref is not None:
            JbrowseParams["genome_ref"] = genome_ref

        jb = JbrowseUtil(JbrowseConfig)
        jbrowse_report = jb.prepare_jbrowse_report(JbrowseParams)

        # 5) Now we have the genomic indices and we have all the information needed to save
        # the variation object
        # TODO: Take out the genomic_indexes field from the object spec
        #  TODO: Take out the vcf_handle stuff not needed

        variation_object_data['genomic_indexes'] = jbrowse_report[
            'genomic_indexes']

        var_obj = self.dfu.save_objects({
            'id': self.dfu.ws_name_to_id(params['workspace_name']),
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_object_data,
                'name': params['variation_object_name']
            }]
        })[0]

        var_obj_ref = str(var_obj[6]) + "/" + str(var_obj[0]) + "/" + str(var_obj[4])
        print(var_obj_ref)

        # 6) Build Variation report
        # This is a simple report
        #
        workspace = params['workspace_name']
        created_objects = []
        created_objects.append({
            "ref": var_obj_ref,
            "description": "Variation Object"
        })
        ReportConfig = {
            "ws_url": self.ws_url,
            "scratch": self.scratch,
        }
        ReportParams = {"variation_ref": var_obj_ref}
        vr = VariationReport(ReportConfig)
        htmlreport_dir = vr.create_variation_report(ReportParams)

        report = self.hr.create_html_report(htmlreport_dir, workspace,
                                            created_objects)
        report['variation_ref'] = var_obj_ref
        print(report)
        #END save_variation_from_vcf

        # At some point might do deeper type checking...
        if not isinstance(report, dict):
            raise ValueError('Method save_variation_from_vcf return value ' +
                             'report is not type dict as required.')
        # return the results
        return [report]

    def export_variation_as_vcf(self, ctx, params):
        """
        Export KBase variation object as Variant Call Format (VCF) file
        :param params: instance of type "export_variation_input" (## funcdef
           export_variation_as_vcf ## required input params: Variation object
           reference optional params: NA output report: Shock id pointing to
           exported vcf file) -> structure: parameter "input_var_ref" of type
           "obj_ref" (An X/Y/Z style reference)
        :returns: instance of type "export_variation_output" -> structure:
           parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_variation_as_vcf

        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        output = vtv.export_as_vcf(params)

        #END export_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_variation_as_vcf return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_variation_as_vcf(self, ctx, params):
        """
        Given a reference to a variation object and an output file name, return a Variant Call Format (VCF)
        file path and the variation object name.
        :param params: instance of type "get_variation_input" (## funcdef
           get_variation_as_vcf ## required input params: Variation object
           reference output file name optional params: NA output report: path
           to returned vcf name of variation object) -> structure: parameter
           "variation_ref" of type "obj_ref" (An X/Y/Z style reference),
           parameter "filename" of String
        :returns: instance of type "get_variation_output" -> structure:
           parameter "path" of type "filepath" (KBase file path to staging
           files), parameter "variation_name" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN get_variation_as_vcf
        vtv = VariationToVCF(self.callback_url, self.shared_folder)
        file = vtv.variation_to_vcf(params)

        #END get_variation_as_vcf

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method get_variation_as_vcf return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Code example #9
class MatrixUtil:
    def _validate_import_matrix_from_excel_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        logging.info('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        refs = {k: v for k, v in params.items() if "_ref" in k}

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs, scale)
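
A sketch of a params dict that would pass the validation above. The concrete values are only illustrative: 'ExpressionMatrix' must be one of the types returned by DataUtil.list_generic_types, and the scale must appear in SCALE_TYPES.

example_params = {
    'obj_type': 'ExpressionMatrix',        # must be in self.matrix_types
    'matrix_name': 'my_expression_matrix',
    'workspace_name': 'my_workspace',
    'scale': 'log2',                       # must be in SCALE_TYPES
    # at least one file source, checked in this order:
    # input_file_path, input_shock_id, input_staging_file_path
    'input_staging_file_path': 'matrices/my_matrix.xlsx',
    # any key containing '_ref' is collected into the `refs` dict
    'col_attributemapping_ref': '123/4/5',
}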

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)
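
_find_between is used later (see filter_matrix) to pull the bare type name out of a full workspace type string. A standalone illustration of the same regex:

import re

def find_between(s, start, end):
    return re.search('{}(.*){}'.format(start, end), s).group(1)

print(find_between('KBaseMatrices.ExpressionMatrix-2.0', r'\.', r'\-'))  # ExpressionMatrix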

    @staticmethod
    def _write_mapping_sheet(file_path, sheet_name, mapping, index):
        """
        _write_mapping_sheet: write mapping to sheet
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            writer.book = load_workbook(file_path)
            df.to_excel(writer, sheet_name=sheet_name)

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        report_params = {
            'message':
            '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _process_mapping_sheet(file_path, sheet_name):
        """
        _process_mapping_sheet: process mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, dtype='str')
        except XLRDError:
            return dict()
        else:
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping

    def _process_attribute_mapping_sheet(self, file_path, sheet_name,
                                         matrix_name, workspace_id):
        """
        _process_attribute_mapping_sheet: process attribute_mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = f'{matrix_name}_{sheet_name}'
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)
            import_attribute_mapping_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.attr_util.file_to_attribute_mapping(
                import_attribute_mapping_params)

            return ref.get('attribute_mapping_ref')

    @staticmethod
    def _file_to_df(file_path):
        logging.info('start parsing file content to data frame')

        try:
            df = pd.read_excel(file_path, sheet_name='data', index_col=0)

        except XLRDError:
            try:
                df = pd.read_excel(file_path, index_col=0)
                logging.warning(
                    'WARNING: A sheet named "data" was not found in the attached file,'
                    ' proceeding with the first sheet as the data sheet.')

            except XLRDError:

                try:
                    # let pandas' python engine infer the delimiter, then re-read with it
                    reader = pd.read_csv(file_path, sep=None, iterator=True)
                    inferred_sep = reader._engine.data.dialect.delimiter
                    df = pd.read_csv(file_path, sep=inferred_sep, index_col=0)
                except Exception:
                    raise ValueError(
                        'Cannot parse file. Please provide a valid TSV, Excel or CSV file'
                    )

        df.index = df.index.astype('str')
        df.columns = df.columns.astype('str')
        # fill NA with "None" so that they are properly represented as nulls in the KBase Object
        df = df.where((pd.notnull(df)), None)

        return df
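
The CSV fallback above reaches into pandas internals (reader._engine) to recover the delimiter that sep=None inferred. A more portable sketch of the same idea using the standard library's csv.Sniffer; this is an alternative approach, not what the class itself does:

import csv

import pandas as pd

def read_delimited(file_path):
    """Infer the delimiter from a sample of the file, then parse with pandas."""
    with open(file_path, 'r', newline='') as handle:
        sample = handle.read(4096)
    inferred_sep = csv.Sniffer().sniff(sample).delimiter
    return pd.read_csv(file_path, sep=inferred_sep, index_col=0)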

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        logging.info('Start reading and converting excel file data')
        data = refs

        df = self._file_to_df(file_path)

        matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': df.values.tolist()
        }

        data.update({'data': matrix_data})
        data.update(
            self._get_axis_attributes('col', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))
        data.update(
            self._get_axis_attributes('row', matrix_data, refs, file_path,
                                      matrix_name, workspace_id))

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data['attributes'] = {}
        data['search_attributes'] = []
        for k, v in metadata.items():
            k = k.strip()
            v = v.strip()
            if k in TYPE_ATTRIBUTES:
                data[k] = v
            else:
                data['attributes'][k] = v
                data['search_attributes'].append(" | ".join((k, v)))

        return data

    def _get_axis_attributes(self, axis, matrix_data, refs, file_path,
                             matrix_name, workspace_id):
        """Get the row/col_attributemapping and mapping of ids, validating as needed"""
        # Parameter specified mappings should take precedence over tabs in excel so only process
        # if attributemapping_ref is missing:
        attr_data = {}

        if refs.get(f'{axis}_attributemapping_ref'):
            attributemapping_ref = refs[f'{axis}_attributemapping_ref']
        else:
            attributemapping_ref = self._process_attribute_mapping_sheet(
                file_path, f'{axis}_attribute_mapping', matrix_name,
                workspace_id)

        if attributemapping_ref:
            attr_data[f'{axis}_attributemapping_ref'] = attributemapping_ref

        # col/row_mappings may not be supplied
        id_mapping = self._process_mapping_sheet(file_path, f'{axis}_mapping')
        if id_mapping:
            attr_data[f'{axis}_mapping'] = id_mapping
        # if no mapping, axis ids must match the attribute mapping
        elif attributemapping_ref:
            am_data = self.dfu.get_objects(
                {'object_refs': [attributemapping_ref]})['data'][0]['data']
            axis_ids = matrix_data[f'{axis}_ids']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an Excel file with a "
                    f"{name} mapping tab.")
            else:
                # no explicit mapping supplied, so generate an identity mapping for the axis ids
                attr_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return attr_data
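
The consistency check above reduces to a set difference between the matrix axis IDs and the instance keys of the attribute mapping, with an identity mapping generated when everything matches. A toy illustration with plain dictionaries:

axis_ids = ['gene_1', 'gene_2', 'gene_3']
am_instances = {'gene_1': ['10.2'], 'gene_2': ['3.4']}  # toy AttributeMapping instances

unmatched_ids = set(axis_ids) - set(am_instances.keys())
if unmatched_ids:
    print('IDs missing from the attribute mapping:', ', '.join(sorted(unmatched_ids)))
else:
    row_mapping = {x: x for x in axis_ids}  # trivial identity mapping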

    @staticmethod
    def _build_header_str(attribute_names):  #not going to be used

        header_str = ''
        width = 100.0 / len(attribute_names)

        header_str += '<tr class="header">'
        header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format(
            width)

        for attribute_name in attribute_names:
            header_str += '<th style="width:{0:.2f}%;"'.format(width)
            header_str += '>{}</th>'.format(attribute_name)
        header_str += '</tr>'

        return header_str

    def _build_html_str(self, row_mapping, attributemapping_data,
                        row_ids):  #not going to be used

        logging.info('Start building html replacement')

        attribute_names = [
            attributes.get('attribute')
            for attributes in attributemapping_data.get('attributes')
        ]

        header_str = self._build_header_str(attribute_names)

        table_str = ''

        instances = attributemapping_data.get('instances')

        for feature_id, attribute_id in row_mapping.items():
            if feature_id in row_ids:
                feature_instances = instances.get(attribute_id)

                table_str += '<tr>'
                table_str += '<td>{}</td>'.format(feature_id)

                for feature_instance in feature_instances:
                    table_str += '<td>{}</td>'.format(feature_instance)
                table_str += '</tr>'

        return header_str, table_str

    def _generate_search_html_report(self, header_str,
                                     table_str):  #generate search html report

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'kbase_icon.png'), output_directory)
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'templates',
                         'search_icon.png'), output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'search_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        logging.info('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name':
            'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    @staticmethod
    def _filter_value_data(value_data, remove_ids, dimension):
        """Filters a value matrix based on column or row ids"""
        def _norm_id(_id):
            return _id.replace(" ", "_")

        val_df = pd.DataFrame(value_data['values'],
                              index=value_data['row_ids'],
                              columns=value_data['col_ids'],
                              dtype='object')

        if dimension == 'row':
            filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=0,
                                           errors='ignore')
        elif dimension == 'col':
            filtered_df = val_df.drop(remove_ids, axis=1, errors='ignore')
            filtered_df = filtered_df.drop([_norm_id(x) for x in remove_ids],
                                           axis=1,
                                           errors='ignore')
        else:
            raise ValueError('Unexpected dimension: {}'.format(dimension))

        filtered_value_data = {
            "values": filtered_df.values.tolist(),
            "col_ids": list(filtered_df.columns),
            "row_ids": list(filtered_df.index),
        }

        return filtered_value_data
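
The filter above is a plain pandas label drop on either axis, applied twice so that both the raw IDs and their space-to-underscore normalised forms are removed. A toy end-to-end example of the same operation:

import pandas as pd

value_data = {
    'row_ids': ['gene 1', 'gene_2', 'gene_3'],
    'col_ids': ['s1', 's2'],
    'values': [[1, 2], [3, 4], [5, 6]],
}
val_df = pd.DataFrame(value_data['values'],
                      index=value_data['row_ids'],
                      columns=value_data['col_ids'])

remove_ids = ['gene 1', 'gene_3']
filtered_df = val_df.drop(remove_ids, axis=0, errors='ignore')
filtered_df = filtered_df.drop([x.replace(' ', '_') for x in remove_ids],
                               axis=0, errors='ignore')
print(filtered_df.index.tolist())  # ['gene_2']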

    def _standardize_df(self, df, with_mean=True, with_std=True):

        logging.info("Standardizing matrix data")

        df.fillna(0, inplace=True)

        x_train = df.values

        scaler = preprocessing.StandardScaler(with_mean=with_mean,
                                              with_std=with_std).fit(x_train)

        standardized_values = scaler.transform(x_train)

        standardize_df = pd.DataFrame(index=df.index,
                                      columns=df.columns,
                                      data=standardized_values)

        return standardize_df
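
StandardScaler performs column-wise z-scoring: each column is centred on its mean (with_mean) and divided by its population standard deviation (with_std). A minimal standalone check of the transformation used above:

import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'s1': [1.0, 2.0, 3.0], 's2': [10.0, 20.0, 30.0]})
scaler = preprocessing.StandardScaler().fit(df.values)
print(scaler.transform(df.values))
# each column now has mean 0 and (population) standard deviation 1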

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]

    def standardize_matrix(self, params):
        """
        standardize a matrix
        """

        input_matrix_ref = params.get('input_matrix_ref')
        workspace_name = params.get('workspace_name')
        new_matrix_name = params.get('new_matrix_name')
        with_mean = params.get('with_mean', 1)
        with_std = params.get('with_std', 1)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        input_matrix_obj = self.dfu.get_objects(
            {'object_refs': [input_matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        input_matrix_name = input_matrix_info[1]
        input_matrix_data = input_matrix_obj['data']

        if not new_matrix_name:
            current_time = time.localtime()
            new_matrix_name = input_matrix_name + time.strftime(
                '_%H_%M_%S_%Y_%m_%d', current_time)

        data_matrix = self.data_util.fetch_data({
            'obj_ref': input_matrix_ref
        }).get('data_matrix')
        df = pd.read_json(data_matrix)

        standardize_df = self._standardize_df(df, with_mean, with_std)

        new_matrix_data = {
            'row_ids': df.index.tolist(),
            'col_ids': df.columns.tolist(),
            'values': standardize_df.values.tolist()
        }

        input_matrix_data['data'] = new_matrix_data

        logging.info("Saving new standardized matrix object")
        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": input_matrix_info[2],
                "data": input_matrix_data,
                "name": new_matrix_name
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_matrix_obj_ref,
            'description': 'Standardized Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def filter_matrix(self, params):  #not going to be used
        """
        filter_matrix: create sub-matrix based on input feature_ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        remove_ids: comma-separated string of row/column ids to remove from the matrix
        dimension: 'row' or 'col', the dimension to filter on
        filtered_matrix_name: name of newly created filtered matrix object
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        remove_ids = params.get('remove_ids')
        dimension = params.get('dimension')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        matrix_type = self._find_between(matrix_info[2], r'\.', r'\-')

        value_data = matrix_data.get('data')
        remove_ids = [x.strip() for x in remove_ids.split(',')]
        filtered_value_data = self._filter_value_data(value_data, remove_ids,
                                                      dimension)

        # if the matrix has changed shape, update the mappings
        if len(filtered_value_data['row_ids']) < len(
                matrix_data['data']['row_ids']):
            if matrix_data.get('row_mapping'):
                matrix_data['row_mapping'] = {
                    k: matrix_data['row_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }
            if matrix_data.get('feature_mapping'):
                matrix_data['feature_mapping'] = {
                    k: matrix_data['feature_mapping'][k]
                    for k in filtered_value_data['row_ids']
                }

        if len(filtered_value_data['col_ids']) < len(
                matrix_data['data']['col_ids']):
            if matrix_data.get('col_mapping'):
                matrix_data['col_mapping'] = {
                    k: matrix_data['col_mapping'][k]
                    for k in filtered_value_data['col_ids']
                }
        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(matrix_type),
            'obj_name':
            filtered_matrix_name,
            'data':
            matrix_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal

    def search_matrix(self, params):  #not going to be used
        """
        search_matrix: generate an HTML report that allows users to select feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_attributemapping_ref = matrix_data.get('row_attributemapping_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_attributemapping_ref):
            raise ValueError(
                'Matrix object is missing either row_mapping or row_attributemapping_ref'
            )

        attributemapping_data = self.dfu.get_objects(
            {"object_refs": [row_attributemapping_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     attributemapping_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (obj_type, file_path, workspace_name, matrix_name, refs,
         scale) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)
        data['scale'] = scale
        if params.get('description'):
            data['description'] = params['description']

        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal
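
The importer expects an Excel workbook with a 'data' sheet plus optional 'metadata', '{row,col}_mapping' and '{row,col}_attribute_mapping' sheets (see _file_to_data and the helpers above). A sketch that writes a minimal workbook in that layout for testing; the file name and values are purely illustrative:

import pandas as pd

data_df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
                       index=['gene_1', 'gene_2'],
                       columns=['sample_1', 'sample_2'])
metadata_df = pd.DataFrame({'name': ['description'], 'value': ['toy matrix']})

with pd.ExcelWriter('toy_matrix.xlsx') as writer:
    data_df.to_excel(writer, sheet_name='data')
    metadata_df.to_excel(writer, sheet_name='metadata', index=False)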

    def export_matrix(self, params):
        """
        export_matrix: universal downloader for matrix data objects

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: selects the generics data to be retrieved,
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        and only data is needed
                        generics_module should be
                        {'data': 'FloatMatrix2D'}
        """
        logging.info('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.data_util.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'instance_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'instance_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            logging.warning('Missing key [data]')

        obj_data.update(obj_data.get('attributes', {}))  # flatten for printing
        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
Code example #10
def UploadFrommfmd(callback_url, params):
        """
        :param params: instance of type "UploadmfmdInParams" -> structure:
           parameter "path" of String, parameter "ws_name" of String,
           parameter "obj_name" of String
        :returns: instance of type "UploadOutput" -> structure: parameter
           "obj_ref" of String
        """
        # return variables are: output
        #BEGIN UploadFrommfmd
        print('Extracting motifs')
        motifList = parse_mfmd_output(params['path'])
        print(motifList)

        # The parsed motif list is used directly as the MotifSet object data
        MSO = motifList

        dfu = DataFileUtil(callback_url)
        save_objects_params = {
            'id': dfu.ws_name_to_id(params['ws_name']),
            'objects': [{
                'type': 'KBaseGeneRegulation.MotifSet',
                'data': MSO,
                'name': params['obj_name']
            }]
        }

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #END UploadFrommfmd

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFrommfmd return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
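
A hypothetical invocation of UploadFrommfmd; the callback URL, file path, workspace name and object name below are placeholders, not values from the original code:

upload_params = {
    'path': '/kb/module/work/tmp/example_mfmd_output.txt',  # placeholder mfmd output file
    'ws_name': 'my_workspace',                               # placeholder workspace name
    'obj_name': 'example_motif_set'
}
# result = UploadFrommfmd(callback_url, upload_params)
# result[0]['obj_ref'] -> 'workspace_id/object_id/version'
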
Code example #11
class AttributesUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.data_util = DataUtil(config)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"
        self.ONT_LABEL_DEL = " - "
        self.ONT_TERM_DEL = ":"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def file_to_attribute_mapping(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        attr_mapping = self._file_to_am_obj(scratch_file_path)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": attr_mapping,
                "name": params['output_obj_name']
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def append_file_to_attribute_mapping(self,
                                         staging_file_subdir_path,
                                         old_am_ref,
                                         output_ws_id,
                                         new_am_name=None):
        """append an attribute mapping file to existing attribute mapping object
        """

        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        append_am_data = self._file_to_am_obj(scratch_file_path)

        old_am_obj = self.dfu.get_objects({'object_refs':
                                           [old_am_ref]})['data'][0]

        old_am_info = old_am_obj['info']
        old_am_name = old_am_info[1]
        old_am_data = old_am_obj['data']

        new_am_data = self._check_and_append_am_data(old_am_data,
                                                     append_am_data)

        if not new_am_name:
            current_time = time.localtime()
            new_am_name = old_am_name + time.strftime('_%H_%M_%S_%Y_%m_%d',
                                                      current_time)

        info = self.dfu.save_objects({
            "id":
            output_ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": new_am_data,
                "name": new_am_name
            }]
        })[0]
        return {
            "attribute_mapping_ref": "%s/%s/%s" % (info[6], info[0], info[4])
        }

    def update_matrix_attribute_mapping(self, params):

        dimension = params.get('dimension')
        if dimension not in ['col', 'row']:
            raise ValueError('Please use "col" or "row" for input dimension')

        workspace_name = params.get('workspace_name')

        old_matrix_ref = params.get('input_matrix_ref')
        old_matrix_obj = self.dfu.get_objects(
            {'object_refs': [old_matrix_ref]})['data'][0]
        old_matrix_info = old_matrix_obj['info']
        old_matrix_data = old_matrix_obj['data']

        old_am_ref = old_matrix_data.get(
            '{}_attributemapping_ref'.format(dimension))

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        if not old_am_ref:
            raise ValueError(
                'Matrix object does not have {} attribute mapping'.format(
                    dimension))

        new_am_ref = self.append_file_to_attribute_mapping(
            params['staging_file_subdir_path'], old_am_ref, workspace_id,
            params['output_am_obj_name'])['attribute_mapping_ref']

        old_matrix_data['{}_attributemapping_ref'.format(
            dimension)] = new_am_ref

        info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "type": old_matrix_info[2],
                "data": old_matrix_data,
                "name": params['output_matrix_obj_name']
            }]
        })[0]

        new_matrix_obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        objects_created = [{
            'ref': new_am_ref,
            'description': 'Updated Attribute Mapping'
        }, {
            'ref': new_matrix_obj_ref,
            'description': 'Updated Matrix'
        }]

        report_params = {
            'message': '',
            'objects_created': objects_created,
            'workspace_name': workspace_name,
            'report_object_name':
            'import_matrix_from_biom_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'new_matrix_obj_ref': new_matrix_obj_ref,
            'new_attribute_mapping_ref': new_am_ref,
            'report_name': output['name'],
            'report_ref': output['ref']
        }

    def _check_and_append_am_data(self, old_am_data, append_am_data):

        exclude_keys = {'attributes', 'instances'}
        new_am_data = {
            k: old_am_data[k]
            for k in set(list(old_am_data.keys())) - exclude_keys
        }

        old_attrs = old_am_data.get('attributes')
        old_insts = old_am_data.get('instances')

        append_attrs = append_am_data.get('attributes')
        append_insts = append_am_data.get('instances')

        # checking duplicate attributes
        old_attrs_names = [old_attr.get('attribute') for old_attr in old_attrs]
        append_attrs_names = [
            append_attr.get('attribute') for append_attr in append_attrs
        ]

        duplicate_attrs = set(old_attrs_names).intersection(append_attrs_names)

        if duplicate_attrs:
            error_msg = 'Duplicate attribute mappings: [{}]'.format(
                duplicate_attrs)
            raise ValueError(error_msg)

        # checking missing instances
        missing_inst = old_insts.keys() - append_insts.keys()

        if missing_inst:
            error_msg = 'Appended attribute mapping is missing [{}] instances'.format(
                missing_inst)
            raise ValueError(error_msg)

        new_attrs = old_attrs + append_attrs
        new_am_data['attributes'] = new_attrs

        new_insts = deepcopy(old_insts)

        for inst_name, val in new_insts.items():
            append_val = append_insts.get(inst_name)
            val.extend(append_val)

        new_am_data['instances'] = new_insts

        return new_am_data
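
The merge above requires the appended mapping to introduce no duplicate attribute names and to cover every existing instance; the attribute lists are then concatenated and each instance's value list is extended in order. A toy illustration:

from copy import deepcopy

old_am = {
    'attributes': [{'attribute': 'height', 'source': 'upload'}],
    'instances': {'sample_1': ['1.8'], 'sample_2': ['1.6']},
}
append_am = {
    'attributes': [{'attribute': 'weight', 'source': 'upload'}],
    'instances': {'sample_1': ['70'], 'sample_2': ['55']},
}

new_attrs = old_am['attributes'] + append_am['attributes']
new_insts = deepcopy(old_am['instances'])
for inst_name, val in new_insts.items():
    val.extend(append_am['instances'][inst_name])

print(new_attrs)  # height followed by weight
print(new_insts)  # {'sample_1': ['1.8', '70'], 'sample_2': ['1.6', '55']}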

    def _am_data_to_df(self, data):
        """
        Converts AttributeMapping object data to a DataFrame
        """

        attributes = pd.DataFrame(data['attributes'])
        # rename() returns a new DataFrame, so keep the result (the bare call was a no-op)
        attributes = attributes.rename(columns=lambda x: x.replace("ont", "ontology").
                                       capitalize().replace("_", " "))
        instances = pd.DataFrame(data['instances'])
        am_df = attributes.join(instances)

        return am_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.data_util.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            list(cluster.get('id_to_data_position').keys())
            for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on columns
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = list(cluster.get('id_to_data_position').keys())
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a DataFrame"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.AttributeMapping" in obj_type:
            cs_df = self._am_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.AttributeMapping or KBaseExperiments.ClusterSet'
            raise ValueError(err_msg)

        return name, cs_df, obj_type

    def _file_to_am_obj(self, scratch_file_path):
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep=None, dtype='str')
        df = df.replace('nan', '')
        if df.columns[1].lower() == "attribute ontology id":
            am_obj = self._df_to_am_obj(df)
        else:
            am_obj = self._isa_df_to_am_object(df)
        return am_obj

    def _df_to_am_obj(self, am_df):
        """Converts a dataframe from a user file to a compound set object"""
        if not len(am_df):
            raise ValueError("No attributes in supplied files")

        attribute_df = am_df.filter(regex="[Uu]nit|[Aa]ttribute")
        instance_df = am_df.drop(attribute_df.columns, axis=1)
        if not len(instance_df.columns):
            raise ValueError(
                "Unable to find any instance columns in supplied file")

        attribute_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "attribute" not in attribute_df.columns:
            raise ValueError(
                "Unable to find a 'attribute' column in supplied file")
        attribute_df['source'] = 'upload'
        attribute_fields = ('attribute', 'unit', 'attribute_ont_id',
                            'unit_ont_id', 'source')
        attributes = attribute_df.filter(
            items=attribute_fields).to_dict('records')
        print(attributes)
        self._validate_attribute_values(
            am_df.set_index(attribute_df.attribute).iterrows())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation",
            'attributes': [self._add_ontology_info(f) for f in attributes],
            'instances': instance_df.to_dict('list')
        }

        return attribute_mapping

    def _isa_df_to_am_object(self, isa_df):
        skip_columns = {
            'Raw Data File', 'Derived Data File', 'Array Data File',
            'Image File'
        }
        if 'Sample Name' in isa_df.columns and not any(
                isa_df['Sample Name'].duplicated()):
            isa_df.set_index('Sample Name', inplace=True)
        elif 'Assay Name' in isa_df.columns and not any(
                isa_df['Assay Name'].duplicated()):
            isa_df.set_index('Assay Name', inplace=True)
        elif not any(isa_df[isa_df.columns[0]].duplicated()):
            logging.warning(f'Using {isa_df.columns[0]} as ID column')
            isa_df.set_index(isa_df.columns[0], inplace=True)
        else:
            raise ValueError(
                "Unable to detect an ID column that was unigue for each row. "
                f"Considered 'Sample Names', 'Assay Names' and {isa_df.columns[0]}"
            )
        self._validate_attribute_values(isa_df.iteritems())

        attribute_mapping = {
            'ontology_mapping_method': "User Curation - ISA format"
        }
        attribute_mapping[
            'attributes'], new_skip_cols = self._get_attributes_from_isa(
                isa_df, skip_columns)
        reduced_isa = isa_df.drop(columns=new_skip_cols, errors='ignore')
        attribute_mapping['instances'] = reduced_isa.T.to_dict('list')

        return attribute_mapping

    def _validate_attribute_values(self, attribute_series):
        errors = {}
        for attr, vals in attribute_series:
            try:
                validator = getattr(AttributeValidation, attr)
                attr_errors = validator(vals)
                if attr_errors:
                    errors[attr] = attr_errors
            except AttributeError:
                continue

        if errors:
            for attr, attr_errors in errors.items():
                logging.error(
                    f'Attribute {attr} had the following validation errors:\n'
                    + "\n".join(attr_errors) + '\n')
            raise ValueError(
                f'The following attributes failed validation: {", ".join(errors)}'
                f'\nSee the log for details')

    def _get_attributes_from_isa(self, isa_df, skip_columns):
        attributes = []
        # associate attribute columns with the other columns that relate to them
        for i, col in enumerate(isa_df.columns):
            if col.startswith('Term Source REF'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_ont'] = col
                else:
                    last_attr['_val_ont'] = col

            elif col.startswith('Term Accession Number'):
                # If the term accession is a web link, only grab the last part.
                # Similarly, the number is sometimes prefixed with the term source, e.g. UO_0000012
                isa_df[col] = isa_df[col].map(
                    lambda x: x.split("/")[-1].split("_")[-1])
                skip_columns.add(col)
                last_attr = attributes[-1]
                if '_unit' in last_attr:
                    last_attr['_unit_accession'] = col
                else:
                    last_attr['_val_accession'] = col

            elif col.startswith('Unit'):
                skip_columns.add(col)
                last_attr = attributes[-1]
                if last_attr.get('unit'):
                    raise ValueError(
                        "More than one unit column is supplied for attribute {}"
                        .format(last_attr['attribute']))
                last_attr['_unit'] = col

            elif col not in skip_columns:
                split_col = col.split("|", 1)
                if len(split_col) > 1:
                    attributes.append({
                        "attribute": split_col[0],
                        "attribute_ont_id": split_col[1],
                        "source": "upload"
                    })
                else:
                    attributes.append({"attribute": col, "source": "upload"})

        # handle the categories for each attribute
        for i, attribute in enumerate(attributes):
            if '_val_accession' in attribute:
                category_df = isa_df[[
                    attribute['attribute'],
                    attribute.pop('_val_ont'),
                    attribute.pop('_val_accession')
                ]].drop_duplicates()
                category_df[
                    'attribute_ont_id'] = category_df.iloc[:, 1].str.cat(
                        category_df.iloc[:, 2], ":")
                category_df['value'] = category_df[attribute['attribute']]
                cats = category_df.set_index(attribute['attribute'])[[
                    'value', 'attribute_ont_id'
                ]].to_dict('index')
                attribute['categories'] = {
                    k: self._add_ontology_info(v)
                    for k, v in cats.items()
                }

            if '_unit' in attribute:
                units = isa_df[attribute.pop('_unit')].unique()
                if len(units) > 1:
                    raise ValueError(
                        "More than one unit type is supplied for attribute {}: {}"
                        .format(attribute['attribute'], units))
                attribute['unit'] = units[0]
                if '_unit_ont' in attribute:
                    unit_ont = isa_df[attribute.pop('_unit_ont')].str.cat(
                        isa_df[attribute.pop('_unit_accession')],
                        ":").unique()
                    if len(unit_ont) > 1:
                        raise ValueError(
                            "More than one unit ontology is supplied for attribute "
                            "{}: {}".format(attribute['attribute'], unit_ont))
                    attribute['unit_ont_id'] = unit_ont[0]
            attributes[i] = self._add_ontology_info(attribute)
        return attributes, skip_columns
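
In an ISA-style table the parser above groups each attribute column with any immediately following 'Unit', 'Term Source REF' and 'Term Accession Number' columns, and an attribute header may carry its ontology ID after a '|'. A toy table in that layout (the ontology IDs shown are illustrative):

import pandas as pd

isa_df = pd.DataFrame({
    'Sample Name': ['s1', 's2'],                    # used as the ID column
    'growth temperature': ['30', '37'],             # plain attribute column
    'Unit': ['degree Celsius', 'degree Celsius'],   # unit for the preceding attribute
    'Term Source REF': ['UO', 'UO'],                # ontology source for that unit
    'Term Accession Number': ['UO_0000027', 'UO_0000027'],  # accession; suffix after '_' is kept
    'strain|NCBITaxon:562': ['E. coli', 'E. coli'], # attribute with ontology ID after '|'
})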

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, attribute):
        """Searches KBASE ontologies for terms matching the user supplied attributes and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        attribute = {
            k: v
            for k, v in attribute.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            attribute.get('attribute_ont_id', "").replace("_", ":"))
        if ont_info:
            attribute['attribute_ont_ref'] = ont_info['ontology_ref']
            attribute['attribute_ont_id'] = ont_info['id']
        elif not attribute.get(
                'attribute_ont_id') or attribute['attribute_ont_id'] == ":":
            attribute.pop('attribute_ont_id', None)

        if attribute.get('unit'):
            ont_info = self._search_ontologies(
                attribute.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                attribute['unit_ont_ref'] = ont_info['ontology_ref']
                attribute['unit_ont_id'] = ont_info['id']
            elif not attribute.get(
                    'unit_ont_id') or attribute['unit_ont_id'] == ":":
                attribute.pop('unit_ont_id', None)

        return attribute

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.AttributeMapping" in obj_type:
            df.to_excel(writer, "Attributes", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
Code example #12
    def run_FamaGenomeProfiling(self, ctx, params):
        """
        Run genome functional profiling module of Fama.
        :param params: instance of type "FamaGenomeProfilingParams"
           (Parameters for genome functional profiling. workspace_name - the
           name of the workspace for input/output genome_refs - references to
           a genome object ref_dataset - the name of Fama reference dataset
           output_result_name - the name of the output DomainAnnotation) ->
           structure: parameter "workspace_name" of String, parameter
           "genome_ref" of list of String, parameter "ref_dataset" of String,
           parameter "output_feature_set_name" of String, parameter
           "output_annotation_name" of String
        :returns: instance of type "ReportResults" (Output report parameters
           report_name - the name of the report object report_ref - the
           reference to the report object) -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_FamaGenomeProfiling
        # Import protein sequences from input genome_ref
        ws_client = Workspace(self.ws_url)
        input_genome_refs = params['genome_ref']
        fama_reference = params['ref_dataset']
        input_proteins = {}
        name2ref = {}
        for input_genome_ref in input_genome_refs:
            ret = ws_client.get_objects2(
                {'objects': [{
                    'ref': input_genome_ref
                }]})['data'][0]
            obj_data = ret['data']
            obj_name = ret['info'][1]
            obj_type = ret['info'][2].split('.')[1].split('-')[0]
            if obj_type == 'GenomeSet':
                print('GenomeSet data', obj_data)
                genome_refs = []
                if 'elements' in obj_data:
                    genome_refs = [
                        item['ref'] for item in obj_data['elements'].values()
                    ]
                elif 'items' in obj_data:
                    genome_refs = [item['ref'] for item in obj_data['items']]
                for sub_obj_ref in genome_refs:
                    ret = ws_client.get_objects2(
                        {'objects': [{
                            'ref': sub_obj_ref
                        }]})['data'][0]
                    genome_data = ret['data']
                    genome_name = ret['info'][1]
                    if genome_name in name2ref:
                        raise ServerError(
                            'All input genome names must be unique. Check ' +
                            genome_name)
                    name2ref[genome_name] = sub_obj_ref
                    proteins = genome_proteins_to_fasta(
                        genome_data, self.shared_folder)
                    input_proteins[genome_name] = {}
                    input_proteins[genome_name]['fwd'] = proteins
            elif obj_type == 'Genome':
                if obj_name in name2ref:
                    raise ServerError('All input genome names must be unique')
                name2ref[obj_name] = input_genome_ref
                proteins = genome_proteins_to_fasta(obj_data,
                                                    self.shared_folder)
                input_proteins[obj_name] = {}
                input_proteins[obj_name]['fwd'] = proteins
            else:
                raise ServerError('Incompatible object: ' + input_genome_ref +
                                  ' (' + obj_name + ')')

        self.log('Input sequence files:', str(input_proteins))
        self.log('reference: ', fama_reference)
        # Run Fama
        fama_params = {
            'input_proteins': input_proteins,
            'work_dir': self.shared_folder,
            'reference': fama_reference,
            'ws_name': params['workspace_name'],
            'ws_client': ws_client,
            'featureset_name': params['output_feature_set_name'],
            'annotation_prefix': params['output_annotation_name'],
            'name2ref': name2ref
        }
        fama_output = protein_functional_profiling_pipeline(fama_params)
        objects_created = fama_output['objects_created']

        dfu = DataFileUtil(self.callback_url)
        workspace_id = dfu.ws_name_to_id(params['workspace_name'])

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': fama_output['feature_set_data'],
                'name': params['output_feature_set_name']
            }]
        }

        try:
            dfu_oi = dfu.save_objects(save_object_params)[0]
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception saving feature set')
            self.log(str(dfue))
            raise
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0],
                                                dfu_oi[4])
        objects_created.append({
            'ref': feature_set_obj_ref,
            'description': 'Filtered genome features'
        })

        self.log('FeatureSet saved to ' + feature_set_obj_ref)

        # Write HTML output to workspace
        message = 'Fama protein functional profiling finished successfully'

        try:
            dfu_output = dfu.file_to_shock(
                {'file_path': fama_output['html_report']})
        except ServerError as dfue:
            # not really any way to test this block
            self.log('Logging exception loading results to shock')
            self.log(str(dfue))
            raise
        self.log('HTML report saved: ' + str(dfu_output))

        html_links = [{
            'shock_id': dfu_output['shock_id'],
            'description': 'HTML report for Fama App',
            'name': 'fama_report.html',
            'label': 'Fama_report'
        }]
        for krona_file in fama_output['krona_charts']:
            try:
                dfu_output = dfu.file_to_shock({'file_path': krona_file})
                html_links.append({
                    'shock_id':
                    dfu_output['shock_id'],
                    'description':
                    'Krona chart for function taxonomy profile',
                    'name':
                    fama_output['krona_charts'][krona_file][0],
                    'label':
                    fama_output['krona_charts'][krona_file][1]
                })
            except ServerError as dfue:
                # not really any way to test this block
                self.log('Logging exception loading results to shock')
                self.log(str(dfue))
                raise

        self.log('Krona chart saved: ' + str(dfu_output))

        # Save report
        report_params = {
            'message': message,
            'objects_created': objects_created,
            'direct_html_link_index': 0,
            'html_links': html_links,
            'file_links': fama_output['report_files'],
            'report_object_name': 'fama_profiling_report_' + str(uuid.uuid4()),
            'workspace_name': params['workspace_name'],
            'html_window_height': 460
        }
        try:
            self.log('Call KBaseReport at ' + str(self.callback_url))
            report = KBaseReport(self.callback_url)
            self.log('Ready to save KBase report: ' + str(report_params))
            report_info = report.create_extended_report(report_params)
        except ServerError as kre:
            # not really any way to test this block
            self.log('Logging exception saving report')
            self.log(str(kre))
            raise

        report_info['report_params'] = report_params
        self.log('KBase report saved: ' + str(report_info))
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_FamaGenomeProfiling

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_FamaGenomeProfiling return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
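This example, like several others on this page, builds a workspace reference string from the object-info tuple returned by DataFileUtil.save_objects by indexing positions 6, 0 and 4. A small hedged helper (not present in the original code) makes that recurring pattern explicit:

def obj_info_to_ref(info):
    """Format a KBase object-info tuple as 'workspace_id/object_id/version'."""
    # The tuple layout (object id at 0, version at 4, workspace id at 6) matches
    # the indexing used inline in the surrounding examples.
    return "{}/{}/{}".format(info[6], info[0], info[4])

# e.g. feature_set_obj_ref = obj_info_to_ref(dfu.save_objects(save_object_params)[0])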
Code example #13
File: MotifSuiteImpl.py Project: man4ish/MotifSuite
    def run_MotifSuite(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of type "motifsuite_seq_input" -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "SS_ref" of String, parameter "promoter_length"
           of Long, parameter "motif_min_length" of Long, parameter
           "motif_max_length" of Long, parameter "obj_name" of String,
           parameter "prb" of Double, parameter "motif_length" of Long,
           parameter "background" of Long, parameter "mask_repeats" of Long,
           parameter "background_group" of mapping from String to String,
           parameter "threshold" of Double, parameter "proportion" of Double
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_MotifSuite
        report = KBaseReport(self.callback_url)
        mfmd_obj = MotifFindermfmd(self.callback_url)
        homer_obj = MotifFinderHomer(self.callback_url)
        meme_obj = MotifFinderMEME(self.callback_url)
        gibbs_obj = MotifFinderGibbs(self.callback_url)
        ensemble_obj = MotifEnsemble(self.callback_url)
        mdscan_obj = MotifFinderMdscan(self.callback_url)
        sampler_obj = MotifFinderSampler(self.callback_url)
        
        # Each motif finder runs in its own subprocess; calling start() and
        # join() back to back executes them one after another (not in parallel).
        for finder_obj in (homer_obj, mfmd_obj, meme_obj, gibbs_obj,
                           mdscan_obj, sampler_obj):
            p = Process(target=finder_obj.DiscoverMotifsFromSequenceSet,
                        args=(params,))
            p.start()
            p.join()
 
        
        MSU = MotifSuiteUtil()
        params['motifset_refs'] = MSU.get_obj_refs()
        #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133','29716/72/134','29716/72/135','29716/72/136']
        #params['motifset_refs'] =['29716/72/131','29716/72/132','29716/72/133']
        print(params['motifset_refs'])
        #result = ensemble_obj.MotifEnsemble(params)
        #print('Ensemble RESULT:')
        #print(result)


        dms = DownloadMotifSets()
        MotifSetDict = dms.DownloadMotifSet(params['motifset_refs'], self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])
        fmu = FastaUtils()
        for i, MSR1 in enumerate(MotifSetDict.keys()):
            for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k, MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l, motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                            if fmu.CompareMotifsBP(motif1, motif2, threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m, mset in enumerate(matchSets):
                                    if (MSR1, j) in mset:
                                        found1 = True
                                        index1 = m
                                    if (MSR2, l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1, j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2, l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        # merge in place; set.union() alone would
                                        # discard its result
                                        matchSets[index1].update(matchSets[index2])
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append({(MSR1, j), (MSR2, l)})
        numMotifSets = len(params['motifset_refs'])
        threshold = float(params['proportion'])
        KeepSets = []
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i, mset in enumerate(matchSets):
            uniqueRefs = {}
            for pair in mset:  # pair is (motifset_ref, motif_index)
                if pair[0] not in uniqueRefs:
                    uniqueRefs[pair[0]] = pair[0]
            if float(len(uniqueRefs.keys())) / numMotifSets >= threshold:
                KeepSets.append(i)
        print(len(KeepSets))

        ESO = {}
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = fmu.merge(matchSets[keep], MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))


        # upload the new ensemble MotifSet object
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{'type': 'KBaseGeneRegulation.MotifSet',
                                           'data': ESO,
                                           'name': 'EnsembleMotifSet'}]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        mr = MakeNewReport()
        mr.MakeReport(htmlDir, ESO)


        try:
            html_upload_ret = dfu.file_to_shock({'file_path': htmlDir,
                                                 'make_handle': 0,
                                                 'pack': 'zip'})
        except Exception as err:
            raise ValueError('error uploading HTML file to shock: ' + str(err))


        reportName = 'MotifSuite_report_' + str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref': obj_ref, 'description': 'Ensemble motif set generated by MotifSuite'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach to report obj
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    'name': 'index.html',
                                    'label': 'Save promoter_download.zip'
                                    }
                                   ]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        output = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        
        #END run_MotifSuite

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_MotifSuite return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
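The nested loops in run_MotifSuite above group pairwise-similar motifs into shared match sets. Below is a self-contained hedged sketch of the same grouping idea, using toy data and a hypothetical `similar` callback in place of FastaUtils.CompareMotifsBP; it is an illustration, not the project's implementation.

def group_matches(items, similar):
    """Group (set_id, motif_index) pairs whose motifs compare as similar.

    Toy restatement of the merge logic in run_MotifSuite above; `items` maps a
    motif-set id to a list of motifs and `similar` is any pairwise comparison
    function (both are hypothetical stand-ins).
    """
    groups = []
    keys = list(items)
    for i, a in enumerate(keys):
        for j, m1 in enumerate(items[a]):
            for b in keys[i + 1:]:
                for l, m2 in enumerate(items[b]):
                    if not similar(m1, m2):
                        continue
                    # collect every existing group touching either endpoint,
                    # merge them, and append the merged group
                    hits = [g for g in groups if (a, j) in g or (b, l) in g]
                    merged = set().union(*hits) if hits else set()
                    merged.update({(a, j), (b, l)})
                    groups = [g for g in groups if g not in hits] + [merged]
    return groups

# Two toy "motif sets" compared with plain string equality; the result is a
# single group containing ('A', 0) and ('B', 0).
print(group_matches({'A': ['TATA', 'GGCC'], 'B': ['TATA']}, lambda x, y: x == y))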
Code example #14
File: BIOMUtil.py Project: man4ish/GenericsAPI
class BiomUtil:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.data_util = DataUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.matrix_types = [x.split(".")[1].split('-')[0]
                             for x in self.data_util.list_generic_types()]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: workspace name matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """
        # Example params observed during development (sample values):
        # {'obj_type': 'AmpliconMatrix', 'matrix_name': 'test_AmpliconMatrix',
        #  'workspace_name': 'man4ish_gupta:narrative_1568644342277',
        #  'biom_fasta': {'biom_file_biom_fasta': 'data/phyloseq_test.biom',
        #                 'fasta_file_biom_fasta': 'data/phyloseq_test.fa'},
        #  'scale': 'raw', 'description': 'OTU data',
        #  'amplicon_set_name': 'test_AmpliconSet',
        #  'col_attributemapping_ref': '44071/33/54'}

        (biom_file, tsv_file, fasta_file, mode, metadata_keys) = self._process_params(params)

        workspace_name = params.get('workspace_name')
        matrix_name = params.get('matrix_name')
        amplicon_set_name = params.get('amplicon_set_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file, mode, refs, matrix_name,
                                                    workspace_id, scale, description, metadata_keys)

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
                                                'obj_type': 'KBaseMatrices.{}'.format(obj_type),
                                                'obj_name': matrix_name,
                                                'data': amplicon_data,
                                                'workspace_name': workspace_id})['obj_ref']

        amplicon_set_data = self._file_to_amplicon_set_data(biom_file, tsv_file, fasta_file, mode,
                                                            refs, description, matrix_obj_ref)

        logging.info('start saving AmpliconSet object: {}'.format(amplicon_set_name))
        amplicon_set_obj_ref = self.data_util.save_object({
                                                'obj_type': 'KBaseExperiments.AmpliconSet',
                                                'obj_name': amplicon_set_name,
                                                'data': amplicon_set_data,
                                                'workspace_name': workspace_id})['obj_ref']

        logging.info('start resaving Matrix object with amplicon set: {}'.format(matrix_name))
        amplicon_data['amplicon_set_ref'] = '{}/{}'.format(workspace_id, amplicon_set_name)
        matrix_obj_ref = self.data_util.save_object({
                                                'obj_type': 'KBaseMatrices.{}'.format(obj_type),
                                                'obj_name': matrix_name,
                                                'data': amplicon_data,
                                                'workspace_name': workspace_id})['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref,
                     'amplicon_set_obj_ref': amplicon_set_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, amplicon_set_obj_ref,
                                              new_row_attr_ref, new_col_attr_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name', 'scale', 'amplicon_set_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = DEFAULT_META_KEYS

        if params.get('biom_tsv'):
            biom_tsv = params.get('biom_tsv')
            biom_file = biom_tsv.get('biom_file_biom_tsv')
            tsv_file = biom_tsv.get('tsv_file_biom_tsv')

            if not (biom_file and tsv_file):
                raise ValueError('missing BIOM or TSV file')

            biom_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': biom_file}).get('copy_file_path')

            tsv_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')
            mode = 'biom_tsv'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            biom_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': biom_file}).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': fasta_file}).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            tsv_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')

            fasta_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': fasta_file}).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [x.strip() for x in metadata_keys_str.split(',')]
            mode = 'tsv_fasta'
        elif params.get('tsv'):
            tsv = params.get('tsv')
            tsv_file = tsv.get('tsv_file_tsv')

            if not tsv_file:
                raise ValueError('missing TSV file')

            tsv_file = self.dfu.download_staging_file(
                                {'staging_file_subdir_path': tsv_file}).get('copy_file_path')

            metadata_keys_str = tsv.get('metadata_keys_tsv')
            if metadata_keys_str:
                metadata_keys += [x.strip() for x in metadata_keys_str.split(',')]

            mode = 'tsv'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode, list(set(metadata_keys)))

    def _retrieve_value(self, biom_metadata_dict, tsv_metadata_df, key, required=False):

        #exit(tsv_metadata_df)  defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fdb3037f378>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']})
        #exit(key) taxonomy
        #exit(biom_metadata_dict) none
        if key in biom_metadata_dict:
            return {k.lower(): v for k, v in biom_metadata_dict.items()}.get(key)
        elif key in tsv_metadata_df:
            return {k.lower(): v for k, v in tsv_metadata_df.items()}.get(key)
        elif required:
            raise ValueError('missing necessary [{}] from file'.format(key))
        else:
            return None

    def _search_taxon(self, scientific_name):
        """
        logic borrowed from: GFU.GenomeInterface
        https://github.com/kbaseapps/GenomeFileUtil/blob/master/lib/GenomeFileUtil/core/GenomeInterface.py#L216
        """
        taxon_id = None

        search_params = {
            "object_types": ["taxon"],
            "match_filter": {
                "lookup_in_keys": {
                    "scientific_name": {"value": scientific_name}},
                "exclude_subobjects": 1
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "sorting_rules": [{
                "is_object_property": 0,
                "property": "timestamp",
                "ascending": 0
            }]
        }

        objects = self.kbse.search_objects(search_params)['objects']

        if not objects:
            search_params['match_filter']['lookup_in_keys'] = {
                "aliases": {"value": scientific_name}
            }
            objects = self.kbse.search_objects(search_params)['objects']
        if objects:
            taxon_id = objects[0].get('object_name')
        #exit(taxon_id)  561_taxon
        return taxon_id

    def _fetch_taxon_level(self, taxon_char):

        taxon_level_mapping = {'l': 'Life', 'd': 'Domain', 'k': 'Kingdom', 'p': 'Phylum',
                               'c': 'Class', 'o': 'Order', 'f': 'Family', 'g': 'Genus',
                               's': 'Species'}
        return taxon_level_mapping.get(taxon_char[0].lower(), 'Unknown')

    def _fetch_taxonomy(self, datarow):
        #exit(datarow) defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7f7ca8e8d950>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']})
        lineage = self._retrieve_value([], datarow, 'taxonomy')
        #exit(lineage)  ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']
        if isinstance(lineage, str):
            delimiter = csv.Sniffer().sniff(lineage).delimiter
            lineage = [x.strip() for x in lineage.split(delimiter)]
            #exit(lineage)  ['k__Bacteria', 'k__Bacteria']
        taxonomy = {'lineage': lineage}
        

        for key in ['score', 'taxonomy_source', 'species_name']:
            val = self._retrieve_value([], datarow, key)
            if val:
                taxonomy[key] = val
        #exit(key) species_name
        for item in lineage[::-1]:
            scientific_name = item.split('_')[-1]
            taxon_level_char = item.split('_')[0]
            if scientific_name:
                taxon_id = self._search_taxon(scientific_name)
                if taxon_id:
                    taxon_ref = f"{self.taxon_wsname}/{taxon_id}"
                    taxon_level = self._fetch_taxon_level(taxon_level_char)

                    taxonomy.update({'taxon_ref': taxon_ref,
                                     'taxon_id': taxon_id,
                                     'scientific_name': scientific_name,
                                     'taxon_level': taxon_level})
                    break
        #exit(taxonomy) {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}
        return taxonomy

    def _retrieve_tsv_amplicon_set_data(self, tsv_file):              #tsv file is data/amplicon_test.tsv
        amplicons = dict()
        
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start processing each row in TSV')
        for observation_id in df.index:
            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'],
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished parsing TSV file')
        
        return amplicons
        '''
        {'GG_OTU_1': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'TTGGCC', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'TTCCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_6': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}}
        '''

    def _retrieve_tsv_fasta_amplicon_set_data(self, tsv_file, fasta_file):
        #tsvfile = data/amplicon_test.tsv
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")      #{'GG_OTU_1' : SeqRecord(...), ...}
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid FASTA file')

        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid TSV file')

        logging.info('start processing files')
        for observation_id in df.index:
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                        'taxonomy': taxonomy}
            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons
        '''
        {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_6': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}}
        '''

    def _retrieve_biom_fasta_amplicon_set_data(self, biom_file, fasta_file):
        #exit(biom_file)  data/phyloseq_test.biom
        amplicons = dict()
        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid FASTA file')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in fastq_dict:
                raise ValueError('FASTA file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(observation_metadata[index])

            amplicon = {'consensus_sequence': str(fastq_dict.get(observation_id).seq),
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        return amplicons
        '''
        {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'], 'taxon_ref': 'ReferenceTaxons/748770_taxon', 'taxon_id': '748770_taxon', 'scientific_name': 'Dolichospermum', 'taxon_level': 'Genus'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__'], 'taxon_ref': 'ReferenceTaxons/2207_taxon', 'taxon_id': '2207_taxon', 'scientific_name': 'Methanosarcina', 'taxon_level': 'Genus'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'], 'taxon_ref': 'ReferenceTaxons/2330_taxon', 'taxon_id': '2330_taxon', 'scientific_name': 'Halanaerobium', 'taxon_level': 'Genus'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}}
        '''
        

    def _retrieve_biom_tsv_amplicon_set_data(self, biom_file, tsv_file):
        amplicons = dict()
        try:
            logging.info('start parsing TSV file')
            reader = pd.read_csv(tsv_file, sep=None, iterator=True)
            inferred_sep = reader._engine.data.dialect.delimiter
            df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
        except Exception:
            raise ValueError('Cannot parse file. Please provide a valid TSV file')

        if 'consensus_sequence' not in df.columns.tolist():
            raise ValueError('TSV file does not include consensus_sequence')

        logging.info('start parsing BIOM file')
        table = biom.load_table(biom_file)

        observation_ids = table._observation_ids.tolist()
        observation_metadata = table._observation_metadata

        logging.info('start processing files')
        for index, observation_id in enumerate(observation_ids):
            if observation_id not in df.index:
                raise ValueError('TSV file does not have [{}] OTU id'.format(observation_id))

            taxonomy = self._fetch_taxonomy(df.loc[observation_id])

            amplicon = {'consensus_sequence': df.loc[observation_id, 'consensus_sequence'],
                        'taxonomy': taxonomy}

            amplicons.update({observation_id: amplicon})

        logging.info('finished processing files')
        '''
        {'GG_OTU_1': {'consensus_sequence': 'AACCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_2': {'consensus_sequence': 'TTGGCC', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_3': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_1', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_4': {'consensus_sequence': 'AACCTT', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}, 'GG_OTU_5': {'consensus_sequence': 'TTCCGG', 'taxonomy': {'lineage': ['k__Bacteria', 'k__Bacteria'], 'taxonomy_source': 'source_2', 'taxon_ref': 'ReferenceTaxons/2_taxon', 'taxon_id': '2_taxon', 'scientific_name': 'Bacteria', 'taxon_level': 'Kingdom'}}}
        '''
        return amplicons

    def _file_to_amplicon_set_data(self, biom_file, tsv_file, fasta_file, mode, refs, description,
                                   matrix_obj_ref):

        logging.info('start parsing amplicon_set_data')

        amplicon_set_data = dict()

        if mode == 'biom_tsv':
            amplicons = self._retrieve_biom_tsv_amplicon_set_data(biom_file, tsv_file)
        elif mode == 'biom_fasta':
            amplicons = self._retrieve_biom_fasta_amplicon_set_data(biom_file, fasta_file)
        elif mode == 'tsv_fasta':
            amplicons = self._retrieve_tsv_fasta_amplicon_set_data(tsv_file, fasta_file)
        elif mode == 'tsv':
            amplicons = self._retrieve_tsv_amplicon_set_data(tsv_file)
        else:
            raise ValueError('error parsing _file_to_amplicon_set_data, mode: {}'.format(mode))

        amplicon_set_data.update({'amplicons': amplicons})

        if 'reads_set_ref' in refs:
            amplicon_set_data['reads_set_ref'] = refs.get('reads_set_ref')

        if description:
            amplicon_set_data['description'] = description

        matrix_obj_ref_array = matrix_obj_ref.split('/')
        amplicon_set_data['amplicon_matrix_ref'] = '{}/{}'.format(matrix_obj_ref_array[0],
                                                                  matrix_obj_ref_array[1])
        '''
        {'amplicons': {'GG_OTU_1': {'consensus_sequence': 'ACTGACTAGCTAGCTAACTG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}, 'GG_OTU_2': {'consensus_sequence': 'GCATCGTAGCTAGCTACGAT', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__'], 'taxon_ref': 'ReferenceTaxons/748770_taxon', 'taxon_id': '748770_taxon', 'scientific_name': 'Dolichospermum', 'taxon_level': 'Genus'}}, 'GG_OTU_3': {'consensus_sequence': 'CATCGATCGTACGTACGTAG', 'taxonomy': {'lineage': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__'], 'taxon_ref': 'ReferenceTaxons/2207_taxon', 'taxon_id': '2207_taxon', 'scientific_name': 'Methanosarcina', 'taxon_level': 'Genus'}}, 'GG_OTU_4': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum'], 'taxon_ref': 'ReferenceTaxons/2330_taxon', 'taxon_id': '2330_taxon', 'scientific_name': 'Halanaerobium', 'taxon_level': 'Genus'}}, 'GG_OTU_5': {'consensus_sequence': 'ATCGATCGATCGTACGATCG', 'taxonomy': {'lineage': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__'], 'taxon_ref': 'ReferenceTaxons/561_taxon', 'taxon_id': '561_taxon', 'scientific_name': 'Escherichia', 'taxon_level': 'Genus'}}}, 'description': 'OTU data', 'amplicon_matrix_ref': '44071/21'}
        '''
        return amplicon_set_data

    def _file_to_amplicon_data(self, biom_file, tsv_file, mode, refs, matrix_name, workspace_id,
                               scale, description, metadata_keys=None):

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {'row_ids': table._observation_ids.tolist(),
                           'col_ids': table._sample_ids.tolist(),
                           'values': table.matrix_data.toarray().tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError('Cannot parse file. Please provide a valid TSV file')
            else:
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError('TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found non-numeric values. The matrix may contain only numeric values.\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field.'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                matrix_data = {'row_ids': df.index.tolist(),
                               'col_ids': df.columns.tolist(),
                               'values': df.values.tolist()}

            logging.info('start building attribute mapping object')
            amplicon_data.update(self.get_attribute_mapping("row", observation_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id, metadata_df))
            amplicon_data.update(self.get_attribute_mapping("col", sample_metadata,
                                                            matrix_data, matrix_name, refs,
                                                            workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError('error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [f'{k}|{v}' for k, v in amplicon_data['attributes'].items()]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description
        '''
        {'col_attributemapping_ref': '44071/33/24', 'row_attributemapping_ref': '44071/19/119', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5'}, 'col_mapping': {'Sample1': 'Sample1', 'Sample2': 'Sample2', 'Sample3': 'Sample3', 'Sample4': 'Sample4', 'Sample5': 'Sample5', 'Sample6': 'Sample6'}, 'attributes': {'generated_by': 'QIIME revision XYZ'}, 'data': {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]}, 'search_attributes': ['generated_by|QIIME revision XYZ'], 'scale': 'raw', 'description': 'OTU data'}
        '''
        return amplicon_data

    def get_attribute_mapping(self, axis, metadata, matrix_data, matrix_name, refs,  workspace_id,
                              metadata_df=None):
        '''
        getting mapping data based on refs or metadata or metadata_df
        '''
        #exit(metadata)
        '''
        (defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf730>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf9d8>, {'taxonomy': ['k__Bacteria', 'p__Cyanobacteria', 'c__Nostocophycideae', 'o__Nostocales', 'f__Nostocaceae', 'g__Dolichospermum', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35faf6a8>, {'taxonomy': ['k__Archaea', 'p__Euryarchaeota', 'c__Methanomicrobia', 'o__Methanosarcinales', 'f__Methanosarcinaceae', 'g__Methanosarcina', 's__']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35fafd08>, {'taxonomy': ['k__Bacteria', 'p__Firmicutes', 'c__Clostridia', 'o__Halanaerobiales', 'f__Halanaerobiaceae', 'g__Halanaerobium', 's__Halanaerobiumsaccharolyticum']}), defaultdict(<function Table._cast_metadata.<locals>.cast_metadata.<locals>.<lambda> at 0x7fbe35fafea0>, {'taxonomy': ['k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae', 'g__Escherichia', 's__']}))

        '''

        #exit(matrix_data)  {'row_ids': ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5'], 'col_ids': ['Sample1', 'Sample2', 'Sample3', 'Sample4', 'Sample5', 'Sample6'], 'values': [[0.0, 0.0, 1.0, 0.0, 0.0, 0.0], [5.0, 1.0, 0.0, 2.0, 3.0, 1.0], [0.0, 0.0, 1.0, 4.0, 2.0, 0.0], [2.0, 1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 1.0, 0.0, 0.0, 0.0]]}
        #exit(matrix_name) test_AmpliconMatrix
        #exit(refs)  {'col_attributemapping_ref': '44071/33/51'}
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        #exit(axis_ids)  ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5']
        if refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs': [refs[f'{axis}_attributemapping_ref']]}
            )['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(f"The following {name} IDs from the uploaded matrix do not match "
                                 f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                                 f"\nPlease verify the input data or upload an excel file with a"
                                 f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        '''
        {'row_attributemapping_ref': '44071/19/122', 'row_mapping': {'GG_OTU_1': 'GG_OTU_1', 'GG_OTU_2': 'GG_OTU_2', 'GG_OTU_3': 'GG_OTU_3', 'GG_OTU_4': 'GG_OTU_4', 'GG_OTU_5': 'GG_OTU_5', 'GG_OTU_6': 'GG_OTU_6'}} 
        '''
        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name, ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in attribute_keys]

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()

        logging.info('start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        # 44071/19/128
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name, ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{'attribute': key, 'source': 'upload'} for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [str(meta[attr]) for attr in metadata_keys]

        logging.info('start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id": ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        # 44071/19/134
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_report(self, matrix_obj_ref, amplicon_set_obj_ref, new_row_attr_ref,
                         new_col_attr_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        objects_created = [{'ref': matrix_obj_ref, 'description': 'Imported Amplicon Matrix'},
                           {'ref': amplicon_set_obj_ref, 'description': 'Imported Amplicon Set'}]

        if new_row_attr_ref:
            objects_created.append({'ref': new_row_attr_ref,
                                    'description': 'Imported Amplicons(Row) Attribute Mapping'})

        if new_col_attr_ref:
            objects_created.append({'ref': new_col_attr_ref,
                                    'description': 'Imported Samples(Column) Attribute Mapping'})

        report_params = {'message': '',
                         'objects_created': objects_created,
                         'workspace_name': workspace_name,
                         'report_object_name': 'import_matrix_from_biom_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}
        #{'report_name': 'import_matrix_from_biom_db306341-c03a-4e60-b8a4-2bd7f6a48925', 'report_ref': '44071/200/1'}
        return report_output

    def _df_to_tsv(self, amplicon_set_df, result_dir, amplicon_set_ref):    # not currently called anywhere
        logging.info('writing amplicon set data frame to TSV file')
        amplicon_set_obj = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]
        amplicon_set_info = amplicon_set_obj['info']
        amplicon_set_name = amplicon_set_info[1]

        file_path = os.path.join(result_dir, amplicon_set_name + ".tsv")

        amplicon_set_df.to_csv(file_path, sep='\t', index=True, header=True)

        return file_path

    def _amplicon_set_to_df(self, amplicon_set_ref):                       # not currently called anywhere
        logging.info('converting amplicon set to data frame')
        am_set_data = self.dfu.get_objects({'object_refs': [amplicon_set_ref]})['data'][0]['data']

        amplicon_matrix_ref = am_set_data.get('amplicon_matrix_ref')
        matrix_data = self.dfu.get_objects({'object_refs': [amplicon_matrix_ref]})['data'][0]['data']
        matrix_value_data = matrix_data.get('data')

        index = matrix_value_data.get('row_ids')
        columns = matrix_value_data.get('col_ids')
        values = matrix_value_data.get('values')

        df = pd.DataFrame(values, index=index, columns=columns)

        amplicons = am_set_data.get('amplicons')

        meta_index = list()

        meta_columns = ['taxonomy', 'taxon_id', 'taxon_ref', 'taxon_level', 'score',
                        'taxonomy_source', 'species_name', 'consensus_sequence']
        meta_values = list()
        for otu_id, amplicon in amplicons.items():
            meta_index.append(otu_id)

            taxonomy_data = amplicon.get('taxonomy')

            taxonomy = taxonomy_data.get('lineage')
            taxon_id = taxonomy_data.get('taxon_id')
            taxon_ref = taxonomy_data.get('taxon_ref')
            taxon_level = taxonomy_data.get('taxon_level')
            score = taxonomy_data.get('score')
            taxonomy_source = taxonomy_data.get('taxonomy_source')
            species_name = taxonomy_data.get('species_name')

            consensus_sequence = amplicon.get('consensus_sequence')

            meta_values.append([taxonomy, taxon_id, taxon_ref, taxon_level, score, taxonomy_source,
                                species_name, consensus_sequence])

        meta_df = pd.DataFrame(meta_values, index=meta_index, columns=meta_columns)

        merged_df = df.merge(meta_df, left_index=True, right_index=True, how='left',
                             validate='one_to_one')
        
        return merged_df
   
    def export_amplicon_set_tsv(self, params):   # not going to be called anywhere
        """
        export AmpliconSet as TSV
        """
        logging.info('start exporting amplicon set object')
        amplicon_set_ref = params.get('input_ref')

        amplicon_set_df = self._amplicon_set_to_df(amplicon_set_ref)

        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        self._df_to_tsv(amplicon_set_df, result_dir, amplicon_set_ref)

        package_details = self.dfu.package_for_download({
            'file_path': result_dir,
            'ws_refs': [amplicon_set_ref]
        })

        return {'shock_id': package_details['shock_id']}
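A hedged usage sketch of the exporter above; the enclosing utility class is not shown in this snippet, so the class name BiomUtil, its config, and the object reference below are all placeholders:

# Hypothetical driver for export_amplicon_set_tsv (class name, config and ref are placeholders)
util = BiomUtil(config)                                            # enclosing utility class, built elsewhere
result = util.export_amplicon_set_tsv({'input_ref': '44071/4/1'})  # example AmpliconSet reference
print(result['shock_id'])                                          # shock node id of the packaged TSV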
Code example #15
class expsfileuploadUtil:
    def __init__(self, params):
        self.params = params
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)
        self.data_folder = os.path.abspath("/kb/module/data/")
        # This is where files from staging area exist
        self.staging_folder = os.path.abspath("/staging/")
        self.shared_folder = params["shared_folder"]
        self.scratch_folder = os.path.join(params["shared_folder"], "scratch")

    def upload_expsfile(self):
        """
        The upload method

        We perform a number of steps:
        Get name of expsfile as it is in staging.
        Find the expsfile in /staging/expsfile_name
        Get the output name for the expsfile
        Get the column headers for the exps file for
            data and testing purposes. 
        Test if expsfile is well-formed.
        We send the file to shock using dfu.
        We get the handle and save the object with all
            the necessary information- including related genome.
        params should include:
            username,
            staging_file_name,
            genome_ref,
            description,
            output_name
        """

        print("params: ", self.params)
        self.validate_import_expsfile_from_staging_params()

        # Name of file in staging: (file name or absolute path?)
        staging_exps_fp_name = self.params["staging_file_name"]

        # Output name of exps file:
        expsfile_name = self.params["output_name"]

        print("expsfile_name: ", expsfile_name)
        print("top dir /:", os.listdir("/"))
        print("/kb/module/:", os.listdir("/kb/module"))
        if not os.path.exists(self.staging_folder):
            raise Exception("Staging dir does not exist yet!")
        else:
            print("Succesfully recognized staging directory")

        # This is the path to the exps file
        expsfile_fp = os.path.join(self.staging_folder, staging_exps_fp_name)

        # We check correctness of the exps file. Returns the column headers, line count, and set names
        column_header_list, num_rows, setNames = self.check_exps_file(
                                                                    expsfile_fp)

        # We copy the file from staging to scratch
        new_exps_fp = os.path.join(self.shared_folder, expsfile_name)
        shutil.copyfile(expsfile_fp, new_exps_fp)
        expsfile_fp = new_exps_fp

        # We create the handle for the object:
        file_to_shock_result = self.dfu.file_to_shock(
            {"file_path": expsfile_fp, "make_handle": True, "pack": "gzip"}
        )

        # The following var res_handle only created for simplification of code
        res_handle = file_to_shock_result["handle"]

        # We create a better Description by adding date time and username
        date_time = datetime.datetime.utcnow()
        #new_desc = "Uploaded by {} on (UTC) {} using Uploader. User Desc: ".format(
        #        self.params['username'], str(date_time))

        # We create the data for the object
        exps_data = {
            "file_type": "KBasePoolTSV.Experiments",
            "expsfile": res_handle["hid"],
            # below should be shock
            "handle_type": res_handle["type"],
            "shock_url": res_handle["url"],
            "shock_node_id": res_handle["id"],
            "compression_type": "gzip",
            "file_name": res_handle["file_name"],
            "utc_created": str(date_time),
            "column_header_list": column_header_list,
            "num_lines": str(num_rows),
            "related_genome_ref": self.params["genome_ref"],
            "related_organism_scientific_name": self.get_genome_organism_name(
                self.params["genome_ref"]
            ),
            "description": self.params["description"],
        }

        # To get workspace id:
        ws_id = self.params["workspace_id"]
        save_object_params = {
            "id": ws_id,
            "objects": [
                {
                    "type": "KBasePoolTSV.Experiments",
                    "data": exps_data,
                    "name": expsfile_name,
                }
            ],
        }
        # save_objects returns a list of object_infos
        dfu_object_info = self.dfu.save_objects(save_object_params)[0]
        print("dfu_object_info: ")
        print(dfu_object_info)
        return {
            "Name": dfu_object_info[1],
            "Type": dfu_object_info[2],
            "Date": dfu_object_info[3],
        }

    def validate_import_expsfile_from_staging_params(self):

        # check for required parameters
        for p in [
            "username",
            "staging_file_name",
            "genome_ref",
            "description",
            "output_name"
        ]:
            if p not in self.params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def check_exps_file(self, expsfile_fp):

        required = [
            "SetName",
            "Index",
            "Description",
            "Date_pool_expt_started",
        ]

        cols, num_rows, setNames = self.read_table(expsfile_fp, required)

        return [cols, num_rows, setNames]

    def read_table(self, fp, required):
        """
        Following function takes a filename and a list of required fields i
        (file is TSV)
        returns list of headers
        Does not return header line
        """
        with open(fp, "r") as f:
            file_str = f.read()
        file_list = file_str.split("\n")
        header_line = file_list[0]
        # Check for Mac Style Files
        if re.search(r"\t", header_line) and re.search(r"\r", header_line):
            raise Exception(
                (
                    "Tab-delimited input file {} is a Mac-style text file "
                    "which is not supported.\n"
                    "Use\ndos2unix -c mac {}\n to convert it to a Unix "
                    "text file.\n"
                ).format(fp, fp)
            )
        cols = header_line.split("\t")
        cols_dict = {}
        for i in range(len(cols)):
            cols_dict[cols[i]] = i
        for field in required:
            if field not in cols_dict:
                raise Exception(
                    "No field {} in {}. Must include fields".format(field, fp)
                    + "\n{}".format(" ".join(required))
                )
        rows = []
        # This is unique to Experiments
        setNames = []
        for i in range(1, len(file_list)):
            line = file_list[i]
            # if last line empty
            if len(line) == 0:
                continue
            line = re.sub(r"[\r\n]+$", "", line)
            split_line = line.split("\t")
            setNames.append(split_line[0])
            if not len(split_line) == len(cols):
                raise Exception(
                    "Wrong number of columns in:\n{}\nin {} l:{}".format(line, fp, i)
                )
            new_dict = {}
            for j in range(len(cols)):
                new_dict[cols[j]] = split_line[j]
            rows.append(new_dict)

        return [cols, len(file_list), setNames]
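As a concrete illustration of the input read_table expects, the hedged sketch below writes a tiny tab-separated experiments file containing the required headers (all values fabricated) and notes what the parsed results would look like:

# Fabricated example of a well-formed experiments TSV for read_table
import os
import tempfile

tsv_text = (
    "SetName\tIndex\tDescription\tDate_pool_expt_started\n"
    "set1\tIT001\tLB control\t2020-01-15\n"
    "set1\tIT002\tLB plus inhibitor\t2020-01-15\n"
)
exps_fp = os.path.join(tempfile.mkdtemp(), "experiments.tsv")
with open(exps_fp, "w") as f:
    f.write(tsv_text)

# uploader.read_table(exps_fp, ["SetName", "Index", "Description", "Date_pool_expt_started"])
# would return the headers, the raw line count of the file, and the SetName column:
#   cols     -> ['SetName', 'Index', 'Description', 'Date_pool_expt_started']
#   setNames -> ['set1', 'set1']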

    def get_genome_organism_name(self, genome_ref):
        # Getting the organism name using WorkspaceClient
        ws = self.params["ws_obj"]
        res = ws.get_objects2(
            {"objects": [{"ref": genome_ref, "included": ["scientific_name"]}]}
        )
        scientific_name = res["data"][0]["data"]["scientific_name"]
        return scientific_name
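A hedged end-to-end sketch of driving the expsfileuploadUtil class above; every value is a placeholder, and ws_client stands in for a Workspace client the caller is assumed to construct (get_genome_organism_name expects it under the 'ws_obj' key):

# Hypothetical invocation of the uploader above (all values are placeholders)
params = {
    "username": "some_user",
    "staging_file_name": "experiments.tsv",    # file already present in /staging/
    "genome_ref": "12345/6/7",                 # related KBase genome reference
    "description": "example experiments upload",
    "output_name": "my_experiments",
    "workspace_id": 12345,
    "shared_folder": "/kb/module/work/tmp",
    "ws_obj": ws_client,                       # Workspace client built by the caller
}
uploader = expsfileuploadUtil(params)
result = uploader.upload_expsfile()            # -> {"Name": ..., "Type": ..., "Date": ...}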
Code example #16
class PDBUtil:

    # “Expect Value” threshold to restrict which alignments will be significant
    E_VALUE_THRESH = 1e-20

    # BLAST sequence identity threshold to determine which pdb structures will be
    # matched to a KBase genome/feature
    B_IDENTITY_THRESH = 0.6

    def _validate_import_pdb_file_params(self, params):
        """
            _validate_import_pdb_file_params:
                validates input params to import_model_pdb_file and import_experiment_pdb_file
        """
        # check for required parameters
        for p in ['structure_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path': params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params.get('workspace_name'), params.get(
            'structure_name')
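The validator above requires structure_name and workspace_name plus at least one of three file sources, checked in the order shown; a hedged sketch of the three accepted param shapes (all values are placeholders):

# Three alternative param shapes accepted by _validate_import_pdb_file_params (placeholder values)
params_local = {'structure_name': '1fat', 'workspace_name': 'my_ws',
                'input_file_path': '/kb/module/work/tmp/1fat.pdb'}    # file already on local disk
params_shock = {'structure_name': '1fat', 'workspace_name': 'my_ws',
                'input_shock_id': 'example-shock-node-id'}            # node is fetched into scratch
params_staging = {'structure_name': '1fat', 'workspace_name': 'my_ws',
                  'input_staging_file_path': 'subdir/1fat.pdb'}       # relative to the staging area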

    def _model_file_to_data(self, file_path, params):
        """
            _model_file_to_data:
                Do the PDB conversion--parse the model pdb file for creating a pdb data object
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.PDBParser(PERMISSIVE=1)
        pdb1 = file_path
        pp_no = 0
        data = {}

        try:
            structure = parser.get_structure("test", pdb1)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            logging.info(f'PDBParser errored with message: {e}')
            raise
        else:
            ppb = PPBuilder()
            for pp in ppb.build_peptides(structure):
                pp_no += 1

            # logging.info(f'Getting pdb structure data for {structure}!')
            (compound, source) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            model = structure[0]
            protein_data = self._get_proteins_by_structure(
                structure, model.get_id(), file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                data = {
                    'name': structure.header.get('name', ''),
                    'num_chains': num_chains,
                    'num_residues': num_residues,
                    'num_atoms': num_atoms,
                    'compound': compound,
                    'source': source,
                    'proteins': protein_data
                }
            else:
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
                data = {}
        return data, pp_no, params
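For readers less familiar with Biopython, the parsing core of _model_file_to_data boils down to a handful of calls; a minimal standalone sketch (the PDB path is a placeholder):

# Minimal Biopython sketch mirroring the parsing steps above (path is a placeholder)
from Bio.PDB import PDBParser, PPBuilder

parser = PDBParser(PERMISSIVE=1, QUIET=True)
structure = parser.get_structure("model", "/path/to/model.pdb")

ppb = PPBuilder()
peptides = ppb.build_peptides(structure)    # one entry per continuous polypeptide
chains = list(structure.get_chains())

print(f"{len(peptides)} polypeptides across {len(chains)} chains")
for pp in peptides:
    print(pp.get_sequence())                # amino-acid sequence of each polypeptide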

    def _exp_file_to_data(self, file_path, params):
        """
            _exp_file_to_data:
                Do the PDB conversion--parse the experiment pdb file for creating a pdb data object
        """
        logging.info(
            f'Parsing pdb file {file_path} to a pdb structure with params: {params}'
        )

        parser = PDB.MMCIFParser()
        cif = file_path
        pp_no = 0
        mmcif_data = None

        try:
            structure = parser.get_structure("PHA-L", cif)
        except (RuntimeError, TypeError, KeyError, ValueError) as e:
            logging.info(f'MMCIFParser errored with message: {e}')
            raise
        else:
            ppb = PPBuilder()
            for pp in ppb.build_peptides(structure):
                pp_no += 1

            struc_name = structure.header.get('name', '')
            hd = self._upload_to_shock(file_path)

            # logging.info(f'Getting pdb structure data for {structure}!')
            (cpd, src) = self._get_compound_source(structure)
            (num_models,
             model_ids) = self._get_models_from_structure(structure)
            (num_chains,
             chain_ids) = self._get_chains_from_structure(structure)
            (num_residues,
             residue_ids) = self._get_residues_from_structure(structure)
            (num_atoms, atom_ids) = self._get_atoms_from_structure(structure)
            protein_data = self._get_proteins_by_structure(
                structure, model_ids[0], file_path)
            (protein_data, params) = self._match_features(params, protein_data)

            pdb_info = params.get('pdb_info', None)
            if pdb_info and pdb_info.get('sequence_identities', None):
                mmcif_data = {
                    'name': struc_name,
                    'head': structure.header.get('head', ''),
                    'rcsb_id': structure.header.get('rcsb_id', ''),
                    'deposition_date': structure.header.get('deposition_date', ''),
                    'release_date': structure.header.get('release_date', ''),
                    'structure_method': structure.header.get('structure_method', ''),
                    'resolution': structure.header.get('resolution', 0.0),
                    'structure_reference': structure.header.get('structure_reference', []),
                    'keywords': structure.header.get('keywords', ''),
                    'author': structure.header.get('author', ''),
                    'compound': cpd,
                    'source': src,
                    'num_models': num_models,
                    'num_chains': num_chains,
                    'num_residues': num_residues,
                    'num_atoms': num_atoms,
                    'num_het_atoms': structure.header.get('num_het_atoms', 0),
                    'num_water_atoms': structure.header.get('num_water_atoms', 0),
                    'num_disordered_atoms': structure.header.get('num_disordered_atoms', 0),
                    'num_disordered_residues': structure.header.get('num_disordered_residues', 0),
                    'pdb_handle': hd,
                    'mmcif_handle': hd,
                    'xml_handle': hd,
                    'proteins': protein_data
                }
            else:
                mmcif_data = {}
                logging.info(
                    f'Parsing pdb file {file_path} failed to match KBase genome/features!'
                )
        return mmcif_data, pp_no, params

    def _match_features(self, params, protein_data):
        """
            _match_features: match the protein_translation in feature_id with chain sequences in
                             protein_data and compute the seq_identity and determine the exact_match
            example (in appdev):
                    genome_obj = '57196/6/1', genome_name = 'Synthetic_bacterium_JCVI_Syn3.0_genome'
                    feature_id = 'JCVISYN3_0004_CDS_1', feature_type = 'CDS' OR
                    feature_id = 'JCVISYN3_0004', feature_type = 'gene'
        """
        pdb_info = params.get('pdb_info', None)
        if pdb_info:
            kb_feature_type = ''
            kb_feature_seq = ''
            genome_name = pdb_info['genome_name']
            narr_id = pdb_info['narrative_id']
            feature_id = pdb_info['feature_id']

            logging.info(
                f"Looking up feature {feature_id} in genome {genome_name}'s features"
            )
            # 1. Get the genome's features and reference
            (gn_ref, kb_genome_features) = self._get_genome_ref_features(
                narr_id, genome_name)
            if not gn_ref:
                logging.info(
                    f"Given genome {genome_name} does not exist in workspace {narr_id}!"
                )
                return protein_data, params

            pdb_info['genome_ref'] = gn_ref
            # 2. Match the genome features with the specified feature_id to obtain feature sequence
            for feat in kb_genome_features:
                if feat['id'] == feature_id:
                    logging.info(
                        f'Found genome feature match for {feature_id}')
                    kb_feature_type = self._get_feature_type(feat)
                    kb_feature_seq = feat.get('protein_translation', '')
                    break

            pdb_info['feature_type'] = kb_feature_type

            # 3. Call self._compute_sequence_identity with the feature sequence and the pdb
            # proteins' translations to get the seq_identity and exact_match
            if kb_feature_seq:
                logging.info(
                    f"Finding seq_identity and exact_match for feature {feature_id}"
                    f" in genome {genome_name}'s features...")
                pdb_chain_ids = []
                pdb_model_ids = []
                pdb_seq_idens = []
                pdb_exact_matches = []
                for prot in protein_data:
                    seq_idens, seq_mats = self._compute_sequence_identity(
                        kb_feature_seq, prot.get('sequence', ''))
                    if seq_idens:
                        seq_idens.sort()
                        max_iden = seq_idens.pop()
                        if max_iden >= self.B_IDENTITY_THRESH:  # get the good matches
                            prot['seq_identity'] = max_iden
                            prot['exact_match'] = 1 if max_iden > 0.99 else 0
                            prot['genome_ref'] = gn_ref
                            prot['feature_id'] = feature_id
                            prot['feature_type'] = kb_feature_type
                            pdb_chain_ids.append(prot['chain_id'])
                            pdb_model_ids.append(str(prot['model_id']))
                            pdb_seq_idens.append(str(prot['seq_identity']))
                            pdb_exact_matches.append(str(prot['exact_match']))

                if pdb_seq_idens:
                    pdb_info['sequence_identities'] = ','.join(pdb_seq_idens)
                if pdb_chain_ids:
                    pdb_info['chain_ids'] = ','.join(pdb_chain_ids)
                if pdb_model_ids:
                    pdb_info['model_ids'] = ','.join(pdb_model_ids)
                if pdb_exact_matches:
                    pdb_info['exact_matches'] = ','.join(pdb_exact_matches)
            else:
                logging.info(
                    f'Found NO feature in genome that matches with {feature_id}'
                )
        else:
            logging.info(
                'No KBase genome/feature object info was given for uploading')

        return protein_data, params
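To make the bookkeeping above easier to follow, here is an illustrative before/after view of the pdb_info dict that _match_features reads and extends; every value is made up, shaped after the appdev example in the docstring:

# Illustrative pdb_info passed in via params['pdb_info'] (values are made up)
pdb_info_in = {
    'narrative_id': 57196,
    'genome_name': 'Synthetic_bacterium_JCVI_Syn3.0_genome',
    'feature_id': 'JCVISYN3_0004',
}

# Keys _match_features adds when the feature and at least one chain match well enough
pdb_info_added = {
    'genome_ref': '57196/6/1',
    'feature_type': 'gene',
    'chain_ids': 'A,B',
    'model_ids': '0,0',
    'sequence_identities': '0.987,0.991',
    'exact_matches': '0,0',
}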

    def _compute_sequence_identity(self, seq1, seq2):
        """
            _compute_sequence_identity: Given two input sequences, do a blast identity check and
                                        then compute and return the matching percentage.
        """
        # Create two sequence files
        Seq1 = SeqRecord(Seq(seq1), id="query_seq")
        Seq2 = SeqRecord(Seq(seq2), id="subject_seq")

        blast_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(blast_dir)
        query_seq = os.path.join(blast_dir, 'seq_qry.fasta')
        subject_seq = os.path.join(blast_dir, 'seq_sbj.fasta')
        SeqIO.write(Seq1, query_seq, "fasta")
        SeqIO.write(Seq2, subject_seq, "fasta")

        # on my laptop: blastp_path = '/Users/qzhang/miniconda3/bin/blastp'
        blastp_path = 'blastp'
        output_file_path = os.path.join(blast_dir, 'blast_output.xml')

        # Build the BLASTp command
        blastp_cmd = [blastp_path]
        blastp_cmd.append('-out')
        blastp_cmd.append(output_file_path)
        blastp_cmd.append('-outfmt')
        blastp_cmd.append('5')
        blastp_cmd.append('-query')
        blastp_cmd.append(query_seq)
        blastp_cmd.append('-subject')
        blastp_cmd.append(subject_seq)

        # Run BLASTp and parse the output as XML and then parse the xml file for identity matches
        exact_matches = []
        idens = []
        try:
            p = subprocess.Popen(blastp_cmd,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 universal_newlines=True)
            output, errors = p.communicate()
            if not output:
                logging.info(f'BLASTp returned: {p.returncode}')
                logging.info(f'OK> output: {output}')
            if errors:
                e = subprocess.CalledProcessError(p.returncode,
                                                  blastp_cmd,
                                                  output=output)
                raise e
        except OSError as e:
            logging.info(f'OSError > {e.errno}')
            logging.info(f'OSError > {e.strerror}')
            logging.info(f'OSError > {e.filename}')
        except subprocess.CalledProcessError as e:
            logging.info(f'CalledError > {e.returncode}')
            logging.info(f'CalledError > {e.output}')
        except Exception:
            logging.info(f'Unexpected error > {sys.exc_info()[0]}')
        else:
            with open(output_file_path) as blast_fhd:
                blast_record = NCBIXML.read(blast_fhd)
                if blast_record:
                    logging.info(f'query: {blast_record.query[:100]}')
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            if hsp.expect < self.E_VALUE_THRESH:
                                logging.info('****Alignment****')
                                logging.info(f'sequence: {alignment.title}')
                                logging.info(f'length: {alignment.length}')
                                logging.info(f'e value: {hsp.expect}')
                                logging.info(f'hsp query: {hsp.query}')
                                logging.info(f'hsp match: {hsp.match}')
                                logging.info(f'hsp subject: {hsp.sbjct}')
                                logging.info(
                                    f'hsp identities: {hsp.identities}')
                                logging.info(f'hsp positives: {hsp.positives}')
                                iden = round(hsp.identities / hsp.positives, 6)
                                logging.info(f'identity={iden}')
                                idens.append(iden)
                                if hsp.positives == hsp.identities:
                                    exact_matches.append(alignment.title[:100])
        return idens, exact_matches

    def _get_genome_ref_features(self, narr_id, genome_name):
        """
            _get_genome_ref_features: Get the genome reference and features for genome_name
        """
        genome_ref = ''
        genome_features = []
        (genome_info,
         genome_data) = self._get_object_info_data(narr_id, genome_name)
        if genome_info and genome_data:
            genome_ref = '/'.join(
                [str(narr_id),
                 str(genome_info[0]),
                 str(genome_info[4])])
            genome_features = genome_data['features']

        return (genome_ref, genome_features)

    def _get_feature_type(self, feature_obj):
        """
            _get_feature_type: Get the type for the feature object of given feature_obj
        """
        feat_type = feature_obj.get('type', '')
        if not feat_type:
            if feature_obj.get('protein_translation'):
                feat_type = 'gene'
            else:
                feat_type = 'other'

        return feat_type

    def _get_object_info_data(self, narr_id, obj_name):
        """
            _get_object_info_data: Get the object info/data with given obj_name in narrative narr_id
        """
        obj_info = None
        obj_data = None
        if narr_id and obj_name:
            try:
                obj_data_res = self.ws_client.get_objects2(
                    {'objects': [{
                        'wsid': narr_id,
                        'name': obj_name
                    }]})['data'][0]
                obj_info = obj_data_res['info']
                obj_data = obj_data_res['data']
            except Exception:
                logging.info(
                    f'No object with name {obj_name} exists in workspace {narr_id},'
                    f' or an unexpected error occurred while fetching it'
                )

        return (obj_info, obj_data)

    def _get_atoms_from_structure(self, pdb_structure):
        """
            _get_atoms_from_structure: Given a pdb_structure object, parse atoms into a list of
                                        atoms and return it
        """
        atom_ids = []
        num_atoms = 0
        my_residues = pdb_structure.get_residues()
        for r_ele in my_residues:
            for a_ele in r_ele.get_atoms():
                num_atoms += 1
                atom_ids.append(a_ele.get_id())

        return (num_atoms, atom_ids)

    def _get_residues_from_structure(self, pdb_structure):
        """
            _get_residues_from_structure: Given a pdb_structure object, parse residues into a list
                                          and return it
        """
        res_ids = []
        num_res = 0
        my_res = pdb_structure.get_residues()
        for r_ele in my_res:
            if PDB.is_aa(r_ele):
                num_res += 1
                res_ids.append(r_ele.get_id())

        return (num_res, res_ids)

    def _get_chains_from_structure(self, pdb_structure):
        """
            _get_chains: Given a pdb_structure object, parse chain ids into a list and return it
        """
        chain_ids = []
        num_chains = 0
        my_chains = pdb_structure.get_chains()
        for c_ele in my_chains:
            if (c_ele):
                num_chains += 1
                chain_ids.append(c_ele.get_id())

        return (num_chains, chain_ids)

    def _get_models_from_structure(self, pdb_structure):
        """
            _get_models_from_structure: Given a pdb_structure object, parse model ids into a list
                                        and return it
        """
        model_ids = []
        num_models = 0
        my_models = pdb_structure.get_models()
        for m_ele in my_models:
            if (m_ele):
                num_models += 1
                model_ids.append(m_ele.get_id())

        return (num_models, model_ids)

    def _get_compound_source(self, structure):
        """
            _get_compound_source: Parse data from given structure for compound and source
        """
        cpd_dict = dict()
        cpd = structure.header.get('compound', {})
        # logging.info(f'Compound:\n {cpd}')
        if cpd and cpd.get('1'):
            cpd_dict = cpd.get('1')

        src_dict = dict()
        src = structure.header.get('source', {})
        # logging.info(f'Source:\n {src}')
        if src and src.get('1'):
            src_dict = src.get('1')

        return (cpd_dict, src_dict)

    def _get_proteins_by_structure(self, pdb_structure, model, file_path):
        """
            _get_proteins_by_structure: Given a pdb_structure, parse the essential protein data
        """
        ppb = PPBuilder()
        protein_data = []

        # Parse for the chain_id and chain sequence
        for c_ele in pdb_structure.get_chains():
            if (c_ele):
                c_ppd_list = []
                for c_ppd in ppb.build_peptides(c_ele):
                    c_pp_seq = str(c_ppd.get_sequence())
                    c_ppd_list.append(c_pp_seq)
                c_seq = ''.join(c_ppd_list)
                protein_data.append({
                    'id': os.path.basename(file_path),
                    'model_id': model,
                    'chain_id': c_ele.get_id(),
                    'sequence': c_seq,
                    'md5': hashlib.md5(c_seq.encode()).hexdigest()
                })

        return protein_data

    def _validate_file(self, file_path):
        """
            _validate_file: Check if file_path is accessable, if yes, return the handle
        """
        try:
            fh = open(file_path, 'r')
        except IOError as e:
            if e.errno == errno.ENOENT:  # No such file or directory
                raise ValueError(f'"{file_path}" does not exist!')
            elif e.errno == errno.EACCES:  # Permission denied
                raise ValueError(f'"{file_path}" cannot be read!')
            else:
                raise ValueError(f'"{e.strerror}" error occurred')
        else:
            fh.close()
            return True

    def _dfu_get_objects(self, obj_ref):
        """
            _dfu_get_objects: call dfu.get_objects to return object data and info
        """
        obj = self.dfu.get_objects({"object_refs": [obj_ref]})['data'][0]
        return obj['data'], obj['info']

    def _get_pdb_shock_id(self, obj_ref):
        """
            _get_pdb_shock_id: Return the shock id for the PDB file
        """
        obj_data, obj_info = self._dfu_get_objects(obj_ref)
        return self.hs.hids_to_handles([obj_data['pdb_handle']])[0]['id']

    def _upload_to_shock(self, file_path):
        """
            _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info(f'Start uploading file to shock: {file_path}')

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(
            file_to_shock_params)['handle']['hid']

        return shock_id

    def _generate_report_html(self, pdb_name, pdb_path):
        """
            _generate_report_html: generates the HTML for the upload report
        """
        html_report = list()

        # Make report directory and copy over files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)
        result_file_path = os.path.join(output_directory, 'viewer.html')
        new_pdb_path = os.path.join(output_directory,
                                    os.path.basename(pdb_path))
        shutil.copy(pdb_path, new_pdb_path)

        # Fill in template HTML
        with open(
                os.path.join(os.path.dirname(__file__), 'templates',
                             'viewer_template.html')) as report_template_file:
            report_template = report_template_file.read()\
                .replace('*PDB_NAME*', pdb_name)\
                .replace('*PDB_PATH*', os.path.basename(pdb_path))

        with open(result_file_path, 'w') as result_file:
            result_file.write(report_template)

        html_report.append({
            'path': output_directory,
            'name': os.path.basename(result_file_path),
            'description': 'HTML report for PDB upload'
        })

        return html_report

    def _generate_report(self, method_name, pdb_obj_ref, workspace_name,
                         n_poly_pep, pdb_name, pdb_path):
        """
            _generate_report: generate summary report for upload
        """
        output_html_files = self._generate_report_html(pdb_name, pdb_path)

        report_params = {
            'message': f'You uploaded a PDB file. {n_poly_pep} polypeptides detected.',
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'objects_created': [{
                'ref': pdb_obj_ref,
                'description': 'Imported PDB'
            }],
            'workspace_name': workspace_name,
            'report_object_name': method_name + '_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _validate_batch_import_pdbs_params(self, params):
        """
            _validate_batch_import_pdbs_params:
                validates params passed to batch_import_pdbs method
        """
        # check for required parameters
        for p in [
                'structures_name', 'workspace_name',
                'metadata_staging_file_path'
        ]:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

        # metadata_staging_file_path must be from the staging area--must have the staging dir prefix
        if params.get('metadata_staging_file_path', None):
            staging_file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('metadata_staging_file_path')
            }).get('copy_file_path')
            return (staging_file_path, params['workspace_name'],
                    params['structures_name'])
        else:
            error_msg = "Must supply a 'metadata_staging_file_path'"
            raise ValueError(error_msg)

    def _read_file_by_type(self, file_path):
        """
            _read_file_by_type: read the file given by file_path depending on its type,
                               return a DataFrame object
        """
        logging.info(f'Reading input from file: {file_path}...')

        if not self._validate_file(file_path):
            raise ValueError('Input file is invalid or not found!')

        df = None
        file_ext = pathlib.Path(file_path).suffix
        try:  # read the data from file_path depending on its extension
            if 'csv' in file_ext:
                df = pd.read_csv(file_path)
            elif 'tsv' in file_ext:
                df = pd.read_csv(file_path, sep='\t')
            elif 'xls' in file_ext or 'od' in file_ext:
                # handle xls, xlsx, xlsm, xlsb, odf, ods and odt file extensions
                df = pd.read_excel(file_path,
                                   index_col=None,
                                   engine='openpyxl')
            else:  # invalid file type
                error_msg = "Invalid input file type, only 'csv/tsv/xlsx' are accepted!"
                raise ValueError(error_msg)
            # strip off the leading and trailing whitespaces of the column names
            df.columns = df.columns.str.strip()
        except (RuntimeError, TypeError, KeyError, ValueError,
                WorkspaceError) as e:
            logging.info(f'Reading file {file_path} errored with message: {e}')
            raise
        return df

    def _parse_metadata_file(self, metadata_file_path, ws_id):
        """
            _parse_metadata_file:
                From metadata_file_path, a spreadsheet file, sort out the model_pdb_file_paths,
            exp_pdb_file_paths and the kbase_meta_data

            return: lists model_pdb_file_paths, exp_pdb_file_paths and dict kbase_meta_data
        """
        logging.info(
            f'parsing metadata from input file {metadata_file_path}...')

        required_columns = [
            'Narrative ID', 'Object name (Genome AMA feature set)',
            'Feature ID', 'PDB filename', 'Is model', 'From RCSB'
        ]

        pdb_file_paths = list()
        narrative_ids = list()
        genome_names = list()
        feature_ids = list()

        # df_meta_data is a Panda DataFrame object
        df_meta_data = self._read_file_by_type(metadata_file_path)
        df_col_list = df_meta_data.columns.values.tolist()

        # check if required columns are read in correctly
        for col in required_columns:
            if col not in df_col_list:
                missing_required = f"Required column '{col}' is missing!"
                raise ValueError(missing_required)

        df_indexes = df_meta_data.columns
        for i in range(len(df_meta_data[df_indexes[0]])):
            narr_id = df_meta_data[df_indexes[0]][i]
            if pd.isna(narr_id):
                missing_narr_id = "Please fill all the rows in column 'Narrative ID'!"
                raise ValueError(missing_narr_id)
            narr_id = int(narr_id)
            narrative_ids.append(narr_id)

            obj_name = df_meta_data[df_indexes[1]][i]
            if not pd.isna(obj_name):
                genome_names.append(obj_name)
            else:
                missing_obj_name = "Please fill all the rows in column 'Object name'!"
                raise ValueError(missing_obj_name)

            feat_id = df_meta_data[df_indexes[2]][i]
            if not pd.isna(feat_id):
                feature_ids.append(feat_id)
            else:
                missing_feature_id = f"Please fill all the rows in column '{required_columns[2]}'!"
                raise ValueError(missing_feature_id)

            # pdb_fn does not have the staging dir prefix
            pdb_fn = df_meta_data[df_indexes[3]][i]
            if pd.isna(pdb_fn):
                missing_pdb_file = f"Please fill all the rows in column '{required_columns[3]}'!"
                raise ValueError(missing_pdb_file)
            (struct_name, ext) = os.path.splitext(os.path.basename(pdb_fn))

            # pdb file source, defaults to 'yes'
            from_rcsb = df_meta_data[df_indexes[5]][i]
            if pd.isna(from_rcsb):
                from_rcsb = 'yes'

            is_model = df_meta_data[df_indexes[4]][i]
            if not pd.isna(is_model):
                pdb_file_paths.append({
                    'file_path': pdb_fn,
                    'structure_name': struct_name,
                    'narrative_id': narr_id,
                    'genome_name': obj_name,
                    'feature_id': feat_id,
                    'is_model': 'y' in is_model or 'Y' in is_model,
                    'from_rcsb': 'y' in from_rcsb or 'Y' in from_rcsb
                })
            else:
                missing_pdb_md = f"Please fill all the rows in column '{required_columns[4]}'!"
                raise ValueError(missing_pdb_md)

        if not pdb_file_paths:
            error_msg = "No PDB file info is provided!"
            raise ValueError(error_msg)

        return (pdb_file_paths, narrative_ids, genome_names, feature_ids)
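For orientation, a hedged sketch of a minimal metadata spreadsheet that _parse_metadata_file would accept, written as a one-row CSV with the required columns (all values fabricated):

# Fabricated example of the metadata table (column names match required_columns above)
import pandas as pd

meta_df = pd.DataFrame([{
    'Narrative ID': 57196,
    'Object name (Genome AMA feature set)': 'Synthetic_bacterium_JCVI_Syn3.0_genome',
    'Feature ID': 'JCVISYN3_0004',
    'PDB filename': 'structures/1fat.pdb',   # path relative to the staging area
    'Is model': 'yes',
    'From RCSB': 'no',
}])
meta_df.to_csv('pdb_metadata.csv', index=False)   # a file shaped like this is what gets uploaded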

    def _generate_batch_report(self, workspace_name, structs_ref, structs_name,
                               pdb_infos, failed_pdbs):
        """
            _generate_batch_report: generate summary report for upload
        """

        output_html_files = self._generate_batch_report_html(
            structs_name, pdb_infos)

        description = (
            f'Imported PDBs into a ProteinStructures object "{structs_ref}", '
            f'named "{structs_name}".')

        if failed_pdbs:
            failed_files = ','.join(failed_pdbs)
            description += f' These files "{failed_files}" failed to load.'

        report_params = {
            'message': f'You have uploaded a batch of PDB files into {structs_name}.',
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'objects_created': [{
                'ref': structs_ref,
                'description': description
            }],
            'workspace_name': workspace_name,
            'report_object_name': 'batch_import_pdb_files_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _write_pdb_htmls(self, output_dir, succ_pdb_infos):
        """
            _write_pdb_htmls: write the batch pdb info as a jQuery DataTable into HTML files
        """

        pdb_html = ''
        # parse url to get the domain portion
        srv_domain = urlparse(self.shock_url).netloc
        srv_base_url = f'https://{srv_domain}'
        logging.info(f'Get the url for building the anchors: {srv_base_url}')

        dir_name = os.path.dirname(__file__)
        molstar_html_file = os.path.join(dir_name, 'templates',
                                         'molstar_viewer.html')
        molstar_js_file = os.path.join(dir_name, 'templates', 'molstar.js')
        molstar_css_file = os.path.join(dir_name, 'templates', 'molstar.css')
        shutil.copy(molstar_html_file,
                    os.path.join(output_dir, 'molstar_viewer.html'))
        shutil.copy(molstar_js_file, os.path.join(output_dir, 'molstar.js'))
        shutil.copy(molstar_css_file, os.path.join(output_dir, 'molstar.css'))

        for succ_pdb in succ_pdb_infos:
            row_html = '<tr>'
            file_path = succ_pdb['file_path']
            # scratch path for this pdb file
            pdb_file_path = succ_pdb['scratch_path']
            new_pdb_path = os.path.join(output_dir,
                                        os.path.basename(file_path))
            shutil.copy(pdb_file_path, new_pdb_path)

            struct_nm = succ_pdb['structure_name'].upper()
            genome_name = succ_pdb['genome_name']
            genome_ref = succ_pdb['genome_ref']
            feat_id = succ_pdb['feature_id']
            feat_type = succ_pdb['feature_type']
            src_rcsb = succ_pdb['from_rcsb']

            pdb_chains = []
            pdb_models = []
            seq_idens = []
            if succ_pdb.get('chain_ids', None):
                pdb_chains = succ_pdb['chain_ids'].split(',')
            if succ_pdb.get('model_ids', None):
                pdb_models = succ_pdb['model_ids'].split(',')
            if succ_pdb.get('sequence_identities', None):
                seq_idens = succ_pdb['sequence_identities'].split(',')

            if src_rcsb:
                row_html += (
                    f'<td>{struct_nm}<a href="https://www.rcsb.org/3d-view/{struct_nm}"'
                    f' target="_blank"> RCSB Structure</a></td>')
            else:
                row_html += (f'<td>{struct_nm}<a href="./molstar_viewer.html"'
                             f' target="_blank"> MolStar Viewer</a></td>')

            row_html += (f'<td><a href="{srv_base_url}/#dataview/{genome_ref}"'
                         f' target="_blank">{genome_name}</a></td>'
                         f'<td>{feat_id}</td><td>{feat_type}</td>')
            row_html += f'<td>{pdb_models}</td>'
            row_html += f'<td>{pdb_chains}</td>'
            row_html += f'<td>{seq_idens}</td>'
            row_html += '</tr>'
            pdb_html += row_html
        return pdb_html

    def _generate_batch_report_html(self, prot_structs_name, succ_pdb_infos):
        """
            _generate_batch_report_html: generates the HTML for the upload report
        """
        html_report = list()

        # Make report directory and copy over uploaded pdb files
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(output_directory)

        # Create the template html file for reporting batch-uploaded pdb files
        batch_html_report_path = os.path.join(output_directory,
                                              'batch_pdb_viewer.html')

        pdb_html = self._write_pdb_htmls(output_directory, succ_pdb_infos)

        # Fetch & fill in detailed info into template HTML
        with open(
                os.path.join(
                    os.path.dirname(__file__), 'templates',
                    'batch_pdb_template.html')) as batch_template_html:
            batch_html_report = batch_template_html.read()\
                .replace('<!--replace this content-->', pdb_html)

        with open(batch_html_report_path, 'w') as html_report_file:
            html_report_file.write(batch_html_report)
        print(
            f'Full batch_html_report has been written to {batch_html_report_path}'
        )

        html_report.append({
            'path': output_directory,
            'name': os.path.basename(batch_html_report_path),
            'description': 'HTML report for PDB upload'
        })

        return html_report

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.user_id = config['USER_ID']
        self.dfu = DataFileUtil(self.callback_url)
        self.hs = AbstractHandle(config['handle-service-url'])
        self.ws_client = Workspace(config['workspace-url'])
        self.shock_url = config['shock-url']

    def import_model_pdb_file(self, params, create_report=True):
        """
            import_model_pdb_file: upload an experiment pdb file and convert into a
                                  KBaseStructure.ModelProteinStructure object
        """
        logging.info(
            f'import_model_pdb_file to a pdb data structure with params: {params}'
        )

        # file_path is the pdb file's working area path (after dfu.download_staging_file call)
        file_path, workspace_name, pdb_name = self._validate_import_pdb_file_params(
            params)

        (data, n_polypeptides,
         params) = self._model_file_to_data(file_path, params)
        if not data:
            logging.info(
                f'PDB file {file_path} import with "Import ModelProteinStructure" failed!'
            )
            return {}, {}

        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        pdb_info = params.get('pdb_info', None)
        if pdb_info:
            pdb_info['scratch_path'] = file_path
        logging.info(f'Model structure data:{data}')
        return data, pdb_info

    def import_experiment_pdb_file(self, params, create_report=True):
        """
            import_experiment_pdb_file: upload an experiment pdb file and convert into a
                                       KBaseStructure.ExperimentalProteinStructure object
        """
        logging.info(
            f'import_experiment_pdb_file to a pdb structure with params: {params}'
        )

        # file_path is the pdb file's working area path (after dfu.download_staging_file call)
        file_path, workspace_name, mmcif_name = self._validate_import_pdb_file_params(
            params)

        # Parse the experimental pdb file for an experimental data structure
        (data, n_polypeptides,
         params) = self._exp_file_to_data(file_path, params)
        if not data:
            logging.info(
                f'Import {file_path} with "Import ExperimentalProteinStructure" failed!'
            )
            return {}, {}

        data['pdb_handle'] = self._upload_to_shock(file_path)
        data['user_data'] = params.get('description', '')
        pdb_info = params.get('pdb_info', None)
        if pdb_info:
            pdb_info['scratch_path'] = file_path
        logging.info(data)
        return data, pdb_info

    def _export_pdb(self, params):
        """
            _export_pdb: return the shock_id of the uploaded pdb object
        """
        if "input_ref" not in params:
            raise ValueError("'input_ref' not in supplied params")

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def _structure_to_pdb_file(self, params):
        """
            _structure_to_pdb_file: get the file path for the given pdb object
        """
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        shock_id = self._get_pdb_shock_id(params['input_ref'])
        file_path = self.dfu.shock_to_file({
            'shock_id': shock_id,
            'file_path': params['destination_dir'],
            'unpack': 'uncompress'
        })['file_path']

        return {'file_path': file_path}

    def export_pdb_structures(self, params):
        """
            export_pdb_structures: return the shock_ids of the ProteinStructures object
        """
        if 'input_ref' not in params:
            raise ValueError("'input_ref' not in supplied params")

        model_pdbs = []
        exp_pdbs = []
        # shock_ids = []
        for m_pdb in model_pdbs:
            pass
        for e_pdb in exp_pdbs:
            pass

        return {'shock_id': self._get_pdb_shock_id(params['input_ref'])}

    def batch_import_pdbs(self, params):
        """
            batch_import_pdbs: upload two sets of pdb files and create a
                                   KBaseStructure.ProteinStructures object
            required params:
                metadata_staging_file_path: a metafile from the user's staging area that must be a
                    subdirectory file path in staging area,
                    e.g., /data/bulk/user_name/metadata_staging_file_path
                          staging_file_subdir_path is metadata_staging_file_path
                structures_name: name of the ProteinStructures object to be generated
                workspace_name: workspace name that the protein structure(s) will be saved to
            return:
                structures_ref: return ProteinStructures object reference
                report_name: name of generated report (if any)
                report_ref: report reference (if any)

            1. call _validate_batch_import_pdbs_params to validate input params
            2. call _parse_metadata_file to parse for model_pdb_files, exp_pdb_files and kbase_meta_data
            3. call import_model_pdb_file on each entry in model_pdb_paths, and
               call import_experiment_pdb_file on each entry in exp_pdb_paths
            4. assemble the data for a ProteinStructures and save the data object
            5. call _generate_batch_report to generate a report for batch_import_pdbs' result
        """

        (metadata_file_path, workspace_name,
         structures_name) = self._validate_batch_import_pdbs_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name
        params['workspace_id'] = workspace_id

        (pdb_file_paths, narrative_ids, genome_names,
         feature_ids) = self._parse_metadata_file(metadata_file_path,
                                                  workspace_id)

        model_pdb_objects = list()
        exp_pdb_objects = list()
        pdb_infos = list()
        successful_files = list()
        failed_files = list()
        protein_structures = dict()
        total_structures = 0

        pdb_params = {}
        # loop through the list of pdb_file_paths
        for pdb in pdb_file_paths:
            pdb_params['pdb_info'] = pdb
            pdb_params['input_staging_file_path'] = pdb['file_path']
            pdb_params['input_file_path'] = None
            pdb_params['input_shock_id'] = None
            pdb_params['workspace_name'] = workspace_name
            pdb_params['structure_name'] = pdb['structure_name']

            if pdb['is_model']:
                model_pdb_data, pdb_info = self.import_model_pdb_file(
                    pdb_params, False)
                if model_pdb_data:
                    model_pdb_objects.append(model_pdb_data)
                    pdb_infos.append(pdb_info)
                    successful_files.append(pdb['file_path'])
                    total_structures += 1
                else:
                    failed_files.append(pdb['file_path'])
            else:
                exp_pdb_data, pdb_info = self.import_experiment_pdb_file(
                    pdb_params, False)
                if exp_pdb_data:
                    exp_pdb_objects.append(exp_pdb_data)
                    pdb_infos.append(pdb_info)
                    successful_files.append(pdb['file_path'])
                    total_structures += 1
                else:
                    failed_files.append(pdb['file_path'])

        if not model_pdb_objects:
            logging.info("No model pdb structure was created/saved!")
            return {}

        protein_structures['model_structures'] = model_pdb_objects
        protein_structures['experimental_structures'] = exp_pdb_objects
        protein_structures['total_structures'] = total_structures
        protein_structures['description'] = (
            f'Created {total_structures} '
            f'structures in {structures_name}')
        logging.info(
            f'ProteinStructures data structure to be saved:\n{protein_structures}'
        )
        returnVal = {}
        try:
            info = self.dfu.save_objects({
                'id': workspace_id,
                'objects': [{
                    'type': 'KBaseStructure.ProteinStructures',
                    'name': structures_name,
                    'data': protein_structures
                }]
            })[0]
        except (RuntimeError, TypeError, KeyError, ValueError,
                WorkspaceError) as e:
            err_msg = f'DFU.save_objects errored with message: {e}'
            logging.info(err_msg)
            raise ValueError(err_msg)
        else:
            structs_ref = f"{info[6]}/{info[0]}/{info[4]}"
            returnVal = {'structures_ref': structs_ref}
            report_output = self._generate_batch_report(
                workspace_name, structs_ref, structures_name, pdb_infos,
                failed_files)
            returnVal.update(report_output)
        return returnVal
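A hedged sketch of calling batch_import_pdbs on the utility above; the config values and the metadata path are placeholders:

# Hypothetical driver for batch_import_pdbs (all values are placeholders)
pdb_util = PDBUtil({
    'SDK_CALLBACK_URL': 'https://example/callback',
    'scratch': '/kb/module/work/tmp',
    'KB_AUTH_TOKEN': 'example-token',
    'USER_ID': 'some_user',
    'handle-service-url': 'https://example/handle_service',
    'workspace-url': 'https://example/ws',
    'shock-url': 'https://example/shock',
})
result = pdb_util.batch_import_pdbs({
    'metadata_staging_file_path': 'pdb_metadata.csv',   # relative to the staging area
    'structures_name': 'my_protein_structures',
    'workspace_name': 'my_workspace',
})
# result -> {'structures_ref': ..., 'report_name': ..., 'report_ref': ...}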
Code example #17
class poolfileuploadUtil:
    def __init__(self, params):
        self.params = params
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)
        self.data_folder = os.path.abspath("/kb/module/data/")
        # This is where files from staging area exist
        self.staging_folder = os.path.abspath("/staging/")
        self.shared_folder = params["shared_folder"]
        self.scratch_folder = os.path.join(params["shared_folder"], "scratch")

    def upload_poolfile(self):
        """
        The upload method

        We perform a number of steps:
        Get name of poolfile as it is in staging.
        Find the poolfile in /staging/poolfile_name
        Get the output name for the poolfile
        Get the column headers for the pool file for
            data and testing purposes. Should be len 12.
        Test if poolfile is well-formed.
        We send the file to shock using dfu.
        We get the handle and save the object with all
            the necessary information- including related genome.

        """
        print("params: ", self.params)
        self.validate_import_poolfile_from_staging_params()

        # Name of file in staging:
        staging_pool_fp_name = self.params["staging_file_name"]

        # Output name of pool file:
        poolfile_name = self.params["output_name"]

        print("poolfile_name: ", poolfile_name)
        print("top dir /:", os.listdir("/"))
        print("/kb/module/:", os.listdir("/kb/module"))
        if not os.path.exists(self.staging_folder):
            raise Exception("Staging dir does not exist yet!")
        else:
            print("Successfully recognized staging directory")
        # This is the path to the pool file
        poolfile_fp = os.path.join(self.staging_folder, staging_pool_fp_name)
        # We check correctness of pool file
        column_header_list, num_lines = self.check_pool_file(poolfile_fp)
        if len(column_header_list) != 12:
            print(
                "WARNING: Number of columns is not 12 as expected: {}".format(
                    len(column_header_list)
                )
            )
        # We copy the file from staging to scratch
        new_pool_fp = os.path.join(self.shared_folder, poolfile_name)
        shutil.copyfile(poolfile_fp, new_pool_fp)
        poolfile_fp = new_pool_fp
        # We create the handle for the object:
        file_to_shock_result = self.dfu.file_to_shock(
            {"file_path": poolfile_fp, "make_handle": True, "pack": "gzip"}
        )
        # The variable res_handle is created only to simplify the code below
        res_handle = file_to_shock_result["handle"]

        # We record the upload time (UTC); a richer description including
        # the username could be built as in the commented-out line below
        date_time = datetime.datetime.utcnow()
        #new_desc = "Uploaded by {} on (UTC) {} using Uploader. User Desc: ".format(
        #        self.params['username'], str(date_time))
        fastq_refs = []

        # We create the data for the object
        pool_data = {
            "file_type": "KBasePoolTSV.PoolFile",
            "poolfile": res_handle["hid"],
            # below should be shock
            "handle_type": res_handle["type"],
            "shock_url": res_handle["url"],
            "shock_node_id": res_handle["id"],
            "compression_type": "gzip",
            "column_header_list": column_header_list,
            "num_lines": str(num_lines),
            "fastqs_used": fastq_refs,
            "file_name": res_handle["file_name"],
            "utc_created": str(date_time),
            "related_genome_ref": self.params["genome_ref"],
            "related_organism_scientific_name": self.get_genome_organism_name(
                self.params["genome_ref"]
            ),
            "description": "Manual Upload: " + self.params["description"],
        }

        # To get workspace id:
        ws_id = self.params["workspace_id"]
        save_object_params = {
            "id": ws_id,
            "objects": [
                {
                    "type": "KBasePoolTSV.PoolFile",
                    "data": pool_data,
                    "name": poolfile_name,
                }
            ],
        }
        # save_objects returns a list of object_infos
        dfu_object_info = self.dfu.save_objects(save_object_params)[0]
        print("dfu_object_info: ")
        print(dfu_object_info)
        return {
            "Name": dfu_object_info[1],
            "Type": dfu_object_info[2],
            "Date": dfu_object_info[3],
        }

    def validate_import_poolfile_from_staging_params(self):
        # check for required parameters
        for p in [
            "username",
            "staging_file_name",
            "genome_ref",
            "description",
            "output_name",
            "workspace_id",
            "ws_obj",
        ]:
            if p not in self.params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def check_pool_file(self, poolfile_fp):
        """
        We check the pool file by initializing into dict format

        The function "init_pool_dict" runs the tests to see if the file is
        correct.
        """
        col_header_list = []
        # Parse pool file and check for errors
        test_vars_dict = {"poolfile": poolfile_fp, "report_dict": {"warnings": []}}
        try:
            col_header_list, num_lines = self.init_pool_dict(test_vars_dict)
        except Exception:
            logging.warning(
                "Pool file seems to have errors - please check and re-upload."
            )
            raise
        return [col_header_list, num_lines]

    def init_pool_dict(self, vars_dict):

        # pool dict is rcbarcode to [barcode, scaffold, strand, pos]
        pool = {}
        num_lines = 0
        with open(vars_dict["poolfile"], "r") as f:
            header_str = f.readline()
            if header_str == '':
                raise Exception("Issue with pool file - first line empty")
            num_lines += 1
            column_header_list = [x.strip() for x in header_str.split("\t")]
            crnt_line = f.readline() 
            while crnt_line != '':
                num_lines += 1
                crnt_line = crnt_line.rstrip()
                pool = self.check_pool_line_and_add_to_pool_dict(
                    crnt_line, pool, vars_dict
                )
                crnt_line = f.readline()
        if len(pool.keys()) == 0:
            raise Exception("No entries in pool file")
        return [column_header_list, num_lines]

    def check_pool_line_and_add_to_pool_dict(self, pool_line, pool, vars_dict):
        """
        For a pool line to be correct it has to follow a few rules.

        We care about the first 7 columns of each pool line.
        The first line in the file is the headers, and the first 7 are
        barcode, rcbarcode, nTot, n, scaffold, strand, pos
        Both the barcodes and rcbarcodes must be entirely made up of
        characters from "ACTG". Position must be made up of any number
        of digits (including 0). Strand is from "+","-","".
        If the rcbarcode already exists in the pool, then there is a
        problem with the pool file. Each rcbarcode must be unique.
        """
        # We get first 7 columns of pool_line (out of 12)
        split_pool_line = pool_line.split("\t")[:7]
        # We remove spaces (str.replace returns a new string, so reassign):
        split_pool_line = [x.replace(" ", "") for x in split_pool_line]
        if len(split_pool_line) >= 7:
            # We unpack
            (
                barcode,
                rcbarcode,
                undef_1,
                undef_2,
                scaffold,
                strand,
                pos,
            ) = split_pool_line
        else:
            warning_text = "pool file line with less than 7 tabs:\n{}".format(pool_line)
            vars_dict["report_dict"]["warnings"].append(warning_text)
            logging.warning(warning_text)
            barcode = "barcode"

        if barcode == "barcode":
            # Header line
            pass
        else:
            if not re.search(r"^[ACGT]+$", barcode):
                logging.debug(len(barcode))
                raise Exception("Invalid barcode: |{}|".format(barcode))
            if not re.search(r"^[ACGT]+$", rcbarcode):
                raise Exception("Invalid rcbarcode: |{}|".format(rcbarcode))
            if not (pos == "" or re.search(r"^\d+$", pos)):
                raise Exception("Invalid position: |{}|".format(pos))
            if not (strand == "+" or strand == "-" or strand == ""):
                raise Exception("Invalid strand: |{}|".format(strand))
            if rcbarcode in pool:
                raise Exception("Duplicate rcbarcode.")
            pool[rcbarcode] = [barcode, scaffold, strand, pos]
        return pool

    def get_genome_organism_name(self, genome_ref):
        # Getting the organism name using WorkspaceClient
        ws = self.params['ws_obj'] 
        res = ws.get_objects2(
            {
                "objects": [
                    {
                        "ref": genome_ref,
                        "included": ["scientific_name"],
                    }
                ]
            }
        )
        scientific_name = res["data"][0]["data"]["scientific_name"]
        return scientific_name
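The docstring of upload_poolfile above outlines the upload protocol: locate the file in /staging/, validate it, copy it to scratch, push it to Shock via DataFileUtil.file_to_shock, and save a KBasePoolTSV.PoolFile object. A minimal, hypothetical driver is sketched below; all parameter values are placeholders, and ws_client is assumed to be an already-constructed Workspace client.

# Sketch only: placeholder values, assuming execution inside a KBase SDK job
# where /staging/ is mounted and ws_client is a Workspace client.
params = {
    "username": "some_user",
    "staging_file_name": "example.pool",      # name of the file in /staging/
    "genome_ref": "123/4/5",                   # ref of the related genome object
    "description": "example pool file upload",
    "output_name": "example_pool_object",
    "workspace_id": 123,
    "ws_obj": ws_client,                       # used by get_genome_organism_name
    "shared_folder": "/kb/module/work/tmp",    # scratch directory from the SDK
}

uploader = poolfileuploadUtil(params)
result = uploader.upload_poolfile()
print(result)  # {'Name': ..., 'Type': ..., 'Date': ...}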
Code example #18
class StrainInfo:
    def __init__(self, config):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.sampleservice_util = SampleServiceUtil(config)

    def _sampleset_to_strain_info(self, sample_set_ref, vcf_strain_ids):
        '''
        :param sample_set_ref: workspace reference to the sample set object
        :param vcf_strain_ids: strain ids taken from the VCF file
        :return: list of strain info dicts, in the same order as vcf_strain_ids
        '''
        sample_set = self.dfu.get_objects({"object_refs": [sample_set_ref]
                                           })['data'][0]['data']
        samples = sample_set['samples']
        sample_dict = {}
        for sample in samples:
            name = sample['name']
            sample_dict[name] = {
                "name": name,
                "sample_id": sample['id'],
                "version": sample['version']
            }
        strain_info = []
        missing_strains = []
        duplicated_strains = []
        seen_strain = {}
        for strain in vcf_strain_ids:
            if strain in seen_strain:
                duplicated_strains.append(strain)
            else:
                seen_strain[strain] = 1

            if strain not in sample_dict:
                missing_strains.append(strain)
            else:
                strain_info.append(sample_dict[strain])

        dup_strains = ", ".join(duplicated_strains)
        if duplicated_strains:
            raise ValueError(
                f'duplicated strain ids need to be fixed in vcf file - {dup_strains}'
            )
        if missing_strains:
            strains_not_found = ", ".join(missing_strains)
            raise ValueError(
                f'Missing strains from sample set {strains_not_found}')

        return strain_info

    def _sample_set_to_attribute_mapping(self, axis_ids, sample_set_ref,
                                         obj_name, ws_id):
        am_data = self.sampleservice_util.sample_set_to_attribute_mapping(
            sample_set_ref)
        unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
        if unmatched_ids:
            name = "Column"
            raise ValueError(
                f"The following {name} IDs from the uploaded matrix do not match "
                f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                f"\nPlease verify the input data or upload an excel file with a"
                f"{name} mapping tab.")

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": am_data,
                "name": obj_name
            }]
        })[0]
        sample_attribute_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(
            info[4])
        return (sample_attribute_ref)

    def sample_strain_info(self, params):
        vcf_strain_ids = params["vcf_strain_ids"]
        sample_set_ref = params["sample_set_ref"]
        ws_id = params["ws_id"]
        obj_name = params["sample_attribute_name"]

        sample_attribute_ref = self._sample_set_to_attribute_mapping(
            vcf_strain_ids, sample_set_ref, obj_name, ws_id)
        strains = self._sampleset_to_strain_info(sample_set_ref,
                                                 vcf_strain_ids)
        return (sample_attribute_ref, strains)
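The matching performed by _sampleset_to_strain_info does not depend on the KBase clients, so it can be illustrated with plain dictionaries. The sketch below mirrors that logic under the assumption that each sample is a dict with 'name', 'id' and 'version' keys, as in the sample set data read above; the function name is ours and not part of the module.

def match_strains_to_samples(vcf_strain_ids, samples):
    """Return strain info dicts in the same order as vcf_strain_ids.

    `samples` is a list of dicts with 'name', 'id' and 'version' keys,
    mirroring the sample_set['samples'] entries used above.
    """
    sample_dict = {
        s['name']: {'name': s['name'], 'sample_id': s['id'], 'version': s['version']}
        for s in samples
    }
    seen, duplicated, missing, strain_info = set(), [], [], []
    for strain in vcf_strain_ids:
        if strain in seen:
            duplicated.append(strain)
        seen.add(strain)
        if strain not in sample_dict:
            missing.append(strain)
        else:
            strain_info.append(sample_dict[strain])
    if duplicated:
        raise ValueError('duplicated strain ids need to be fixed in vcf file - '
                         + ', '.join(duplicated))
    if missing:
        raise ValueError('Missing strains from sample set ' + ', '.join(missing))
    return strain_info


# Example:
#   samples = [{'name': 'S1', 'id': 'abc', 'version': 1}]
#   match_strains_to_samples(['S1'], samples)
#   -> [{'name': 'S1', 'sample_id': 'abc', 'version': 1}]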
Code example #19
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.3.6"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "75ef2c24694c056dfca71859d6f344ccff7d4725"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file

        How we compute these stats:

        For each segment (line) in SAM/BAM file:
            we take the first element as `reads_id`
                    the second element as `flag`

            if the last bit (0x1) of flag is `1`:
                we treat this segment as paired end reads
            otherwise:
                we treat this segment as single end reads

            For single end reads:
                if the 3rd last bit (0x4) of flag is `1`:
                    we increment unmapped_reads_count
                else:
                    we treat this `reads_id` as mapped

                for all mapped `reads_ids`:
                    if it appears only once:
                        we treat this `reads_id` as `singletons`
                    else:
                        we treat this `reads_id` as `multiple_alignments`

                lastly, total_reads = unmapped_reads_count + the number of distinct mapped `reads_id`s

            For paired end reads:
                if the 7th last bit (0x40) of flag is `1`:
                    if the 3rd last bit (0x4) of flag is `1`:
                        we increment unmapped_left_reads_count
                    else:
                        we treat this `reads_id` as mapped

                if the 8th last bit (0x80) of flag is `1`:
                    if the 3rd last bit (0x4) of flag is `1`:
                        we increment unmapped_right_reads_count
                    else:
                        we treat this `reads_id` as mapped

                for all mapped `reads_ids`:
                    if it appears only once:
                        we treat this `reads_id` as `singletons`
                    else:
                        we treat this `reads_id` as `multiple_alignments`

                lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + the number of distinct mapped `reads_id`s
        """
        path, file = os.path.split(bam_file)

        self.__LOGGER.info('Start to generate aligner stats')
        start_time = time.time()

        infile = pysam.AlignmentFile(bam_file, 'r')

        properly_paired = 0
        unmapped_reads_count = 0
        unmapped_left_reads_count = 0
        unmapped_right_reads_count = 0
        mapped_reads_ids = []
        mapped_left_reads_ids = []
        mapped_right_reads_ids = []
        paired = False
        for alignment in infile:
            seg = alignment.to_string().split('\t')
            reads_id = seg[0]
            flag = "0000000" + "{0:b}".format(int(seg[1]))

            if flag[-1] == '1':
                paired = True

            if paired:  # process paired end sequence

                if flag[-7] == '1':  # first sequence of a pair
                    if flag[-3] == '1':
                        unmapped_left_reads_count += 1
                    else:
                        mapped_left_reads_ids.append(reads_id)

                if flag[-8] == '1':  # second sequence of a pair
                    if flag[-3] == '1':
                        unmapped_right_reads_count += 1
                    else:
                        mapped_right_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1
            else:  # process single end sequence
                if flag[-3] == '1':
                    unmapped_reads_count += 1
                else:
                    mapped_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1

        infile.close()

        if paired:
            mapped_reads_ids = mapped_left_reads_ids + mapped_right_reads_ids
            unmapped_reads_count = unmapped_left_reads_count + unmapped_right_reads_count

            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons

            total_reads = unmapped_reads_count + mapped_reads_count

            properly_paired = properly_paired // 2

        else:
            mapped_reads_ids_counter = Counter(mapped_reads_ids)
            mapped_reads_count = len(list(mapped_reads_ids_counter))

            singletons = list(mapped_reads_ids_counter.values()).count(1)
            multiple_alignments = mapped_reads_count - singletons

            total_reads = unmapped_reads_count + mapped_reads_count

        try:
            alignment_rate = round(
                float(mapped_reads_count) / total_reads * 100, 3)
        except ZeroDivisionError:
            alignment_rate = 0

        if alignment_rate > 100:
            alignment_rate = 100.0

        elapsed_time = time.time() - start_time
        self.__LOGGER.info('Used: {}'.format(
            time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

        stats_data = {
            "alignment_rate": alignment_rate,
            "mapped_reads": mapped_reads_count,
            "multiple_alignments": multiple_alignments,
            "properly_paired": properly_paired,
            "singletons": singletons,
            "total_reads": total_reads,
            "unmapped_reads": unmapped_reads_count
        }
        return stats_data

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - \
                                       %(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment
                How we compute BAM stats:
                For each segment (line) in SAM/BAM file:
                    we take the first element as `reads_id`
                            the second element as `flag`
                    if the last bit (0x1) of flag is `1`:
                        we treat this segment as paired end reads
                    otherwise:
                        we treat this segment as single end reads
                    For single end reads:
                        if the 3rd last bit (0x4) of flag is `1`:
                            we increment unmapped_reads_count
                        else:
                            we treat this `reads_id` as mapped
                        for all mapped `reads_ids`:
                            if it appears only once:
                                we treat this `reads_id` as `singletons`
                            else:
                                we treat this `reads_id` as `multiple_alignments`
                        lastly, total_reads = unmapped_reads_count + the number of distinct mapped `reads_id`s
                    For paired end reads:
                        if the 7th last bit (0x40) of flag is `1`:
                            if the 3rd last bit (0x4) of flag is `1`:
                                we increment unmapped_left_reads_count
                            else:
                                we treat this `reads_id` as mapped
                        if the 8th last bit (0x80) of flag is `1`:
                            if the 3rd last bit (0x4) of flag is `1`:
                                we increment unmapped_right_reads_count
                            else:
                                we treat this `reads_id` as mapped
                        for all mapped `reads_ids`:
                            if it appears only once:
                                we treat this `reads_id` as `singletons`
                            else:
                                we treat this `reads_id` as `multiple_alignments`
                        lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + the number of distinct mapped `reads_id`s
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": "KBaseRNASeq.RNASeqAlignment",
                "data": aligner_data,
                "name": obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id': alignment[0]['data']['file']['id'],
            'file_path': output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_BAI, False):
                bai_file = file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_SAM, False):
                sam_file = file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref -
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
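_get_aligner_stats above inspects SAM flag bits by zero-padding a binary string and slicing it from the right: flag[-1] is 0x1 (read is paired), flag[-2] is 0x2 (properly paired), flag[-3] is 0x4 (segment unmapped), flag[-7] is 0x40 (first of pair) and flag[-8] is 0x80 (second of pair). The same tests can be written with bitwise operators; the sketch below is only an illustration and not part of ReadsAlignmentUtils. pysam's AlignedSegment also exposes these bits directly as is_paired, is_proper_pair, is_unmapped, is_read1 and is_read2.

def describe_flag(flag):
    """Decode the SAM flag bits tested in _get_aligner_stats, using
    bitwise tests instead of binary-string slicing."""
    return {
        "paired": bool(flag & 0x1),           # flag[-1] above
        "properly_paired": bool(flag & 0x2),  # flag[-2] above
        "unmapped": bool(flag & 0x4),         # flag[-3] above
        "first_of_pair": bool(flag & 0x40),   # flag[-7] above
        "second_of_pair": bool(flag & 0x80),  # flag[-8] above
    }


# Example: flag 99 (0x63) marks a mapped, properly paired, first-of-pair read.
#   describe_flag(99) ->
#   {'paired': True, 'properly_paired': True, 'unmapped': False,
#    'first_of_pair': True, 'second_of_pair': False}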
Code example #20
class poolcountfileuploadUtil:
    def __init__(self, params):
        self.params = params
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)
        self.data_folder = os.path.abspath("/kb/module/data/")
        # This is where files from staging area exist
        self.staging_folder = os.path.abspath("/staging/")
        self.shared_folder = params["shared_folder"]
        self.scratch_folder = os.path.join(params["shared_folder"], "scratch")

    def upload_poolcountfile(self):
        """
        The upload method

        We perform a number of steps:
        Get name of poolcount file as it is in staging.
        Find the poolcount file in /staging/poolcount_name
        Get the output name for the poolcount file
        Get the column headers for the pool count file for
            data and testing purposes. 
        Test if poolcount file is well-formed.
        NOTE: We use output_name as set_name - it is important that
            these remain equivalent.
        We send the file to shock using dfu.
        We get the handle and save the object with all
            the necessary information - including the related genome.
        params should include:
            output_name,
            staging_file_name,
            username,
            genome_ref,
            description,
            ws_obj,
            workspace_id
        """
        print("params: ", self.params)
        self.validate_import_file_from_staging_params()

        # Name of file in staging (Not path):
        staging_fp_name = self.params["staging_file_name"]

        # Output name of poolcount file:
        poolcount_name = self.params["output_name"]

        print("Output pool count name: ", poolcount_name)
        if not os.path.exists(self.staging_folder):
            raise Exception("Staging dir does not exist yet!")
        else:
            print("Succesfully recognized staging directory")

        # This is the path to the pool file in staging
        poolcount_fp = os.path.join(self.staging_folder, staging_fp_name)
        # We check correctness of pool file in staging
        column_header_list, num_lines = self.check_poolcount_file(poolcount_fp)

        # We copy the file from staging to scratch
        new_pc_fp = os.path.join(self.shared_folder, poolcount_name)
        shutil.copyfile(poolcount_fp, new_pc_fp)
        #poolcount_scratch_fp is location of pool file in scratch
        poolcount_scratch_fp = new_pc_fp

        # We create the KBase handle for the object:
        file_to_shock_result = self.dfu.file_to_shock({
            "file_path": poolcount_scratch_fp,
            "make_handle": True,
            "pack": "gzip"
        })
        # The variable res_handle is created only to simplify the code below
        res_handle = file_to_shock_result["handle"]

        # Keep track of our own datetime
        date_time = datetime.datetime.utcnow()
        #new_desc = "Uploaded by {} on (UTC) {} using Uploader. User Desc: ".format(
        #        self.params['username'], str(date_time))
        fastq_refs = []

        # We create the data for the object
        poolcount_data = {
            "file_type": "KBasePoolTSV.PoolCount",
            "poolcount": res_handle["hid"],
            # below should be shock
            "handle_type": res_handle["type"],
            "shock_url": res_handle["url"],
            "shock_node_id": res_handle["id"],
            "compression_type": "gzip",
            "column_header_list": column_header_list,
            "fastqs_used": fastq_refs,
            "file_name": res_handle["file_name"],
            "utc_created": str(date_time),
            "set_name": self.params['output_name'],
            "num_lines": str(num_lines),
            "related_genome_ref": self.params["genome_ref"],
            "related_organism_scientific_name":
                self.get_genome_organism_name(self.params["genome_ref"]),
            "description": "Manual Upload: " + self.params["description"],
        }

        # To get workspace id:
        ws_id = self.params["workspace_id"]
        save_object_params = {
            "id": ws_id,
            "objects": [{
                "type": "KBasePoolTSV.PoolCount",
                "data": poolcount_data,
                "name": self.params['output_name'],
            }],
        }
        # save_objects returns a list of object_infos
        dfu_object_info = self.dfu.save_objects(save_object_params)[0]
        print("dfu_object_info: ")
        print(dfu_object_info)
        return {
            "Name": dfu_object_info[1],
            "Type": dfu_object_info[2],
            "Date": dfu_object_info[3],
        }

    def check_poolcount_file(self, poolcount_fp):
        """
        We check the pool file by initializing into dict format
   
        Currently a weak test- should add more testing capabilities.
        """
        # Expected fields
        exp_f = "barcode rcbarcode scaffold strand pos".split(" ")

        with open(poolcount_fp, "r") as f:
            f_str = f.read()
        f_list = f_str.split('\n')
        num_lines = len(f_list)
        header_line = f_list[0]

        # Dropping f_str from memory
        f_str = None

        if header_line == '':
            raise Exception("File format incorrect: " + poolcount_fp)

        fields = header_line.split("\t")

        if not (len(fields) >= 6):
            raise Exception("Too few fields in " + poolcount_fp)
        for i in range(len(exp_f)):
            if not fields[i] == exp_f[i]:
                raise Exception("Expected {} but field is {}".format(
                    exp_f[i], fields[i]))
        return [fields, num_lines]

    def validate_import_file_from_staging_params(self):
        # check for required parameters
        for p in [
                "username", "staging_file_name", "genome_ref", "description",
                "output_name", "ws_obj", "workspace_id"
        ]:
            if p not in self.params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def get_genome_organism_name(self, genome_ref):
        # Getting the organism name using WorkspaceClient
        ws = self.params['ws_obj']
        res = ws.get_objects2({
            "objects": [{
                "ref": genome_ref,
                "included": ["scientific_name"],
            }]
        })
        scientific_name = res["data"][0]["data"]["scientific_name"]
        return scientific_name
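check_poolcount_file above only verifies that the first five column names are barcode, rcbarcode, scaffold, strand, pos and that at least six tab-separated fields are present. That header check can be expressed as a small standalone sketch; the function name below is ours and is not part of the module.

def check_poolcount_header(header_line, min_fields=6):
    """Validate a poolcount header line the way check_poolcount_file does:
    the first five columns must be barcode, rcbarcode, scaffold, strand, pos
    and there must be at least `min_fields` tab-separated columns in total."""
    expected = ["barcode", "rcbarcode", "scaffold", "strand", "pos"]
    fields = header_line.rstrip("\n").split("\t")
    if len(fields) < min_fields:
        raise ValueError("Too few fields in poolcount header: {}".format(fields))
    for exp, got in zip(expected, fields):
        if exp != got:
            raise ValueError("Expected {} but field is {}".format(exp, got))
    return fields


# Example:
#   check_poolcount_header("barcode\trcbarcode\tscaffold\tstrand\tpos\tsample1\n")
#   -> ['barcode', 'rcbarcode', 'scaffold', 'strand', 'pos', 'sample1']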
Code example #21
class DataUtil:

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}

        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [line.strip().split()[1:] for line in type_desc.split("\n")
                                if line.startswith(f'@{tag}')]

        return constraints

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys"""
        contains_constraints = constraints.get('contains')

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    search_value = re.search(r'\((.*)\)', in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'."""
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith('values('):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2({'objects': [{'ref': obj_ref,
                                                       'included': [included]}]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [x.get(keys[2]) for x in ref_data.get(keys[0])]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20]))

        return retrieve_data

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """

        validated = True
        failed_constraints = defaultdict(list)

        unique_constraints = constraints.get('unique')
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        contains_constraints = constraints.get('contains')
        for contains_constraint in contains_constraints:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(" ".join(contains_constraint))

        conditional_constraints = constraints.get('conditionally_required')
        for conditional_constraint in conditional_constraints:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [key for key in required_keys if key not in data]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = ['Object {} failed type checking:'.format(params.get('obj_name'))]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append('Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append('Column attribute mapping instances should contain all '
                                     'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append('Row attribute mapping instances should contain all '
                                     'row indices from the original data')

                error_msg.append('Object field [{}] should contain field [{}]'.format(
                    super_value,
                    subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append('If object field "{}" is present, then object field(s) {} should '
                             'also be present. Object is missing {}'.format(*failure))
        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [x['type_def'] for module in GENERICS_MODULES
                     for x in self.wsClient.get_all_type_info(module)]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for a given data type like the one below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        return self.generics_service.fetch_data(params)
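
    # Hedged usage sketch for fetch_data; the object reference and module
    # mapping below are placeholders, and the exact payload depends on the
    # deployed GenericsService:
    #
    #   ret = self.fetch_data({
    #       'obj_ref': '1234/5/6',
    #       'generics_module': {'data': 'FloatMatrix2D'}
    #   })
    #   data_matrix_json = ret['data_matrix']  # per the docstring above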

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {'validated': validated,
                'failed_constraints': failed_constraints}
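
    # Hedged usage sketch for validate_data; the type string comes from the
    # docstring above and matrix_data is a placeholder:
    #
    #   result = self.validate_data({
    #       'obj_type': 'KBaseMatrices.ExpressionMatrix-1.1',
    #       'data': matrix_data,
    #   })
    #   if not result['validated']:
    #       print(result['failed_constraints'])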

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: name of the workspace the matrix object is saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting validating and saving object data')

        obj_type = params.get('obj_type').split('-')[0]

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({'mod': module_name}).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').items() if v)
        validate = self.validate_data({'obj_type': obj_type,
                                       'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        # make sure users with shared object have access to the handle file upon saving
        handle = data.get('sequencing_file_handle')
        if handle:
            output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            logging.info('Downloading consensus sequence file to {}'.format(output_directory))
            self._mkdir_p(output_directory)
            matrix_fasta_file = self.dfu.shock_to_file({
                'handle_id': handle,
                'file_path': output_directory}).get('file_path')
            logging.info('Saving consensus sequence file to shock: {}'.format(matrix_fasta_file))
            handle_id = self.dfu.file_to_shock({'file_path': matrix_fasta_file,
                                                'make_handle': True})['handle']['hid']
            data['sequencing_file_handle'] = handle_id

        # cast data
        int_data_names = ['sequencing_quality_filter_cutoff', 'read_length_cutoff']
        for data_name in int_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to int'.format(data_name))
                    data[data_name] = int(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type for {}. '.format(data_name)
                    err_msg += 'Data type {} requires {} to be an integer value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        float_data_names = ['barcode_error_rate', 'sequence_error_cutoff', 'clustering_cutoff']
        for data_name in float_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to float'.format(data_name))
                    data[data_name] = float(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type for {}. '.format(data_name)
                    err_msg += 'Data type {} requires {} to be a float value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        ws_name_id = params.get('workspace_id')
        workspace_name = params.get('workspace_name')
        if not ws_name_id:
            if not isinstance(workspace_name, int):
                ws_name_id = self.dfu.ws_name_to_id(workspace_name)
            else:
                ws_name_id = workspace_name

        try:
            logging.info('Starting saving object via DataFileUtil')
            info = self.dfu.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data": data,
                    "name": params.get('obj_name')
                }]
            })[0]
        except Exception:
            logging.info('Saving object via DataFileUtil failed')
            logging.info('Starting saving object via WsLargeDataIO')
            data_path = os.path.join(self.scratch,
                                     params.get('obj_name') + "_" + str(uuid.uuid4()) + ".json")
            with open(data_path, 'w') as data_file:
                json.dump(data, data_file)

            info = self.ws_large_data.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data_json_file": data_path,
                    "name": params.get('obj_name')
                }]
            })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
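
The object reference string returned above is assembled from the object info tuple that DataFileUtil.save_objects returns; indices 6, 0, and 4 hold the workspace id, object id, and version of the saved object. A minimal standalone sketch with a made-up info tuple:

info = [12, 'my_matrix', 'KBaseMatrices.ExpressionMatrix-1.1',
        '2023-01-01T00:00:00+0000', 3, 'someuser', 4567, 'someworkspace',
        'somechecksum', 1024, {}]
obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
print(obj_ref)  # -> 4567/12/3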
コード例 #22
0
    def MotifEnsemble(self, ctx, params):
        """
        :param params: instance of type "EnsembleParams" (Internal workflow:
           1. Input - list of motifsets, workspace, threshold consensus
           2. Download MotifSets -> Utils function
           3. Assign motif ids by position in list

           Use refs to identify MSOs internally!
           Dictionary of motifsets - key: ref, val: set.
           List of match sets: each item in a set is a tuple of (ref, index).

           for each motifset: <- enumerate to avoid duplicates
               for each motif in motifset:
                   for each other motifset: <- enumerate to avoid duplicates
                       for each motif in other:
                           compare(motif1, motif2):
                               if motifs same:
                                   search list of sets for motif1:
                                       if found, add motif2 if not already in
                                   if not found, search list of sets for motif2:
                                       if found, add motif1
                                       else add a new set with motif1 + motif2)
           -> structure: parameter "motifset_refs" of list of String,
           parameter "workspace_name" of String, parameter "threshold" of
           Double
        :returns: instance of type "Ensemble_out" -> structure: parameter
           "motifset_ref" of String
        """
        # ctx is the context object
        # return variables are: out
        #BEGIN MotifEnsemble
        #TODO: ERROR CHECK (MULTIPLE MOTIFSETS, NONEMPTY, SSREF are the same, etc.)

        MotifSetDict = DownloadMotifSet(params['motifset_refs'], self.callback_url)

        matchSets = []
        threshold = float(params['threshold'])

        for i, MSR1 in enumerate(MotifSetDict.keys()):
            for j, motif1 in enumerate(MotifSetDict[MSR1]['Motifs']):
                for k, MSR2 in enumerate(MotifSetDict.keys()):
                    if k > i:
                        for l, motif2 in enumerate(MotifSetDict[MSR2]['Motifs']):
                            if CompareMotifsBP(motif1, motif2, threshold):
                                found1 = False
                                found2 = False
                                index1 = -1
                                index2 = -1
                                for m, mset in enumerate(matchSets):
                                    if (MSR1, j) in mset:
                                        found1 = True
                                        index1 = m
                                    if (MSR2, l) in mset:
                                        found2 = True
                                        index2 = m
                                if not found1 and found2:
                                    matchSets[index2].add((MSR1, j))
                                elif not found2 and found1:
                                    matchSets[index1].add((MSR2, l))
                                elif found1 and found2:
                                    if index1 != index2:
                                        # set.union() returns a new set, so update in place instead
                                        matchSets[index1] |= matchSets[index2]
                                        matchSets.pop(index2)
                                else:
                                    matchSets.append({(MSR1, j), (MSR2, l)})
        numMotifSets = len(params['motifset_refs'])
        proportion = float(params['proportion'])
        KeepSets = []
        print('NUM MATCHSETS********')
        print(len(matchSets))
        for i, mset in enumerate(matchSets):
            uniqueRefs = {ref for ref, _ in mset}
            if float(len(uniqueRefs)) / numMotifSets >= proportion:
                KeepSets.append(i)
        print(len(KeepSets))


        #handle duplicates...
        #for i,tuple1 in enumerate(matchSets):
        #    for j,tuple2 in enumerate(matchSets):
        #        if j > i:
        #            if tuple1[0] == tuple2[0]:
                        #handle this....
                        #how...?
                        #merge locations if theyre different
                        #pick one motif by default(p-val)
                        #run motif compare to ensure theyre actually similar enough
        #                print('duplicate')

        #create new MSO
        ESO = {}
        for ref in MotifSetDict:
            ESO['Condition'] = MotifSetDict[ref]['Condition']
            ESO['SequenceSet_ref'] = MotifSetDict[ref]['SequenceSet_ref']
            ESO['Alphabet'] = deepcopy(MotifSetDict[ref]['Alphabet'])
            ESO['Background'] = deepcopy(MotifSetDict[ref]['Background'])
            break
        ESO['Motifs'] = []
        #Add motifs
        for keep in KeepSets:
            motif = merge(matchSets[keep],MotifSetDict)
            ESO['Motifs'].append(deepcopy(motif))


        #upload new MSO
        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['id'] = params['workspace_name']
        save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : ESO , 'name' : 'EnsembleMotifSet'}]

        info = dfu.save_objects(save_objects_params)[0]
        obj_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #create report
        htmlDir = self.shared_folder + '/ensemble_html'
        os.mkdir(htmlDir)
        MakeReport(htmlDir,ESO)


        try:
            html_upload_ret = dfu.file_to_shock({'file_path': htmlDir, 'make_handle': 0, 'pack': 'zip'})
        except Exception as e:
            raise ValueError('Error uploading HTML file to shock') from e



        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        #MSO = {}
        #MSO['Condition'] = 'Temp'
        #MSO['FeatureSet_ref'] = '123'
        #MSO['Motifs'] = []
        #MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = {}
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0

        #MSU.parseMotifList(fullMotifList,MSO)
        #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000))

        #Pass motif set into this
        #save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}]

        #info = dfu.save_objects(save_objects_params)[0]
        #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'MotifEnsemble_report_' + str(uuid.uuid4())

        reportObj = {'objects_created': [{'ref': obj_ref, 'description': 'Motif Set generated by MotifEnsemble'}],
                     'message': '',
                     'direct_html': None,
                     'direct_html_link_index': 0,
                     'file_links': [],
                     'html_links': [],
                     'html_window_height': 220,
                     'workspace_name': params['workspace_name'],
                     'report_object_name': reportName
                     }


        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                    'name': 'index.html',
                                    'label': 'MotifEnsemble HTML report'
                                    }
                                   ]


        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        out = { 'report_name': report_info['name'], 'report_ref': report_info['ref'] }

        #END MotifEnsemble

        # At some point might do deeper type checking...
        if not isinstance(out, dict):
            raise ValueError('Method MotifEnsemble return value ' +
                             'out is not type dict as required.')
        # return the results
        return [out]
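
The match-set bookkeeping in MotifEnsemble can be exercised on its own; a minimal sketch using toy (motifset_ref, motif_index) pairs in place of real workspace refs and motif comparisons:

matchSets = []
pairs_judged_similar = [(('ws/1/1', 0), ('ws/2/1', 3)),
                        (('ws/2/1', 3), ('ws/3/1', 5))]
for a, b in pairs_judged_similar:
    index_a = next((i for i, s in enumerate(matchSets) if a in s), -1)
    index_b = next((i for i, s in enumerate(matchSets) if b in s), -1)
    if index_a < 0 and index_b < 0:
        matchSets.append({a, b})          # neither motif seen yet: new group
    elif index_a < 0:
        matchSets[index_b].add(a)         # b already grouped: add a to it
    elif index_b < 0:
        matchSets[index_a].add(b)         # a already grouped: add b to it
    elif index_a != index_b:
        matchSets[index_a] |= matchSets[index_b]   # merge the two groups
        matchSets.pop(index_b)
print(matchSets)  # -> [{('ws/1/1', 0), ('ws/2/1', 3), ('ws/3/1', 5)}]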
コード例 #23
0
class IntegrateAppImpl:
    @staticmethod
    def _validate_params(params, required, optional=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        required = set(required)
        optional = set(optional)
        pkeys = set(params)
        if required - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(required - pkeys)))
        defined_param = required | optional
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def _build_figure(self, file_path, figure_matrix):

        # Make figure matrix html file and embed
        file_name = 'integrated_scatterplot_output.html'
        figure_html_path = os.path.join(file_path, file_name)
        output_file(figure_html_path)
        save(grid(figure_matrix))
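        # Note: output_file, save, and grid are assumed to come from bokeh
        # (e.g. from bokeh.io import output_file, save and
        # from bokeh.layouts import grid); those imports sit outside this
        # excerpt.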

        return file_name

    def _build_table(self, table_dict, stats_df):

        html_lines = list()
        html_lines.append('<table class="table table-bordered table-striped">')

        header_list = [
            "Enzymes", "Compartments", "Reactions", "EC numbers", "Subsystems"
        ] + self.conditions_ids + ["Mahalanobis distance", "p-value"]

        html_lines.append('<thead>')
        internal_header_line = "</td><td>".join(header_list)
        html_lines.append('<tr><td>' + internal_header_line + '</td></tr>')
        html_lines.append('</thead>')

        html_lines.append("<tbody>")
        print_row = True
        for complex_row in sorted(table_dict.keys()):
            print_row = True
            cpts = ", ".join(sorted(list(table_dict[complex_row])))

            ecs = []
            subsystems = []
            reactions = []
            conditions = []
            mahal_list = []
            pvalue_list = []
            mahalanobis_dist = "0.00"
            pvalue = "0.00"
            for cpt in table_dict[complex_row]:
                for rxn in table_dict[complex_row][cpt]:

                    if (rxn not in reactions):
                        reactions.append(rxn)

                    if (len(conditions) == 0):
                        conditions = table_dict[complex_row][cpt][rxn]

                    if (rxn in self.reactions_data):
                        for ss in self.reactions_data[rxn]['subsystems']:
                            ss = ss.replace("_", " ")
                            ss = ss.replace(" in plants", "")
                            if (ss not in subsystems):
                                subsystems.append(ss)

                        for ec in self.reactions_data[rxn]['ecs']:
                            if (ec not in ecs):
                                ecs.append(ec)

                    str_md = "0.00"
                    str_pv = "0.00"
                    if (rxn + '_' + cpt not in stats_df.index):
                        print("MISSING REACTION: ", complex_row,
                              rxn + "_" + cpt)
                        print_row = False
                    else:
                        str_md = "{0:.2f}".format(
                            stats_df.loc[rxn + '_' + cpt]['mahalanobis'])
                        str_pv = "{0:.2f}".format(stats_df.loc[rxn + '_' +
                                                               cpt]['pvalue'])
                        if (str_pv == "0.00"):
                            str_pv = "{0:.2e}".format(
                                stats_df.loc[rxn + '_' + cpt]['pvalue'])
                        if (mahalanobis_dist != "0.00"
                                and str_md != mahalanobis_dist):
                            print(
                                "WARNING: CHANGING STATS FOR SAME PROTEIN COMPLEXES\n"
                            )
                            print(
                                "===================================================\n\n"
                            )
                            print(complex_row, cpts, rxn, conditions,
                                  stats_df.loc[rxn + '_' + cpt]['mahalanobis'],
                                  mahalanobis_dist, "\n")
                            print(
                                "===================================================\n\n"
                            )

                    mahalanobis_dist = str_md
                    pvalue = str_pv

            reactions = ", ".join(sorted(reactions))
            subsystems = ", ".join(sorted(subsystems))
            ecs = ", ".join(sorted(ecs))

            conditions_strings = list()
            for i in range(len(conditions)):
                conditions[i][0] = "{0:.2f}".format(conditions[i][0])
                conditions_strings.append(" | ".join(conditions[i]))

            # some complexes may have zero features predicted
            if (print_row is True):
                html_lines.append("<tr>")
                internal_row_line = "</td><td>".join(
                    [complex_row, cpts, reactions, ecs, subsystems] +
                    conditions_strings + [mahalanobis_dist, pvalue])
                html_lines.append("<td>" + internal_row_line + "</td>")
                html_lines.append("</tr>")

        html_lines.append("</tbody>")
        html_lines.append("</table>")

        return "\n".join(html_lines)

    def _build_report(self, figure_matrix, table_dict, stats_df,
                      saved_object_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        # Make report directory and copy over files
        report_file_path = os.path.join(self.scratch, self.report_uuid)
        os.mkdir(report_file_path)

        table_html_string = self._build_table(table_dict, stats_df)

        if (len(self.conditions_ids) > 1):
            figure_html_file = self._build_figure(report_file_path,
                                                  figure_matrix)
            output_html_files = self._generate_report_html(
                report_file_path,
                figure_html_file=figure_html_file,
                table_string=table_html_string)
        else:
            output_html_files = self._generate_report_html(
                report_file_path, table_string=table_html_string)

        report_params = {
            'direct_html_link_index':
            0,  #Use to refer to index of 'html_links'
            'workspace_name': workspace_name,
            'report_object_name': 'plant_fba_' + self.report_uuid,
            'objects_created': saved_object_list,
            'html_links': output_html_files
        }

        output = self.kbr.create_extended_report(report_params)

        return {'report_name': output['name'], 'report_ref': output['ref']}

    def _generate_report_html(self,
                              file_path,
                              figure_html_file=None,
                              table_string=None):
        """
            _generate_report: generates the HTML for the upload report
        """
        html_report_list = list()

        ##############################################################
        # Write table html file
        ##############################################################
        # Read in template html
        with open(
                os.path.join(
                    '/kb/module/data', 'app_report_templates',
                    'integrate_abundances_report_tables_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Generate and Insert html title
        title_string = "-".join(
            [self.input_params['input_expression_matrix']] +
            self.conditions_ids)
        report_template_string = report_template_string.replace(
            '*TITLE*', title_string)

        # Insert html table
        table_report_string = report_template_string.replace(
            '*TABLES*', table_string)

        # Write html file
        table_html_file = "integrated_table_output.html"
        with open(os.path.join(file_path, table_html_file), 'w') as table_file:
            table_file.write(table_report_string)

        ##############################################################
        # Write summary index.html file
        ##############################################################
        # Begin composing html
        html_lines = list()
        html_lines.append(
            '<h3 style="text-align: center">Integrate Abundances with Metabolism Report</h3>'
        )
        html_lines.append(
            "<p>The \"Integrate Abundances with Metabolism\" app has finished running.<br/>"
        )
        html_lines.append("The app integrated the values from the <b>" +
                          self.input_params['input_expression_matrix'] +
                          "</b> ExpressionMatrix")
        html_lines.append(" with the <b>" +
                          self.input_params['input_fbamodel'] +
                          "</b> FBAModel<br/>")
        html_lines.append(
            "Specifically, the app integrated the values from these chosen conditions in the ExpressionMatrix: <b>"
            + "</b>, <b>".join(self.conditions_ids) + "</b><br/>")
        html_lines.append(
            "The results of the integration are stored in the <b>" +
            self.input_params['output_reaction_matrix'] +
            "</b> ReactionMatrix.</p><br/>")
        html_lines.append(
            'The results of the integration are also tabulated in this <a href="'
            + table_html_file + '" target="_blank">Table</a><br/>')

        if (len(self.conditions_ids) > 1):
            html_lines.append(
                'The results of the integration can be also be visualized in these <a href="'
                + figure_html_file + '" target="_blank">Scatterplots</a>')

        # Read in template html
        with open(
                os.path.join('/kb/module/data', 'app_report_templates',
                             'integrate_abundances_report_template.html')
        ) as report_template_file:
            report_template_string = report_template_file.read()

        # Insert html
        summary_report_string = report_template_string.replace(
            '*TEXT*', "\n".join(html_lines))

        summary_html_file = "index.html"
        with open(os.path.join(file_path, summary_html_file),
                  'w') as index_file:
            index_file.write(summary_report_string)

        ##############################################################
        # Upload files and compose html report object
        ##############################################################
        # Cache it in shock as an archive
        upload_info = self.dfu.file_to_shock({
            'file_path': file_path,
            'pack': 'zip'
        })

        # HTML Link objects
        html_link = dict()
        # Index
        # html_link = {'shock_id' : upload_info['shock_id'],
        #              'name' : summary_html_file,
        #              'label' : 'HTML report for integrate_abundances_with_metabolism app',
        #              'description' : 'HTML report for integrate_abundances_with_metabolism app'}
        # html_report_list.append(html_link)

        if (len(self.conditions_ids) > 1):
            # Figures
            html_link = {
                'shock_id':
                upload_info['shock_id'],
                'name':
                figure_html_file,
                'label':
                'Scatterplot figures generated by Integrate Abundances with Metabolism app',
                'description':
                'Scatterplot figures generated by Integrate Abundances with Metabolism app'
            }
            html_report_list.append(html_link)

        # Table
        html_link = {
            'shock_id':
            upload_info['shock_id'],
            'name':
            table_html_file,
            'label':
            'HTML table generated by Integrate Abundances with Metabolism app',
            'description':
            'HTML table generated by Integrate Abundances with Metabolism app'
        }
        html_report_list.append(html_link)

        return html_report_list

    def _load_fbamodel(self, model_ref):

        model_obj = self.dfu.get_objects({'object_refs':
                                          [model_ref]})['data'][0]
        print("Number of reactions: " +
              str(len(model_obj['data']['modelreactions'])))

        model_reaction_lookup_dict = dict()
        for index in range(len(model_obj['data']['modelreactions'])):
            model_reaction_lookup_dict[model_obj['data']['modelreactions']
                                       [index]['id']] = index

        return [model_obj, model_reaction_lookup_dict]

    def _load_expression_matrix(self, expdata_ref):

        expdata_obj = self.dfu.get_objects({'object_refs':
                                            [expdata_ref]})['data'][0]
        conditions_ids = expdata_obj['data']['data']['col_ids']
        features_ids = expdata_obj['data']['data']['row_ids']

        feature_lookup_dict = dict()
        for index in range(len(features_ids)):
            feature_lookup_dict[features_ids[index]] = index

        condition_lookup_dict = dict()
        for index in range(len(conditions_ids)):
            condition_lookup_dict[conditions_ids[index]] = index

        if (len(self.conditions_ids) == 0):
            self.conditions_ids = conditions_ids

        return [
            expdata_obj, features_ids, feature_lookup_dict,
            condition_lookup_dict
        ]

    def _compile_genome_scores(self, data, conditions_indices):

        Feature_Comparison_Dict = dict()
        for feature_index in range(len(data)):

            scores_dict = dict()
            for condition in self.conditions_ids:
                condition_index = conditions_indices[condition]

                #Retrieve value from 2D matrix
                score = data[feature_index][condition_index]

                #Force into string for easier comparison
                str_score = "{0:.2f}".format(score)

                if (str_score == "0.00"):
                    continue

                scores_dict[condition] = score

            #Here we skip features where there aren't enough scores (should be same number of conditions)
            if (len(scores_dict) < len(self.conditions_ids)):
                continue

            for condition in scores_dict:

                if (condition not in Feature_Comparison_Dict):
                    Feature_Comparison_Dict[condition] = list()

                Feature_Comparison_Dict[condition].append(
                    scores_dict[condition])

        return Feature_Comparison_Dict

    def _compile_model_scores_percentiles(self, data):

        # I want to compute percentile rank for each feature under each condition
        # The Conditions_Score_Dicts variable is used to "bin" identical scores
        # (to two decimal points, can be changed)

        # First, we iterate through the conditions for computing percentile rank
        # for each condition
        model_conditions_score_lists = dict()
        model_conditions_score_pct_dicts = dict()
        for condition_index in range(len(self.conditions_ids)):
            condition = self.conditions_ids[condition_index]

            # For each condition, we "bin" the scores

            score_reaction_dict = dict()
            score_reaction_list = list()
            # The counting of features is done independently because we skip scores of zero
            # (which affects how the percentile rank is distributed)
            n_ftrs = 0
            for reaction_index in range(len(data)):

                # Retrieve value from 2D matrix
                score = data[reaction_index][condition_index]

                # Many reactions are not assigned a score, and instead have a default tiny score
                if (score == float(-sys.maxsize - 1)):
                    continue

                # Force into string for easier comparison
                str_score = "{0:.2f}".format(score)

                # I skip the relatively large number of reactions that have a value of zero
                # to prevent the computation of the percentile rank skewing towards zero
                if (str_score == "0.00"):
                    continue

                n_ftrs += 1
                if (str_score not in score_reaction_dict):
                    score_reaction_dict[str_score] = list()
                score_reaction_dict[str_score].append(reaction_index)
                score_reaction_list.append(float(str_score))

            model_conditions_score_lists[condition] = score_reaction_list

            # Then for each condition, we use the binned scores to compute
            # percentile rank
            if (condition not in model_conditions_score_pct_dicts):
                model_conditions_score_pct_dicts[condition] = dict()

            sorted_scores = sorted(score_reaction_dict.keys(), key=float)
            less_than_score_ftrs_count = 0
            for score_index in range(len(sorted_scores)):

                n_score_ftrs = len(
                    score_reaction_dict[sorted_scores[score_index]])
                half_n_score_ftrs = float(n_score_ftrs) * 0.5
                cumulative_n_score_ftrs = float(
                    less_than_score_ftrs_count) + half_n_score_ftrs
                percentile_rank = cumulative_n_score_ftrs / float(n_ftrs)

                less_than_score_ftrs_count += len(
                    score_reaction_dict[sorted_scores[score_index]])
                model_conditions_score_pct_dicts[condition][
                    sorted_scores[score_index]] = percentile_rank
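
            # Worked example of the midpoint percentile rank above
            # (hypothetical binned scores): with non-zero scores
            # [0.50, 0.50, 1.25], n_ftrs = 3; the "0.50" bin gets
            # (0 + 0.5 * 2) / 3 ~= 0.33 and the "1.25" bin gets
            # (2 + 0.5 * 1) / 3 ~= 0.83.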

        # This next part of the code is to re-iterate through the data and to compose the dicts
        # that become ColumnDataStores, and also with default values

        # The reaction_percentile_comparison_dict is for the reaction percentile plot
        reaction_percentile_comparison_dict = dict()
        if ('All' not in reaction_percentile_comparison_dict):
            reaction_percentile_comparison_dict['All'] = dict()

        # The reaction_score_comparison_dict works for the genome features plot
        reaction_score_comparison_dict = dict()

        for reaction_index in range(len(data)):

            scores_dict = dict()
            for condition_index in range(len(self.conditions_ids)):
                condition = self.conditions_ids[condition_index]

                #Retrieve value from 2D matrix
                score = data[reaction_index][condition_index]

                # Many reactions are not assigned a score, and instead have a default tiny score
                if (score == float(-sys.maxsize - 1)):
                    continue

                scores_dict[condition] = score

            #Here we skip reactions where there aren't enough scores (should be same number of conditions)
            if (len(scores_dict) < len(self.conditions_ids)):
                continue

            for condition in scores_dict:

                # Collect reaction scores
                if (condition not in reaction_score_comparison_dict):
                    reaction_score_comparison_dict[condition] = list()
                reaction_score_comparison_dict[condition].append(
                    scores_dict[condition])

                # Collect reaction percentiles
                if (condition
                        not in reaction_percentile_comparison_dict['All']):
                    reaction_percentile_comparison_dict['All'][
                        condition] = list()

                #Force into string for easier comparison
                str_score = "{0:.2f}".format(scores_dict[condition])

                #We skip zero scores when computing the percentiles
                #So we have to check for them here
                condition_pct = 0.00
                if (str_score != '0.00'):
                    condition_pct = model_conditions_score_pct_dicts[
                        condition][str_score]
                reaction_percentile_comparison_dict['All'][condition].append(
                    condition_pct)

                if ('reactions'
                        not in reaction_percentile_comparison_dict['All']):
                    reaction_percentile_comparison_dict['All'][
                        'reactions'] = list()
                if(self.reactions_ids[reaction_index] not in \
                       reaction_percentile_comparison_dict['All']['reactions']):
                    reaction_percentile_comparison_dict['All'][
                        'reactions'].append(self.reactions_ids[reaction_index])

                base_rxn = self.reactions_ids[reaction_index].split('_')[0]
                for ss in self.reactions_data[base_rxn]['subsystems']:
                    if (ss not in reaction_percentile_comparison_dict):
                        reaction_percentile_comparison_dict[ss] = dict()
                    if (condition
                            not in reaction_percentile_comparison_dict[ss]):
                        reaction_percentile_comparison_dict[ss][
                            condition] = list()
                    reaction_percentile_comparison_dict[ss][condition].append(
                        condition_pct)

                    if ('reactions'
                            not in reaction_percentile_comparison_dict[ss]):
                        reaction_percentile_comparison_dict[ss][
                            'reactions'] = list()
                    if(self.reactions_ids[reaction_index] not in \
                           reaction_percentile_comparison_dict[ss]['reactions']):
                        reaction_percentile_comparison_dict[ss][
                            'reactions'].append(
                                self.reactions_ids[reaction_index])

            self.mh_reactions_ids.append(self.reactions_ids[reaction_index])

        # We set the default values here at the end of the loop because we don't know
        # how many reactions there will be for each category
        for category in reaction_percentile_comparison_dict:
            for key in ['color', 'size', 'tooltip', 'fill_alpha']:
                reaction_percentile_comparison_dict[category][key] = list()

            for index in range(
                    len(reaction_percentile_comparison_dict[category][
                        self.conditions_ids[0]])):

                reaction_percentile_comparison_dict[category][
                    'fill_alpha'].append(1.0)

                # format string of subsystems for tooltip
                rxn = reaction_percentile_comparison_dict[category][
                    'reactions'][index]
                base_rxn = rxn.split('_')[0]
                ss_string = ", ".join(
                    self.reactions_data[base_rxn]['subsystems'])
                reaction_percentile_comparison_dict[category][
                    'tooltip'].append(rxn + ", " + ss_string)

                if (category == 'All'):

                    reaction_percentile_comparison_dict[category][
                        'color'].append('black')
                    reaction_percentile_comparison_dict[category][
                        'size'].append(6)

                else:

                    reaction_percentile_comparison_dict[category][
                        'color'].append('red')
                    reaction_percentile_comparison_dict[category][
                        'size'].append(8)

        return [
            reaction_score_comparison_dict, reaction_percentile_comparison_dict
        ]

    def _compile_mahalanobis_dist_pvalue(self, data, threshold):

        data_df = pd.DataFrame(data,
                               columns=self.conditions_ids,
                               index=self.mh_reactions_ids)

        # I don't know the math well enough to follow what's going on, but I used
        # the recipe described here:
        # https://www.machinelearningplus.com/statistics/mahalanobis-distance/

        # Covariance matrix via numpy
        cov_mat = np.cov(data_df.values.T)

        # Inverse covariance matrix via scipy.linalg
        # It won't accept a 1x1 matrix hence the if/else
        if (len(self.conditions_ids) > 1):
            inv_cov_mat = sp.linalg.inv(cov_mat)
        else:
            inv_cov_mat = 1 / cov_mat

        # two terms required, second using dot product
        data_minus_mean = data_df - np.mean(data_df)
        left_term = np.dot(data_minus_mean, inv_cov_mat)

        # dot product
        mahalanobis = np.dot(left_term, data_minus_mean.T)
        data_df['mahalanobis'] = mahalanobis.diagonal()

        # chi-squared p-values with one degree of freedom (two sets of variables)
        data_df['pvalue'] = 1 - sp.stats.chi2.cdf(data_df['mahalanobis'], 1)

        # find the outliers below a given threshold, i.e. p < 0.01
        outliers = data_df.loc[data_df.pvalue < threshold]
        # this is used when you want to just plot the p-values alone
        data_df.index.name = 'reactions'
        outliers.index.name = 'reactions'

        #Need to return the mapping between reactions and the p-values
        return [data_df, outliers]

    def _integrate_abundances(self, model_obj, feature_lookup_dict,
                              expdata_obj, condition_indices):

        reaction_values_matrix = list()
        reactions_ids = list()
        minmax_expscore_dict = dict()
        model_complexes_dict = dict()
        fh = open(self.scratch + '/output.txt', 'w')
        fh2 = open(self.scratch + '/rxn01486.txt', 'w')
        print_data = False
        for mdlrxn in range(len(model_obj['data']['modelreactions'])):
            mdlrxn_obj = model_obj['data']['modelreactions'][mdlrxn]
            reactions_ids.append(mdlrxn_obj['id'])
            [base_rxn, cpt_id] = mdlrxn_obj['id'].split('_')

            #            if(base_rxn == 'rxn01486' or base_rxn == 'rxn37610'):
            #                print_data=True

            rxndata_row = list()
            for condition in self.conditions_ids:
                if (condition not in minmax_expscore_dict):
                    minmax_expscore_dict[condition] = {
                        'max': -sys.maxsize - 1,
                        'min': sys.maxsize
                    }

                condition_index = condition_indices[condition]

                # Maximal gene expression for a reaction
                reaction_score = ['nan', ""]
                prots_str_list = list()
                for prt in mdlrxn_obj['modelReactionProteins']:

                    # Minimal gene expression for a complex
                    complex_score = ['nan', ""]
                    subs_str_list = list()
                    for sbnt in prt['modelReactionProteinSubunits']:

                        # Maximal gene expression for a subunit
                        subunit_score = ['nan', ""]
                        ftrs_str_list = list()
                        for feature in sbnt['feature_refs']:
                            feature = feature.split('/')[-1]
                            ftrs_str_list.append(feature)
                            feature_index = feature_lookup_dict[feature]

                            ftr_score = expdata_obj['data']['data']['values'][
                                feature_index][condition_index]

                            if (print_data is True):
                                fh2.write(mdlrxn_obj['id'] + ':' + feature +
                                          ':' + str(ftr_score) + '\n')

                            if (ftr_score <
                                    minmax_expscore_dict[condition]['min']):
                                minmax_expscore_dict[condition][
                                    'min'] = ftr_score

                            if (ftr_score >
                                    minmax_expscore_dict[condition]['max']):
                                minmax_expscore_dict[condition][
                                    'max'] = ftr_score

                            # Maximal gene expression for a subunit
                            if (subunit_score[0] == 'nan'
                                    or subunit_score[0] < ftr_score):
                                subunit_score = [ftr_score, feature]

                        if (print_data is True):
                            fh2.write(str(subunit_score) + '\n')

                        ftr_str = "(" + ", ".join(ftrs_str_list) + ")"
                        subs_str_list.append(ftr_str)

                        # Minimal gene expression for a complex
                        if (subunit_score[0] != 'nan'):
                            if (complex_score[0] == 'nan'
                                    or complex_score[0] > subunit_score[0]):
                                complex_score[0] = subunit_score[0]
                                complex_score[1] = subunit_score[1]

                    if (print_data is True):
                        fh2.write(str(complex_score) + '\n')

                    sub_str = "[" + ", ".join(subs_str_list) + "]"
                    prots_str_list.append(sub_str)

                    # Maximal gene expression for a reaction
                    if (complex_score[0] != 'nan'):
                        if (reaction_score[0] == 'nan'
                                or reaction_score[0] < complex_score[0]):
                            reaction_score[0] = complex_score[0]
                            reaction_score[1] = complex_score[1]

                if (reaction_score[0] == 'nan'):
                    reaction_score[0] = float(-sys.maxsize - 1)

                if (print_data is True):
                    fh2.write(condition + ':' + str(reaction_score[0]) + '(' +
                              reaction_score[1] + ')\n')

                #Putting together dict for table
                proteins_string = ', '.join(prots_str_list)
                if (len(prots_str_list) > 0 and proteins_string != "[]"
                        and proteins_string != "[()]"):
                    if (proteins_string not in model_complexes_dict):
                        model_complexes_dict[proteins_string] = dict()
                    if (cpt_id not in model_complexes_dict[proteins_string]):
                        model_complexes_dict[proteins_string][cpt_id] = dict()
                    if (base_rxn not in model_complexes_dict[proteins_string]
                        [cpt_id]):
                        model_complexes_dict[proteins_string][cpt_id][
                            base_rxn] = list()
                    fh.write('\t'.join([
                        condition, proteins_string, cpt_id, base_rxn,
                        str(reaction_score[0]), reaction_score[1], '\n'
                    ]))
                    model_complexes_dict[proteins_string][cpt_id][
                        base_rxn].append(reaction_score)

                rxndata_row.append(reaction_score[0])

            print_data = False

            reaction_values_matrix.append(rxndata_row)

        fh.close()
        fh2.close()

        self.reactions_ids = reactions_ids
        return (reaction_values_matrix, model_complexes_dict)
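
    # Worked toy example of the aggregation in _integrate_abundances
    # (hypothetical feature scores): a subunit takes the max over its
    # features, a complex takes the min over its subunits, and a reaction
    # takes the max over its protein complexes. For complexes
    # [[(2, 5)], [(1,), (8, 3)]] the subunit scores are [5] and [1, 8], the
    # complex scores are 5 and 1, and the reaction score is max(5, 1) = 5.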

    def __init__(self, config, ctx, input_params):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)

        self.scratch = config['scratch']
        self.report_uuid = str(uuid.uuid4())

        # There is a bug in the UI that won't let me collect a
        # clean list of conditions, so I have to parse them
        # from a comma-separated string
        if ("input_columns" in input_params
                and input_params["input_columns"] != ""):
            input_params["input_columns"] = input_params["input_columns"].split(',')

        self.input_params = input_params

        # set in _load_expression_matrix()
        self.conditions_ids = list()

        # this is an optional parameter, but restricts the
        # number of chosen columns in the matrix
        if ('input_columns' in input_params
                and len(input_params['input_columns']) > 0):
            self.conditions_ids = input_params['input_columns']

        # set in _integrate_abundances()
        self.reactions_ids = list()

        # set in _compile_model_scores_percentiles
        self.mh_reactions_ids = list()

        with open(
                os.path.join("/kb/module/PlantSEED", "Data/PlantSEED_v3",
                             "PlantSEED_Roles.json")) as plsd_fh:
            PS_Roles = json.load(plsd_fh)

        plantseed = FetchPlantSEEDImpl()
        self.reactions_data = plantseed.fetch_reactions(PS_Roles)

    def integrate_abundances_with_metabolism(self):

        self._validate_params(
            self.input_params, {
                'input_ws', 'input_fbamodel', 'input_expression_matrix',
                'output_reaction_matrix'
            }, {'input_columns'})

        ##############################################################
        # Load model and expression objects
        ##############################################################
        model_ref = self.input_params['input_ws'] + '/' + self.input_params[
            'input_fbamodel']
        [model_obj, reaction_index] = self._load_fbamodel(model_ref)

        # The columns / conditions_ids are set in this function if not set via user parameter
        expression_ref = self.input_params[
            'input_ws'] + '/' + self.input_params['input_expression_matrix']
        [expdata_obj, features_ids, feature_index,
         condition_index] = self._load_expression_matrix(expression_ref)

        ##############################################################
        # Extract expression abundances for use in first scatter plot
        ##############################################################
        feature_comparison_dict = self._compile_genome_scores(
            expdata_obj['data']['data']['values'], condition_index)

        ####################################################################
        # Actually integrate abundances and build new ReactionMatrix object
        ####################################################################
        (reaction_values_matrix,
         model_complexes_dict) = self._integrate_abundances(
             model_obj, feature_index, expdata_obj, condition_index)

        rxndata_obj = {
            'row_ids': self.reactions_ids,
            'col_ids': self.conditions_ids,
            'values': reaction_values_matrix
        }

        ##########################################################################################
        # Extract / organize reaction expression scores for use in first and second scatter plot
        ##########################################################################################
        [reaction_scores_dict, reaction_percentiles_dict
         ] = self._compile_model_scores_percentiles(reaction_values_matrix)

        #############################################################################################################
        # Multi-variate mahalanobis distances computed along with outliers depending on chi-squared p-value of 0.01
        #############################################################################################################
        [mahal_dist_df, outliers] = self._compile_mahalanobis_dist_pvalue(
            reaction_percentiles_dict['All'], 0.01)

        ##############################################################
        # Figure generator
        ##############################################################
        subsystem_select_list = ["None"]
        for category in sorted(list(reaction_percentiles_dict.keys())):
            if (category == 'All'):
                continue
            subsystem_select_list.append(category)

            for rxn_idx in range(
                    len(reaction_percentiles_dict[category]['reactions'])):
                rxn = reaction_percentiles_dict[category]['reactions'][rxn_idx]
                pval = mahal_dist_df.loc[rxn]['pvalue']
                # reaction_percentiles_dict[category]['fill_alpha'][rxn_idx] = 1-pval

        figure_generator = GenerateFigureImpl()
        figure_grid = figure_generator.generate_figure(
            self.conditions_ids,
            category_select=subsystem_select_list,
            genome_features=feature_comparison_dict,
            reaction_scores=reaction_scores_dict,
            reaction_percentiles=reaction_percentiles_dict)

        ##############################################################
        # Finishing and Saving ReactionMatrix
        ##############################################################
        ReactionMatrix_obj = {
            'type': 'KBaseMatrices.ReactionMatrix',
            'name': self.input_params['output_reaction_matrix'],
            'data': {
                'scale': 'raw',
                'description': 'reaction expression score',
                'fbamodel_ref': model_ref,
                'expression_ref': expression_ref,
                'data': rxndata_obj
            }
        }

        ws_id = self.dfu.ws_name_to_id(self.input_params['input_ws'])
        saved_matrix_dict = self.dfu.save_objects({
            'id':
            ws_id,
            'objects': [ReactionMatrix_obj]
        })[0]
        saved_matrix_ref = "{}/{}/{}".format(saved_matrix_dict[6],
                                             saved_matrix_dict[0],
                                             saved_matrix_dict[4])
        saved_matrix_desc = "Reaction matrix: " + self.input_params[
            'output_reaction_matrix']

        #####################################################################
        # Building the report with figures, tables, and saved_objects (to be improved)
        # We pass in a dict where each key is a row for the table
        #####################################################################

        output_object_files = list()
        output_object_files.append({
            'ref': saved_matrix_ref,
            'description': saved_matrix_desc
        })

        return self._build_report(figure_grid, model_complexes_dict,
                                  mahal_dist_df, output_object_files,
                                  self.input_params['input_ws'])
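
The Mahalanobis distance / chi-squared step in _compile_mahalanobis_dist_pvalue can be reproduced standalone; a minimal sketch with made-up reaction percentiles (assumes numpy, scipy, and pandas are installed):

import numpy as np
import pandas as pd
from scipy import linalg, stats

# Made-up per-condition reaction percentiles
df = pd.DataFrame({'leaf': [0.10, 0.55, 0.90, 0.95],
                   'root': [0.15, 0.50, 0.20, 0.92]},
                  index=['rxn1_c0', 'rxn2_c0', 'rxn3_c0', 'rxn4_c0'])

inv_cov = linalg.inv(np.cov(df.values.T))   # inverse covariance matrix
centered = df - df.mean()                   # deviations from the column means
df['mahalanobis'] = np.dot(np.dot(centered, inv_cov), centered.T).diagonal()
df['pvalue'] = 1 - stats.chi2.cdf(df['mahalanobis'], 1)
print(df.loc[df.pvalue < 0.01])             # outliers at p < 0.01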
コード例 #24
0
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                  [{'ref':
                                                   diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)
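        # a single CSV (gene_id, log2_fold_change, p_value, q_value) is built
        # per condition pair; only rows from the matrix whose label matches
        # condition_label_pair are appended to it in the loop below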

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff
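        # the cutoff is applied symmetrically below: log2 fold change
        # >= +cutoff counts as up-regulated, <= -cutoff as down-regulated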

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                                         (float(row_fold_change_cutoff) >=
                                                         comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition and
                                             (float(row_fold_change_cutoff) <=
                                             -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                     [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()
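        # keep only rows whose id is in the supplied feature_ids; the column
        # structure and the value order within each row are left untouched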

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2], 'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # we now save the filtering DEM in a EM field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                       [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        features_ids = set((feature.get('feature_id') for feature in genome_features))

        return features_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
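        # merge any base FeatureSets first: element ordering is preserved and
        # genome refs are de-duplicated per element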
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                            [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
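            # for each condition pair: build the per-pair CSV, split features
            # into up/down sets by the cutoffs, optionally filter the
            # expression matrix, and save one FeatureSet per direction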
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'), "",
                                                diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name', },
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
Code example #25
0
 def test_update_taxon_assignments_valid(self):
     """
     Test a valid call to the update_taxon_assignments method.
     """
     taxon_key = str(uuid4())
     taxon_val = str(uuid4())
     taxon_val_new = str(uuid4())
     # Copy the object to test workspace
     dfu = DataFileUtil(self.callbackURL)
     obj_ref = f"{_WORKSPACE_NAME}/{_OBJECT_NAME}"
     result = dfu.get_objects({'object_refs': [obj_ref]})['data'][0]
     obj_data = result['data']
     # Create a user-owned handle in the object and update it
     hs = HandleService(self.handleURL)
     prev_handle_id = obj_data['genbank_handle_ref']
     prev_shock_id = hs.hids_to_handles([prev_handle_id])[0]['id']
     new_handle_id = dfu.own_shock_node({
         'shock_id': prev_shock_id,
         'make_handle': 1
     })['handle']['hid']
     obj_data['genbank_handle_ref'] = new_handle_id
     # Save new object in test workspace
     obj_info = result['info']
     new_obj = {
         'type': obj_info[2],
         'data': obj_data,
         'name': 'GCF_002287175.1'
     }
     test_ws_id = dfu.ws_name_to_id(self.wsName)
     infos = dfu.save_objects({'id': test_ws_id, 'objects': [new_obj]})
     obj_ref = f"{infos[0][6]}/{infos[0][0]}/{infos[0][4]}"
     new_ws_id = infos[0][6]
     new_obj_id = infos[0][0]
     get_obj_params = {
         'wsid': new_ws_id,
         'objid': new_obj_id,
         'included': ['/taxon_assignments']
     }
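     # the 'included' filter above restricts each fetch below to the
     # /taxon_assignments subobject, so only that mapping is returned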
     # Add a new assignment
     self.serviceImpl.update_taxon_assignments(
         self.ctx, {
             'workspace_id': new_ws_id,
             'object_id': new_obj_id,
             'taxon_assignments': {
                 taxon_key: taxon_val
             }
         })
     # Fetch the object and check the mapping
     obj = self.wsClient.get_objects2({'objects':
                                       [get_obj_params]})['data'][0]['data']
     self.assertTrue(taxon_key in obj['taxon_assignments'])
     self.assertEqual(obj['taxon_assignments'][taxon_key], taxon_val)
     # Update the assignment we just added
     self.serviceImpl.update_taxon_assignments(
         self.ctx, {
             'workspace_id': new_ws_id,
             'object_id': new_obj_id,
             'taxon_assignments': {
                 taxon_key: taxon_val_new
             }
         })
     # Fetch the object and check the mapping
     obj = self.wsClient.get_objects2({'objects':
                                       [get_obj_params]})['data'][0]['data']
     self.assertTrue(taxon_key in obj['taxon_assignments'])
     self.assertEqual(obj['taxon_assignments'][taxon_key], taxon_val_new)
     # Remove the assignment we just added
     self.serviceImpl.update_taxon_assignments(
         self.ctx, {
             'workspace_id': new_ws_id,
             'object_id': new_obj_id,
             'remove_assignments': [taxon_key]
         })
     # Fetch the object and check the mapping
     obj = self.wsClient.get_objects2({'objects':
                                       [get_obj_params]})['data'][0]['data']
     self.assertTrue(taxon_key not in obj['taxon_assignments'])
     self.assertEqual(obj['taxon_assignments'].get(taxon_key), None)
Code example #26
0
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.14"
    GIT_URL = "[email protected]:Tianhao-Gu/sample_uploader.git"
    GIT_COMMIT_HASH = "fddb10ca67368def8437569f8157b71b59f41e1c"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long, parameter
           "prevalidate" of Long, parameter "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        # Check if we have an existing Sample Set as input
        # if so, download
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    f"Sample set name required, when new SampleSet object is created."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
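        # header_row_index from the UI is 1-based and is converted to 0-based
        # here; for SESAR files the header defaults to the second row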
        if params.get('header_row_index'):
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']

        if params.get('file_format') == 'ENIGMA':
            # ENIGMA_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']}
            # )
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups',
                                    []), ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex',
                                    []), sample_set, header_row_index)
        elif params.get('file_format') == 'SESAR':
            # SESAR_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in SESAR_mappings['basic_columns']}
            # )
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups',
                                   []), SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex',
                                   []), sample_set, header_row_index)
        elif params.get('file_format') == 'KBASE':
            sample_set, errors = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], {}, [], [], [], sample_set, header_row_index)
        else:
            raise ValueError(
                f"Only SESAR and ENIGMA formats are currently supported for importing samples. "
                "File of format {params.get('file_format')} not supported.")

        file_links = []
        sample_set_ref = None
        html_link = None

        if errors:
            # create UI to display the errors clearly
            html_link = _error_ui(errors, self.scratch)
        else:
            # only save object if there are no errors
            obj_info = self.dfu.save_objects({
                'id':
                save_ws_id,
                'objects': [{
                    "name": set_name,
                    "type": "KBaseSets.SampleSet",
                    "data": sample_set
                }]
            })[0]

            sample_set_ref = '/'.join(
                [str(obj_info[6]),
                 str(obj_info[0]),
                 str(obj_info[4])])
            sample_file_name = os.path.basename(
                params['sample_file']).split('.')[0] + '_OTU'

            # -- Format outputs below --
            # if output file format specified, add one to output
            if params.get('output_format') in ['csv', 'xls']:
                otu_path = sample_set_to_OTU_sheet(sample_set,
                                                   sample_file_name,
                                                   self.scratch, params)
                file_links.append({
                    'path':
                    otu_path,
                    'name':
                    os.path.basename(otu_path),
                    'label':
                    "OTU template file",
                    'description':
                    "file with each column containing the assigned sample_id and sample "
                    "name of each saved sample. Intended for uploading OTU data."
                })

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path":
                sample_file_copy,
                "name":
                os.path.basename(sample_file_copy),
                "label":
                "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_data = {
            'report_object_name':
            "SampleSet_import_report_" + str(uuid.uuid4()),
            'workspace_name': params['workspace_name']
        }
        if file_links:
            report_data['file_links'] = file_links
        if sample_set_ref:
            report_data[
                'message'] = f"SampleSet object named \"{set_name}\" imported."
            report_data['objects_created'] = [{'ref': sample_set_ref}]

        if html_link:
            report_data['html_links'] = [{
                'path':
                html_link,
                'name':
                'index.html',
                'description':
                'Sample Set Import Error UI'
            }]
            report_data['direct_html_link_index'] = 0
        report_info = report_client.create_extended_report(report_data)
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref,
            'errors': errors
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def import_samples_from_IGSN(self, ctx, params):
        """
        :param params: instance of type "ImportSampleIGSNInputs" ->
           structure: parameter "sample_set_ref" of String, parameter "igsns"
           of list of String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "description" of String,
           parameter "set_name" of String, parameter "output_format" of
           String, parameter "taxonomy_source" of String, parameter
           "num_otus" of Long, parameter "incl_seq" of Long, parameter
           "otu_prefix" of String, parameter "share_within_workspace" of
           Long, parameter "prevalidate" of Long, parameter
           "incl_input_in_output" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples_from_IGSN

        igsns = params.get('igsns')
        if not igsns:
            raise ValueError('Please provide IGSNs')

        if isinstance(igsns, str):
            if igsns.isalnum():
                # single igsn given e.g. 'IEAWH0001'
                igsns = [igsns]
            else:
                # multiple igsn given e.g. 'IEAWH0001, GEE0000O4' or 'IEAWH0001; GEE0000O4'
                delimiter = csv.Sniffer().sniff(igsns).delimiter
                igsns = [x.strip() for x in igsns.split(delimiter)]

        logging.info('Start importing samples from IGSNs: {}'.format(igsns))

        sample_file_name = 'igsn_sample_{}.csv'.format(str(uuid.uuid4()))
        sample_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.makedirs(sample_file_dir)
        sample_file = os.path.join(sample_file_dir, sample_file_name)
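        # igsns_to_csv writes the samples for these IGSNs into a CSV that the
        # regular SESAR import path below can consume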

        igsns_to_csv(igsns, sample_file)

        params['sample_file'] = sample_file
        params['file_format'] = 'SESAR'

        output = self.import_samples(ctx, params)[0]
        #END import_samples_from_IGSN

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples_from_IGSN return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of Long,
           parameter "taxonomy_source" of String, parameter "incl_seq" of
           Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        if params.get('output_name'):
            output_name = params.get('output_name')
        else:
            # if output_name not specified use name of sample_set as output + "_OTUs"
            output_name = ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path':
                otu_path,
                'name':
                os.path.basename(otu_path),
                'label':
                "CSV with headers for OTU",
                'description':
                "CSV file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name':
            report_name,
            'workspace_name':
            params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }

        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of Long,
           parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls

        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'], acls)

        for new_user in params.get('new_users'):
            if params.get('is_admin'):
                acls['admin'].append(new_user)
            elif params.get('is_writer'):
                acls['write'].append(new_user)
            elif params.get('is_reader'):
                acls['read'].append(new_user)

        for sample in sample_set['samples']:
            sample_id = sample['id']
            status = update_acls(sample_url, sample_id, acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError(f"variable input_ref required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        output_file = os.path.join(export_package_dir,
                                   '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'], output_file,
                             output_file_format)

        # package it up
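        # (DataFileUtil.package_for_download zips the export directory,
        # uploads the archive to shock and returns its shock_id)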
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        :param params: instance of type "LinkReadsParams" -> structure:
           parameter "workspace_name" of String, parameter "workspace_id" of
           String, parameter "sample_set_ref" of String, parameter "links" of
           list of type "ReadsLink" (Create links between samples and reads
           objects.) -> structure: parameter "sample_name" of String,
           parameter "reads_ref" of String
        :returns: instance of type "LinkReadsOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String,
           parameter "links" of list of unspecified object
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        logging.info(params)

        ss = SampleService(self.sw_url, service_ver='dev')

        sample_set_ref = params['sample_set_ref']
        sample_set_obj = self.dfu.get_objects(
            {'object_refs': [sample_set_ref]})['data'][0]['data']
        sample_name_2_info = {d['name']: d for d in sample_set_obj['samples']}

        links = [(d['sample_name'][0], d['reads_ref'])
                 for d in params['links']]
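        # each link ties a reads object (by UPA) to the root node of the
        # sample's node tree; update=1 allows an existing link to be replaced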

        new_data_links = []
        for sample_name, reads_ref in links:
            sample_id = sample_name_2_info[sample_name]['id']
            version = sample_name_2_info[sample_name]['version']
            sample = ss.get_sample({
                'id': sample_id,
                'version': version,
            })
            ret = ss.create_data_link(
                dict(
                    upa=reads_ref,
                    id=sample_id,
                    version=version,
                    node=sample['node_tree'][0]['id'],
                    update=1,
                ))
            new_data_links.append(ret)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name':
            params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'links': new_data_links,
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Code example #27
0
class FileUtil:
    def _validate_import_file_params(self, params):
        """
        _validate_import_file_params:
            validates params passed to the MSA file import methods
        """
        # check for required parameters
        for p in ['msa_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        return file_path, params['workspace_name'], params['msa_name']

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        logging.info('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {
            'file_path': file_path,
            'pack': 'gzip',
            'make_handle': True,
        }
        shock_id = self.dfu.file_to_shock(file_to_shock_params)['shock_id']

        return shock_id

    @staticmethod
    def _infer_seq_type(msa):
        dna_set = {"A", "C", "G", "T", "-"}
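        # any character outside the DNA alphabet (plus the gap '-') marks the
        # alignment as protein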
        seq_chars = {char for record in msa for char in record.seq}
        if seq_chars - dna_set:
            return "protein"
        else:
            return "dna"

    def _file_to_data(self, file_path, format='fasta'):
        """Do the file conversion"""

        data = {
            'alignment': {},
            'default_row_labels': {},
            'row_order': [],
        }

        msa = AlignIO.read(file_path, format)
        data['alignment_length'] = msa.get_alignment_length()
        data['sequence_type'] = self._infer_seq_type(msa)

        for record in msa:
            data['row_order'].append(record.id)
            data['default_row_labels'][record.id] = record.description
            data['alignment'][record.id] = str(record.seq)

        message = f'A Multiple Sequence Alignment with {len(data["alignment"])} sequences and ' \
                  f'an alignment length of {data["alignment_length"]} was produced'

        return data, message

    def _generate_report(self, msa_ref, workspace_name, message):
        """
        _generate_report: generate summary report for upload
        """
        report_params = {
            'message': message,
            'objects_created': [{
                'ref': msa_ref,
                'description': 'Imported MSA'
            }],
            'workspace_name': workspace_name,
            'report_object_name': f'import_msa_file_{uuid.uuid4()}'
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _get_object(self, params):
        ret = self.dfu.get_objects({'object_refs':
                                    [params['input_ref']]})['data'][0]
        obj_name = ret['info'][1]
        obj_data = ret['data']
        return obj_name, obj_data

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)

    def import_fasta_file(self, params):

        file_path, workspace_name, msa_name = self._validate_import_file_params(
            params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data, message = self._file_to_data(file_path,
                                           params.get('file_format', 'fasta'))
        data['description'] = params.get('description', '')

        info = self.dfu.save_objects({
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseTrees.MSA',
                'name': msa_name,
                'data': data
            }]
        })[0]
        obj_ref = f"{info[6]}/{info[0]}/{info[4]}"

        returnVal = {'msa_obj_ref': obj_ref}

        report_output = self._generate_report(obj_ref, workspace_name, message)

        returnVal.update(report_output)

        return returnVal

    def msa_to_file(self, params, file_type='fasta'):
        if "input_ref" not in params:
            raise ValueError("input_ref not in supplied params")
        if "destination_dir" not in params:
            raise ValueError("destination_dir not in supplied params")

        obj_name, obj_data = self._get_object(params)
        keys = obj_data.get('row_order', list(obj_data['alignment'].keys()))
        row_labels = obj_data.get('default_row_labels', {})
        file_path = os.path.join(self.scratch, f'{obj_name}.{file_type}')
        seq_type = generic_protein if obj_data.get(
            'sequence_type') == "protein" else generic_dna

        msa = MultipleSeqAlignment([
            SeqRecord(Seq(obj_data['alignment'][key], seq_type),
                      id=key,
                      description=row_labels.get(key, '')) for key in keys
        ])
        AlignIO.write(msa, file_path, file_type)

        return {'file_path': file_path}

    def msa_to_clustal_file(self, params):
        raise NotImplementedError

    def export_file(self, params, file_type='fasta'):
        params['destination_dir'] = os.path.join(self.scratch,
                                                 str(uuid.uuid4()))
        os.mkdir(params['destination_dir'])

        file_path = self.msa_to_file(params, file_type)['file_path']

        return {'shock_id': self._upload_to_shock(file_path)}
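
A minimal usage sketch for the FileUtil importer above. The config values and the FASTA path are placeholders and assume a KBase SDK job environment (callback server, scratch directory, auth token); only the parameter names come from _validate_import_file_params and import_fasta_file.

# Hypothetical driver for FileUtil.import_fasta_file; all values are placeholders.
config = {
    'SDK_CALLBACK_URL': 'http://localhost:5000',   # assumed local callback service
    'scratch': '/kb/module/work/tmp',
    'KB_AUTH_TOKEN': 'xxxxxxxx',                   # placeholder token
}
importer = FileUtil(config)
result = importer.import_fasta_file({
    'input_file_path': '/kb/module/work/tmp/alignment.fasta',  # or input_shock_id / input_staging_file_path
    'workspace_name': 'my_workspace',
    'msa_name': 'my_msa',
    'description': 'Example MSA import',
    'file_format': 'fasta',
})
print(result['msa_obj_ref'], result['report_name'], result['report_ref'])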
Code example #28
0
File: DataUtil.py  Project: man4ish/GenericsAPI
class DataUtil:
    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}

        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [
                line.strip().split()[1:] for line in type_desc.split("\n")
                if line.startswith(f'@{tag}')
            ]
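        # Example: a type description containing the lines
        #     @unique data.row_ids
        #     @contains data.row_ids row_mapping
        # yields {'contains': [['data.row_ids', 'row_mapping']], 'rowsum': [],
        #         'unique': [['data.row_ids']], 'conditionally_required': []}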

        return constraints

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys"""
        contains_constraints = constraints.get('contains')
        # e.g. [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'],
        #       ['values(row_mapping)', 'row_attributemapping_ref:instances'],
        #       ['values(col_mapping)', 'col_attributemapping_ref:instances']]

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]
            # e.g. in_values == ['row_mapping']
            missing_key = True
            for in_value in in_values:
                # e.g. in_value == 'row_mapping'
                if in_value.startswith('values'):
                    search_value = re.search(r'\((.*)\)', in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)
        # constraints keeps the same shape on return, e.g.
        # {'contains': [...], 'rowsum': [], 'unique': [['data.row_ids'], ['data.col_ids']],
        #  'conditionally_required': [['row_attributemapping_ref', 'row_mapping'],
        #                             ['col_attributemapping_ref', 'col_mapping']]}
        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'."""
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        # 'data' is the raw matrix object: a dict with keys such as 'row_mapping',
        # 'col_mapping', 'row_attributemapping_ref', 'col_attributemapping_ref' and a
        # nested 'data' dict holding 'row_ids', 'col_ids' and 'values'.
        m_data = DotMap(data)
        # 'value' is a constraint expression such as 'data.row_ids',
        # 'values(row_mapping)' or 'row_attributemapping_ref:instances'.
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith(
                'values('):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()

            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{
                        'ref': obj_ref,
                        'included': [included]
                    }]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [
                            x.get(keys[2]) for x in ref_data.get(keys[0])
                        ]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(
            retrieve_data[:20]))
        # e.g. retrieve_data == ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5']

        return retrieve_data

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """
        # 'constraints' is the dict produced by _find_constraints/_filter_constraints,
        # keyed by 'contains', 'rowsum', 'unique' and 'conditionally_required'.
        validated = True
        failed_constraints = defaultdict(list)

        unique_constraints = constraints.get('unique')
        # e.g. [['data.row_ids'], ['data.col_ids']]
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            # e.g. ['GG_OTU_1', 'GG_OTU_2', 'GG_OTU_3', 'GG_OTU_4', 'GG_OTU_5']
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        contains_constraints = constraints.get('contains')
        # e.g. [['data.row_ids', 'row_mapping'], ['data.col_ids', 'col_mapping'],
        #       ['values(row_mapping)', 'row_attributemapping_ref:instances'],
        #       ['values(col_mapping)', 'col_attributemapping_ref:instances']]
        for contains_constraint in contains_constraints:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <=
                    set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(
                    " ".join(contains_constraint))

        conditional_constraints = constraints.get('conditionally_required')
        # e.g. [['row_attributemapping_ref', 'row_mapping'],
        #       ['col_attributemapping_ref', 'col_mapping']]
        for conditional_constraint in conditional_constraints:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [
                    key for key in required_keys if key not in data
                ]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = [
            'Object {} failed type checking:'.format(params.get('obj_name'))
        ]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append(
                'Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append(
                        'Column attribute mapping instances should contain all '
                        'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append(
                        'Row attribute mapping instances should contain all row '
                        'index from original data')

                error_msg.append(
                    'Object field [{}] should contain field [{}]'.format(
                        super_value, subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append(
                'If object field "{}" is present than object field(s) {} should '
                'also be present. Object is missing {}'.format(*failure))

        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [
            x['type_def'] for module in GENERICS_MODULES
            for x in self.wsClient.get_all_type_info(module)
        ]
        return returnVal

    def fetch_data(self, params):
        # e.g. params == {'obj_ref': '44071/21/241'}
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))
        # generics_service.fetch_data returns e.g.
        # {'data_matrix': '{"Sample1": {"GG_OTU_1": 0.0, "GG_OTU_2": 5.0, ...}, ...}'}
        return self.generics_service.fetch_data(params)

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {
            'validated': validated,
            'failed_constraints': failed_constraints
        }

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting saving object')

        obj_type = params.get('obj_type')

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({
            'mod': module_name
        }).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').items() if v)
        validate = self.validate_data({'obj_type': obj_type, 'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        workspace_name = params.get('workspace_name')
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        info = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": data,
                "name": params.get('obj_name')
            }]
        })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
Code example #29
0
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        self.ws_url = config['workspace-url']
        self.scratch = scratch
        self.callback_url = callback_url
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)


    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        # metadata['fileformat'] looks like 'VCFv4.2'; strip the 'VCFv' prefix
        version = float(reader.metadata['fileformat'][4:])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    'length': int(record.affected_end-record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info


    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        with (gzip.open if is_gz_file(vcf_filepath) else open)(vcf_filepath, 'rt') as vcf_handle:
            line = vcf_handle.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF.  ##fileformat line in meta is improperly formatted. "
                                 "Check VCF file specifications: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())

            return vcf_version

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError(f'Genome or Assembly reference not in input parameters:\n\n{params}')
        if 'vcf_staging_file_path' not in params:
            raise ValueError(f'VCF staging file path not in input parameters:\n\n{params}')


        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # setup directorys for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is only to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF version missing from file, fileformat line malformed, or version < 4.0. '
                             'The fileformat line must be the first line of the VCF file and use the correct '
                             'syntax. Check the VCF file specifications: https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if not file_output_chk == 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                        f.close()
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n"+vcf_filepath+"\n\File is validate as of vcf spec v4.0")
                        f.close()

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # if the vcf file is < v4.1 and valid, parsing the validator output above raises an IndexError
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                    f.close()
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath + "\n\File is validate as of vcf spec v4.0")
                    f.close()

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']

            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch,params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {}
                unpack['file_path'] = os.path.join(self.scratch,params['vcf_staging_file_path'])
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path 
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        function for creating sample attribute mapping file.
        """
        try:
            with open (vcf_file, 'r') as vcf_handle:
                Lines = vcf_handle.readlines()

                for line in Lines:
                    if(line.startswith("#CHROM")):
                       header = line.lstrip().split("\t")

                       try:
                          with open (sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                              attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")

                              for i in range(9,len(header)):
                                  attribute_mapping_handle.write("\t"+header[i])
                              attribute_mapping_handle.write("\n")


                              attribute_mapping_handle.write("label\t\t\t")
                              for j in range(9,len(header)):
                                  attribute_mapping_handle.write("\t"+header[j])
                              #attribute_mapping_handle.write("\n")
                       except IOError:
                           print("Could not write to file:", sample_attribute_mapping_file)

        except IOError:
               print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the vcf should be in assembly
        # but not all assembly chromosome ids should be in vcf


        if ('genome_ref' in params):
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if ('assembly_ref' in params):
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids =  self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')


        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list


        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typedef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;
        """

        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']


        contigs = []

        contig_infos = self.vcf_info['contigs']


        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs
   

    def _bgzip_vcf(self, vcf_filepath):

        if not os.path.exists(vcf_filepath):
           print (vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]
        
        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()        
        
        bgzip_file_path = vcf_filepath + ".gz"
        print (bgzip_file_path)
          
        return bgzip_file_path
  
 
    def _index_vcf(self, bgzip_file):
 
        output_dir = self.scratch

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
           print (bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]       
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()
         
        index_file_path = bgzip_filepath + ".tbi"
     
        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
           print (assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
          'ref': assembly_ref
        })
        return file
 
    def _construct_variation(self, params, contigs_info):
        
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase ui input parameters
            :param population: previously constructed sample population data
            :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)
      

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)


        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)


        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)
        
        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle' : vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] =  params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """

        print('Saving Variation to workspace...\n')

        if var:
            if not 'variation_object_name' in params:
                var_obj_name = 'variation_'+str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object is empty, cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):

        #params["sample_attribute_ref"] = ''  #just for testing
        if not params['sample_attribute_ref']:
           sample_attribute_mapping_file = os.path.join(self.scratch ,"sample_attribute.tsv")   #hardcoded for testing
           self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)
          
           logging.info("Uploading sample attribute file to ref")
           vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
               {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
           )
           shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
           ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
           import_params = {
                  'input_shock_id' : shock_id,
                  'output_ws_id': ws_id,
                  'output_obj_name': 'Sample_attribute'}

           ret = self.gapi.file_to_attribute_mapping(import_params)
           params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF validation
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
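
The compression and indexing steps above rely on the standard bgzip and tabix command-line tools. The sketch below shows the same two-step pipeline in isolation, assuming both binaries are on PATH; it mirrors _bgzip_vcf and _index_vcf but is not the module's code.

import subprocess

def bgzip_and_index(vcf_path):
    """Compress a VCF with bgzip and build a tabix index (sketch; assumes bgzip/tabix on PATH)."""
    subprocess.run(['bgzip', vcf_path], check=True)                # writes vcf_path + '.gz'
    bgzipped = vcf_path + '.gz'
    subprocess.run(['tabix', '-p', 'vcf', bgzipped], check=True)   # writes bgzipped + '.tbi'
    return bgzipped, bgzipped + '.tbi'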
Code example #30
0
class GFFUtils2:
    def __init__(self, config):
        self.callback_url = config['callback_url']
        self.shared_folder = config['scratch']
        #self.shared_folder = "/kb/module/work"
        self.ws_url = config['workspace-url']

        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)

    def _prep_gff(self, gff_file):
        outfile = os.path.join(self.genome_dir, 'out.gff')
        sortcmd = f'(grep ^"#"  {gff_file}; grep -v ^"#" {gff_file} | sort -k1,1 -k4,4n)'

        with open(outfile, 'w') as o:
            p = subprocess.Popen(sortcmd, shell=True, stdout=o)
            out, err = p.communicate()
            o.close()

        bgzip = subprocess.Popen(['bgzip', 'out.gff'], cwd=self.genome_dir)
        out2, err2 = bgzip.communicate()

        outfile += '.gz'

        return outfile

    def _construct_gff_from_json(self, json, gff_file_path, contig_base_lengths):
        with open(gff_file_path, 'w') as f:
            for feature in json:
                if feature['feature_type'].strip().upper() == 'GENE':
                    end = int(feature['location'][0]['start'])+int(feature['location'][0]['length'])

                    metainfo = "ID="+feature['feature_id']

                    if feature['function']:
                        metainfo += ';FUNCTION='+feature['function']

                    contig_id = str(feature['location'][0]['contig_id'])
                    start = int(feature['location'][0]['start'])

                    # TODO: Fix Plink reassignment of Chr prefixes
                    try:
                        global_pos = int(contig_base_lengths[contig_id]) + start
                    except KeyError:
                        try:
                            global_pos = int(contig_base_lengths[contig_id.capitalize()]) + start
                        except KeyError:
                            try:
                                global_pos = int(contig_base_lengths['Chr'+str(contig_id)]) + start
                            except KeyError:
                                try:
                                    global_pos = int(contig_base_lengths['Chr0'+str(contig_id)]) + start
                                except KeyError as e:
                                    pp(contig_base_lengths)
                                    pp(contig_id)
                                    raise KeyError(e)

                    """
                    Remove ontology for now
                    if feature['ontology_terms']:
                        metainfo += ';ONTOLOGY('

                        for k, v in feature['ontology_terms'].items():
                            metainfo += str(k) + ',' + str(v) + ':'

                        metainfo = metainfo[:-1]  # remove trailing ;
                        metainfo += ')'
                    """

                    constructed_gff_line = str(feature['location'][0]['contig_id']) + '\t' + \
                                           'KBase\tgene\t' + \
                                           str(feature['location'][0]['start']) + '\t' + \
                                           str(end) + '\t.\t' + \
                                           str(feature['location'][0]['strand']) + '\t' + \
                                           str(global_pos) + '\t' + \
                                           str(metainfo) + '\n'
                    f.write(constructed_gff_line)
            f.close()
        if os.path.exists(gff_file_path):
            return gff_file_path
        else:
            raise FileNotFoundError('Unable to create GFF file from genome JSON.')

    def _process_tabix_results(self, queryresult):
        queryinfo = queryresult[8].split(';')
        if len(queryinfo) >= 2:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", clean_tsv_data(queryinfo[1][9:])]
        elif len(queryinfo) == 1:
            extension = [clean_tsv_data(queryinfo[0][3:]), "NA", "NA"]
        else:
            extension = ['NA', 'NA', 'NA']
        return extension

    def find_gene_info(self, row):
        tb = tabix_query(self.sorted_gff, row["CHR"], int(row["POS"]), int(row["POS"]))
        tbresult = next(tb, None)
        if tbresult is None:
            tb2 = tabix_query(self.sorted_gff, 'chr' + row["CHR"], int(row["POS"]), int(row["POS"]))
            tbresult2 = next(tb2, None)
            if tbresult2 is None:
                tb3 = tabix_query(self.sorted_gff, 'chr0' + row["CHR"], int(row["POS"]), int(row["POS"]))
                tbresult3 = next(tb3, None)
                if tbresult3 is None:
                    if int(row["POS"]) < 500:
                        nstart = 0
                    else:
                        nstart = int(row["POS"]) - 500

                    neigh_tb = tabix_query(self.sorted_gff, row["CHR"], nstart, int(row["POS"]) + 500)
                    neigh_result = next(neigh_tb, None)

                    if neigh_result is None:
                        return pd.Series(['NA', 'NA', 'NA'], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                    else:
                        nq = self._process_tabix_results(neigh_result)
                        return pd.Series([nq[1], nq[0], nq[2]], index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
                else:
                    q3 = self._process_tabix_results(tbresult3)
                    return pd.Series(q3, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
            else:
                q2 = self._process_tabix_results(tbresult2)
                return pd.Series(q2, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])
        else:
            q = self._process_tabix_results(tbresult)
            return pd.Series(q, index=['GENEID', 'NEIGHBORGENE', 'FUNCTION'])

    def get_gwas_result_file(self, association_ref, association_name, p_value):
        #association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]['data']['data']
        association_obj = self.dfu.get_objects({'object_refs': [association_ref]})['data'][0]
        association_results = association_obj['data']["association_details"][0]["association_results"]
        result = "CHR\tSNP\tPOS\tP\tBP\n"
        for variation in association_results:
            if (float(variation[3]) > float(p_value)):
                continue
            result += str(variation[0]) + "\t"
            result += str(variation[1]) + "\t"
            result += str(variation[2]) + "\t"
            result += str(variation[3]) + "\t"
            result += str(variation[2]) + "\n"
        filepath = os.path.join(self.genome_dir, association_name)
        with open(filepath, "w") as file1: 
            file1.write(result) 
        return (filepath)

    def build_featureset(self, filepath, genome_ref, description, workspace_name, association_name, prefix):
        gene_ids = dict()
        element_ordering = list()
        elements = dict()
        skip_words = ["GENEID", "NEIGHBORGENE", "NA"]
        with open(filepath, 'r') as reader:
            for line in reader:
                fields = line.split("\t")
                condition1 = fields[5] not in skip_words
                condition2 = fields[5] not in elements
                condition3 = fields[6] not in skip_words
                condition4 = fields[6] not in elements
                if condition1 and condition2:
                    element_ordering.append(fields[5])
                    elements[fields[5]] = [genome_ref]
                if condition3 and condition4:
                    element_ordering.append(fields[6])
                    elements[fields[6]] = [genome_ref]
        featureset = dict()
        featureset['description'] = description
        featureset['element_ordering'] = element_ordering
        featureset['elements'] = elements
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        featureset_obj_name = prefix + str(association_name)

        save_info = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': featureset,
                         'name': featureset_obj_name}]
        })[0]
        obj_ref = "{0}/{1}/{2}".format(save_info[6], save_info[0], save_info[4])
        return obj_ref


   
    def annotate_GWAS_results(self, genome_ref, association_ref, workspace_name, prefix, p_value):
         
        # TODO: pass the outfile to the _prep_gff function instead of hardcoding it
        # TODO: remove hard-coded paths and create a new directory for each test function
        self.genome_dir_name = "_".join(genome_ref.split("/"))
        self.genome_dir = os.path.join(self.shared_folder, self.genome_dir_name)
        if not os.path.isdir(self.genome_dir):
            os.mkdir(self.genome_dir)
        sorted_gff_path = os.path.join(self.genome_dir, 'out.gff.gz')
        self.sorted_gff = sorted_gff_path

        if  not os.path.exists(sorted_gff_path):
            feature_num = self.gsu.search({'ref': genome_ref})['num_found']
            # get genome features for gff construction
            genome_features = self.gsu.search({
                'ref': genome_ref,
                'limit': feature_num,
                #'sort_by': [['feature_id', True]]
            })['features']

            assembly_ref = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': genome_ref
            }])[0]['data']['assembly_ref']

            # get assembly contigs for base length calculations
            assembly_contigs = self.wsc.get_object_subset([{
                'included': ['/contigs'],
                'ref': assembly_ref
            }])[0]['data']['contigs']

            contig_ids = list(assembly_contigs.keys())
            contig_ids.sort()

            contig_base_lengths = {}
            prev_length = 0

            for contig in contig_ids:
                contig_base_lengths[contig] = prev_length
                prev_length += assembly_contigs[contig]['length']

            gff_file = os.path.join(self.genome_dir, 'constructed.gff')
            constructed_gff = self._construct_gff_from_json(genome_features, gff_file, contig_base_lengths)
            self.sorted_gff = self._prep_gff(constructed_gff)
            tabix_index(self.sorted_gff)

        obj_info = self.wsc.get_object_info3({"objects": [{"ref": association_ref}]})
        association_name = obj_info["infos"][0][1]


        gwas_results_file = self.get_gwas_result_file(association_ref, association_name, p_value)

        gwas_results = pd.read_csv(gwas_results_file, sep='\t')

        gwas_results[['GENEID', 'NEIGHBORGENE', 'FUNCTION']] = \
           gwas_results.apply(self.find_gene_info, axis=1)

        new_results_path = os.path.abspath(os.path.join(gwas_results_file, '..'))
        fname = 'final_' +  association_name
        new_results_path = os.path.join(new_results_path, fname )
        gwas_results.to_csv(path_or_buf=new_results_path, sep='\t', index=False)
        description = "Genelist for GWAS results of trait " + association_name
         
        featureset_obj = self.build_featureset( new_results_path, genome_ref, description, workspace_name, association_name, prefix)
        
        return featureset_obj
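
annotate_GWAS_results converts per-contig positions to global ones by giving each contig the cumulative length of all contigs that sort before it. The standalone sketch below reproduces just that offset calculation with made-up contig lengths.

# Standalone sketch of the contig_base_lengths calculation used above.
def cumulative_contig_offsets(contig_lengths):
    """Return, for each contig, the summed length of all contigs sorted before it."""
    offsets, running_total = {}, 0
    for contig_id in sorted(contig_lengths):
        offsets[contig_id] = running_total
        running_total += contig_lengths[contig_id]
    return offsets

# e.g. {'Chr01': 1000, 'Chr02': 800} -> {'Chr01': 0, 'Chr02': 1000}
print(cumulative_contig_offsets({'Chr01': 1000, 'Chr02': 800}))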
Code example #31
0
class sample_uploader:
    '''
    Module Name:
    sample_uploader

    Module Description:
    A KBase module: sample_uploader
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.12"
    GIT_URL = "https://github.com/kbaseapps/sample_uploader"
    GIT_COMMIT_HASH = "5134b679279c84128b0ca5b684fa75dacf7dba59"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.workspace_url = config['workspace-url']
        self.scratch = config['scratch']
        # janky, but works for now
        self.sw_url = config.get('kbase-endpoint') + '/service_wizard'
        self.dfu = DataFileUtil(url=self.callback_url)
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        #END_CONSTRUCTOR
        pass

    def import_samples(self, ctx, params):
        """
        :param params: instance of type "ImportSampleInputs" -> structure:
           parameter "sample_set_ref" of String, parameter "sample_file" of
           String, parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "file_format" of String,
           parameter "description" of String, parameter "set_name" of String,
           parameter "header_row_index" of Long, parameter "id_field" of
           String, parameter "output_format" of String, parameter
           "taxonomy_source" of String, parameter "num_otus" of Long,
           parameter "incl_seq" of Long, parameter "otu_prefix" of String,
           parameter "share_within_workspace" of Long
        :returns: instance of type "ImportSampleOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "sample_set" of type "SampleSet" -> structure:
           parameter "samples" of list of type "sample_info" -> structure:
           parameter "id" of type "sample_id", parameter "name" of String,
           parameter "description" of String, parameter "sample_set_ref" of
           String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN import_samples
        print(f"Beginning sample import with following parameters:")
        print(f"params -- {params}")
        sample_set = {"samples": []}
        if params.get('sample_set_ref'):
            ret = self.dfu.get_objects(
                {'object_refs': [params['sample_set_ref']]})['data'][0]
            sample_set = ret['data']
            set_name = ret['info'][1]
            save_ws_id = params['sample_set_ref'].split('/')[0]
        else:
            if not params.get('set_name'):
                raise ValueError(
                    "Sample set name is required when creating a new SampleSet object."
                )
            set_name = params['set_name']
            save_ws_id = params.get('workspace_id')
        if params.get('header_row_index'):
            # Subtract 1 to convert the user-facing 1-based row number to a 0-based index.
            header_row_index = int(params["header_row_index"]) - 1
        else:
            header_row_index = 0
            if params.get('file_format') == "SESAR":
                header_row_index = 1

        username = ctx['user_id']

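        # Dispatch on file format: each branch passes its own column mapping,
        # column groups, date columns, and unit regexes to the shared
        # import_samples_from_file helper.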
        if params.get('file_format') == 'ENIGMA':
            # ENIGMA_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in ENIGMA_mappings['basic_columns']}
            # )
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], ENIGMA_mappings['column_mapping'],
                ENIGMA_mappings.get('groups',
                                    []), ENIGMA_mappings['date_columns'],
                ENIGMA_mappings.get('column_unit_regex',
                                    []), sample_set, header_row_index)
        elif params.get('file_format') == 'SESAR':
            # SESAR_mappings['verification_mapping'].update(
            #     {key: ("is_string", []) for key in SESAR_mappings['basic_columns']}
            # )
            sample_set = import_samples_from_file(
                params, self.sw_url, self.workspace_url, username,
                ctx['token'], SESAR_mappings['column_mapping'],
                SESAR_mappings.get('groups',
                                   []), SESAR_mappings['date_columns'],
                SESAR_mappings.get('column_unit_regex',
                                   []), sample_set, header_row_index)
        elif params.get('file_format') == 'KBASE':
            sample_set = import_samples_from_file(params, self.sw_url,
                                                  self.workspace_url, username,
                                                  ctx['token'], {}, [], [], [],
                                                  sample_set, header_row_index)
        else:
            raise ValueError(
                "Only SESAR, ENIGMA, and KBASE formats are currently supported for importing samples. "
                f"File of format {params.get('file_format')} not supported.")

        obj_info = self.dfu.save_objects({
            'id':
            save_ws_id,
            'objects': [{
                "name": set_name,
                "type": "KBaseSets.SampleSet",
                "data": sample_set
            }]
        })[0]

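        # The workspace object_info tuple is indexed positionally: [6] is the
        # workspace id, [0] the object id, and [4] the version, joined here as
        # a "wsid/objid/ver" reference string.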
        sample_set_ref = '/'.join(
            [str(obj_info[6]),
             str(obj_info[0]),
             str(obj_info[4])])
        sample_file_name = os.path.basename(
            params['sample_file']).split('.')[0] + '_OTU'

        # -- Format outputs below --
        # if output file format specified, add one to output
        if params.get('output_format') in ['csv', 'xls']:
            otu_path = sample_set_to_OTU_sheet(sample_set, sample_file_name,
                                               self.scratch, params)
            file_links = [{
                'path':
                otu_path,
                'name':
                os.path.basename(otu_path),
                'label':
                "OTU template file",
                'description':
                "file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }]
        else:
            file_links = []

        if params.get('incl_input_in_output'):
            sample_file = params.get('sample_file')
            if not os.path.isfile(sample_file):
                # try prepending '/staging/' to file and check then
                if os.path.isfile(os.path.join('/staging', sample_file)):
                    sample_file = os.path.join('/staging', sample_file)
                else:
                    raise ValueError(
                        f"input file {sample_file} does not exist.")
            sample_file_copy = os.path.join(self.scratch,
                                            os.path.basename(sample_file))
            shutil.copy(sample_file, sample_file_copy)
            file_links.append({
                "path":
                sample_file_copy,
                "name":
                os.path.basename(sample_file_copy),
                "label":
                "Input Sample file",
                "description":
                "Input file provided to create the sample set."
            })

        # create report
        report_client = KBaseReport(self.callback_url)
        report_name = "SampleSet_import_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'message':
            f"SampleSet object named \"{set_name}\" imported.",
            'objects_created': [{
                'ref': sample_set_ref
            }],
            'file_links':
            file_links,
            'report_object_name':
            report_name,
            'workspace_name':
            params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
            'sample_set': sample_set,
            'sample_set_ref': sample_set_ref
        }
        #END import_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method import_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def generate_OTU_sheet(self, ctx, params):
        """
        :param params: instance of type "GenerateOTUSheetParams" (Generate a
           customized OTU worksheet using a SampleSet input to generate the
           appropriate columns.) -> structure: parameter "workspace_name" of
           String, parameter "workspace_id" of Long, parameter
           "sample_set_ref" of String, parameter "output_name" of String,
           parameter "output_format" of String, parameter "num_otus" of Long,
           parameter "taxonomy_source" of String, parameter "incl_seq" of
           Long, parameter "otu_prefix" of String
        :returns: instance of type "GenerateOTUSheetOutputs" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN generate_OTU_sheet
        # first we download sampleset
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        if params.get('output_name'):
            output_name = params.get('output_name')
        else:
            # if output_name not specified use name of sample_set as output + "_OTUs"
            output_name = ret['info'][1] + "_OTUs"
        otu_path = sample_set_to_OTU_sheet(sample_set, output_name,
                                           self.scratch, params)
        report_client = KBaseReport(self.callback_url)
        report_name = "Generate_OTU_sheet_report_" + str(uuid.uuid4())
        report_info = report_client.create_extended_report({
            'file_links': [{
                'path':
                otu_path,
                'name':
                os.path.basename(otu_path),
                'label':
                "CSV with headers for OTU",
                'description':
                "CSV file with each column containing the assigned sample_id and sample "
                "name of each saved sample. Intended for uploading OTU data."
            }],
            'report_object_name':
            report_name,
            'workspace_name':
            params['workspace_name']
        })
        output = {
            'report_ref': report_info['ref'],
            'report_name': report_info['name'],
        }

        #END generate_OTU_sheet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method generate_OTU_sheet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def update_sample_set_acls(self, ctx, params):
        """
        :param params: instance of type "update_sample_set_acls_params" ->
           structure: parameter "workspace_name" of String, parameter
           "workspace_id" of Long, parameter "sample_set_ref" of String,
           parameter "new_users" of list of String, parameter "is_reader" of
           Long, parameter "is_writer" of Long, parameter "is_admin" of Long,
           parameter "share_within_workspace" of Long
        :returns: instance of type "update_sample_set_acls_output" ->
           structure: parameter "status" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN update_sample_set_acls

        # first get sample_set object
        sample_set_ref = params.get('sample_set_ref')
        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_url = get_sample_service_url(self.sw_url)

        acls = {'read': [], 'write': [], 'admin': []}

        if params.get('share_within_workspace'):
            acls = get_workspace_user_perms(self.workspace_url,
                                            params.get('workspace_id'),
                                            ctx['token'], ctx['user_id'], acls)

        for new_user in params.get('new_users'):
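            # Place each new user in exactly one ACL tier; admin takes
            # precedence over write access, and write takes precedence over read.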
            if params.get('is_admin'):
                acls['admin'].append(new_user)
            elif params.get('is_writer'):
                acls['write'].append(new_user)
            elif params.get('is_reader'):
                acls['read'].append(new_user)

        status = None
        for sample in sample_set['samples']:
            sample_id = sample['id']
            status = update_acls(sample_url, sample_id, acls, ctx['token'])
        output = {"status": status}
        #END update_sample_set_acls

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method update_sample_set_acls return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def export_samples(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (export function for
           samples) -> structure: parameter "input_ref" of String, parameter
           "file_format" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_samples
        if not params.get('input_ref'):
            raise ValueError(f"variable input_ref required")
        sample_set_ref = params.get('input_ref')
        output_file_format = params.get('file_format', 'SESAR')

        ret = self.dfu.get_objects({'object_refs':
                                    [sample_set_ref]})['data'][0]
        sample_set = ret['data']
        sample_set_name = ret['info'][1]
        sample_url = get_sample_service_url(self.sw_url)

        export_package_dir = os.path.join(self.scratch, "output")
        if not os.path.isdir(export_package_dir):
            os.mkdir(export_package_dir)
        output_file = os.path.join(export_package_dir,
                                   '_'.join(sample_set_name.split()) + ".csv")

        sample_set_to_output(sample_set, sample_url, ctx['token'], output_file,
                             output_file_format)

        # package it up
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {
            'shock_id': package_details['shock_id'],
            'result_dir': export_package_dir
        }
        #END export_samples

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_samples return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def link_reads(self, ctx, params):
        """
        Create links between samples and reads objects
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN link_reads
        ss = SampleService(self.sw_url, token=ctx['token'], service_ver='beta')
        sample_set_ref = params['sample_set_ref']
        sample_set = SampleSet(self.dfu, sample_set_ref)
        links = [(d['sample_name'], d['reads_ref']) for d in params['links']]

        for sample_name, reads_ref in links:
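            # Resolve the sample node for this sample name, then link the reads
            # object (upa) to that node via the Sample Service; update=1 allows
            # an existing data link to be replaced.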
            node_id, version, sample_id = sample_set.get_sample_info(
                sample_name)
            p = dict(
                upa=reads_ref,
                id=sample_id,
                version=version,
                node=node_id,
                update=1,
            )
            ret = ss.create_data_link(p)

        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report({
            'workspace_name':
            params['workspace_name'],
        })
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
        #END link_reads

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method link_reads return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
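# A hedged usage sketch (not part of the generated module above): how a caller
# might drive import_samples and generate_OTU_sheet outside the KBase SDK
# runtime. Every value below -- the callback URL, config entries, context dict,
# workspace id/name, and file path -- is a hypothetical placeholder; in a real
# deployment the SDK supplies these.
if __name__ == '__main__':
    import os

    os.environ.setdefault('SDK_CALLBACK_URL', 'http://localhost:9999')  # placeholder
    config = {
        'workspace-url': 'https://appdev.kbase.us/services/ws',   # placeholder
        'scratch': '/kb/module/work/tmp',                         # placeholder
        'kbase-endpoint': 'https://appdev.kbase.us/services',     # placeholder
    }
    ctx = {'user_id': 'some_user', 'token': 'REDACTED'}            # placeholder

    uploader = sample_uploader(config)

    # Import a SESAR-formatted spreadsheet into a new SampleSet.
    import_out = uploader.import_samples(ctx, {
        'sample_file': 'samples.csv',              # hypothetical staging file
        'file_format': 'SESAR',
        'set_name': 'my_sample_set',
        'workspace_id': 12345,
        'workspace_name': 'some_user:narrative_1',
        'header_row_index': 2,
    })[0]

    # Generate an OTU template sheet from the SampleSet just created.
    otu_out = uploader.generate_OTU_sheet(ctx, {
        'sample_set_ref': import_out['sample_set_ref'],
        'workspace_name': 'some_user:narrative_1',
        'output_name': 'my_sample_set_OTUs',
        'num_otus': 10,
        'incl_seq': 0,
        'otu_prefix': 'OTU',
    })[0]

    print(import_out['report_name'], otu_out['report_name'])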