def get_ws_admins(ws_id, ws_url, admin_token):
    ws = Workspace(url=ws_url, token=admin_token)
    perms = ws.administer({
        "command": "getPermissionsMass",
        "params": {
            "workspaces": [{
                "id": ws_id
            }]
        }
    })['perms'][0]

    return [user for user, perm in perms.items() if perm == "a"]
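# Hedged usage sketch (not from the original listing): how get_ws_admins
# might be called. It assumes the same Workspace client import as the snippet
# above; the endpoint, workspace id, and env var name are placeholders.
import os

admins = get_ws_admins(
    ws_id=12345,
    ws_url='https://kbase.us/services/ws',
    admin_token=os.environ.get('KB_ADMIN_AUTH_TOKEN'))
print('workspace admins:', admins)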
Example #2
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.shared_folder = config['scratch']
     self.du = DownloadUtils(self.callback_url)
     self.su = SimUtils()
     self.ru = ReadsUtils(self.callback_url)
     self.vu = VariationUtil(self.callback_url)
     self.eu = VcfEvalUtils()
     self.hu = htmlreportutils()
     self.ws_url = config['workspace-url']
     self.wsc = Workspace(self.ws_url)
     logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                         level=logging.INFO)
     #END_CONSTRUCTOR
     pass
Example #3
    def __init__(self, config, varfiles):
        self.dfu = DataFileUtil(config['SDK_CALLBACK_URL'])

        # TODO: input variable for workspace url
        self.wsc = Workspace("https://appdev.kbase.us/services/ws")
        self.scratch = config["scratch"]
        self._process_varfiles(varfiles)
Example #4
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('MotifFindermfmd'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         'token':
         token,
         'user_id':
         user_id,
         'provenance': [{
             'service': 'MotifFindermfmd',
             'method': 'please_never_use_it_in_production',
             'method_params': []
         }],
         'authenticated':
         1
     })
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = Workspace(cls.wsURL)
     cls.serviceImpl = MotifFindermfmd(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
Example #5
    def get_annotated_metagenome_assembly(self, ctx, params):
        """
        :param params: instance of type
           "getAnnotatedMetagenomeAssemblyParams" (ref - workspace reference
           to AnnotatedMetagenomeAssembly Object included_fields - The fields
           to include from the Object included_feature_fields -) ->
           structure: parameter "ref" of String, parameter "included_fields"
           of list of String, parameter "included_feature_fields" of list of
           String
        :returns: instance of type "getAnnotatedMetagenomeAssemblyOutput" ->
           structure:
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN get_annotated_metagenome_assembly
        ws = Workspace(self.config['workspace-url'], token=ctx['token'])
        ama_utils = AMAUtils(ws)
        output = ama_utils.get_annotated_metagenome_assembly(params)

        #END get_annotated_metagenome_assembly

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method get_annotated_metagenome_assembly return value ' +
                'output is not type dict as required.')
        # return the results
        return [output]
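# Hedged usage sketch (assumption, not from the original listing): the params
# shape this method accepts per its docstring; the ref and field names are
# illustrative placeholders.
#
# params = {
#     'ref': '12345/6/7',
#     'included_fields': ['assembly_ref'],
#     'included_feature_fields': ['id', 'type', 'functions'],
# }
# output = impl.get_annotated_metagenome_assembly(ctx, params)[0]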
Example #6
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'provenance': [
                         {'service': 'GenomeFileUtil',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)  # type: ignore
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.wsURL = cls.cfg['workspace-url']
     cls.ws = Workspace(cls.wsURL, token=token)
     cls.gfu = GenomeFileUtil(cls.cfg)
     # create one WS for all tests
     suffix = int(time.time() * 1000)
     cls.ws_name = "test_GenomeAnnotationAPI_" + str(suffix)
     ws_info = cls.ws.create_workspace({'workspace': cls.ws_name})
     cls.ws_id = ws_info[0]
Example #7
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('kb_model_analysis'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'user_id': user_id,
                     'provenance': [
                         {'service': 'kb_model_analysis',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = Workspace(cls.wsURL)
     cls.serviceImpl = kb_model_analysis(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     suffix = int(time.time() * 1000)
     cls.wsName = "test_ContigFilter_" + str(suffix)
     ret = cls.wsClient.create_workspace({'workspace': cls.wsName})  # noqa
     cls.wsId = ret[0]
Example #8
def get_object_names(ref_list, ws_url):
    """
    From a list of workspace references, returns a mapping from ref -> name of the object.
    """
    ws = Workspace(ws_url)
    obj_ids = [{"ref": ref} for ref in ref_list]
    info = ws.get_object_info3({"objects": obj_ids})
    name_map = dict()
    # might be in a data palette, so we can't just use the ref.
    # we already have the refs as passed previously, so use those for mapping, as they're in
    # the same order as what's returned.
    for i, obj_info in enumerate(info["infos"]):
        name_map[ref_list[i]] = obj_info[1]
    return name_map
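# Hedged usage sketch (assumption): resolving a couple of hypothetical
# workspace refs to object names with the helper above.
#
# names = get_object_names(['12345/1/1', '12345/2/3'],
#                          'https://kbase.us/services/ws')
# names -> {'12345/1/1': 'MyGenome', '12345/2/3': 'MyAssembly'} (illustrative)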
Example #9
    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token)
        self.au = AssemblyUtil(self.callback_url, token=self.token)
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)
Example #10
def read_narrative(ref: NarrativeRef, ws_client: Workspace) -> Dict:
    """
    Fetches a Narrative from the Workspace and returns the narrative
    document itself (the object's 'data' field) as a dict.

    This is mainly a wrapper around Workspace.get_objects2() that also
    validates that the fetched object really is a Narrative.

    Can raise the following errors:
        ValueError (if ref isn't a Narrative object),
        WorkspaceError if there's a Workspace issue (ref isn't valid, or token isn't valid)

    :param ref: a NarrativeRef
    :param ws_client: an authenticated Workspace client
    """
    try:
        narr_data = ws_client.get_objects2({'objects': [{'ref': str(ref)}]})
        nar = narr_data['data'][0]
        _validate_narr_type(nar['info'][2], ref)
        # nar['data'] = update_narrative(nar['data'])
        return nar['data']
    except ServerError as err:
        raise WorkspaceError(err, ref.wsid)
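# Hedged usage sketch (assumption): fetching a Narrative document with the
# wrapper above; the endpoint, token, and ref are placeholders.
#
# ws = Workspace(url='https://kbase.us/services/ws', token=token)
# nar = read_narrative(NarrativeRef.parse('43666/1/18'), ws)
# nar['cells']  # the narrative's cell list, as exercised by the tests below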
Example #11
 def extract_dna_sequences(self, token, params):
     """Takes an assembly/contig set ref and one or more locations and returns the DNA sequence
     from the assembly at that location while caching the assembly for efficiency"""
     if not params.get('ref'):
         raise ValueError("'ref', a reference to an assembly must be provided")
     ref = params['ref']
     locs = params.get('locations', [])
     ws = Workspace(self.ws_url, token=token)
     # This is also a cheap way to ensure that the object exists and that the user has access
     obj_type = ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0][2]
     if obj_type.split('-')[0] not in self.valid_types:
         raise ValueError(f'{obj_type} is not a valid input type for this function')
     assembly_dir = os.path.join(self.cache_dir, ref.replace('/', ':'))
     if not os.path.exists(assembly_dir):
         self._cache_assembly(ws, token, ref, assembly_dir)
     return [_extract_sequence(assembly_dir, loc) for loc in locs]
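# Hedged usage sketch (assumption): 'locations' is taken to be a list of
# KBase-style location tuples (contig id, 1-based start, strand, length);
# the instance, token, and ref are placeholders.
#
# params = {'ref': '12345/6/7',
#           'locations': [['contig_1', 100, '+', 50]]}
# sequences = api.extract_dna_sequences(token, params)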
Example #12
    def search_orthologs_from_pangenome(self, token, ref, query, sort_by,
                                        start, limit, num_found):

        search_object = 'orthologs'
        info_included = [
            'id', 'type', 'function', 'md5', 'protein_translation', 'orthologs'
        ]
        table_indexer = TableIndexer(token, self.ws_url)

        ret = table_indexer.run_search(ref, self.pangenome_index_dir,
                                       self.ORTHOLOGS_SUFFIX, search_object,
                                       info_included, query, sort_by, start,
                                       limit, num_found, self.debug)

        for orthologs in ret['orthologs']:
            orthologs_string = orthologs['orthologs']
                if orthologs_string:
                    # the index stores a string repr of a list/tuple; eval
                    # here assumes the indexed data is trusted
                    orthologs['orthologs'] = list(eval(orthologs_string))
                if not isinstance(orthologs['orthologs'][0], list):
                    orthologs['orthologs'] = [orthologs['orthologs']]

        ws = Workspace(self.ws_url, token=token)
        genome_feature_function_map = {}
        for orthologs in ret['orthologs']:
            for orthologs_obj in orthologs['orthologs']:
                gene_id = orthologs_obj[0]

                if gene_id in genome_feature_function_map:
                    orthologs_obj.append(
                        genome_feature_function_map.get(gene_id))
                else:
                    included = ["/features/[*]/function", "/features/[*]/id"]
                    object_info = ws.get_objects2({
                        'objects': [{
                            'ref': orthologs_obj[2],
                            'included': included
                        }]
                    })['data'][0]['data']

                    for feature in object_info['features']:
                        genome_feature_function_map.update(
                            {feature.get('id'): feature.get('function')})

                    orthologs_obj.append(
                        genome_feature_function_map.get(gene_id))

        return ret
Example #13
    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token, service_ver='release')
        self.au = AssemblyUtil(self.callback_url, token=self.token, service_ver='release')
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir

        self.spades_version = 'SPAdes-' + os.environ['SPADES_VERSION']
Example #14
    def build_bin_summary_file_from_binnedcontigs_obj(self, input_ref, bin_dir, bin_basename, fasta_extension):

        # read bin info from obj
        ws = Workspace(self.ws_url)
        try:
            binned_contig_obj = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
        except Exception as e:
            raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()
        bin_summary_info = dict()

        # bid in object is full name of contig fasta file.  want just the number
        for bin_item in binned_contig_obj['bins']:
            #print ("BIN_ITEM[bid]: "+bin_item['bid'])  # DEBUG
            bin_ID = re.sub(r'^[^.]+\.', '', bin_item['bid'].replace('.'+fasta_extension, ''))

            #print ("BIN_ID: "+bin_ID)  # DEBUG
            bin_summary_info[bin_ID] = { 'n_contigs': bin_item['n_contigs'],
                                         'gc': round (100.0 * float(bin_item['gc']), 1),
                                         'sum_contig_len': bin_item['sum_contig_len'],
                                         'cov': round (100.0 * float(bin_item['cov']), 1)
                                     }
        # write summary file for just those bins present in bin_dir
        header_line = ['Bin name', 'Completeness', 'Genome size', 'GC content']
        bin_fasta_files_by_bin_ID = self.get_bin_fasta_files(bin_dir, fasta_extension)
        bin_IDs = []
        for bin_ID in sorted(bin_fasta_files_by_bin_ID.keys()):
            bin_ID = re.sub(r'^[^.]+\.', '', bin_ID.replace('.'+fasta_extension, ''))
            bin_IDs.append(bin_ID)
        summary_file_path = os.path.join (bin_dir, bin_basename+'.'+'summary')

        print ("writing filtered binned contigs summary file "+summary_file_path)
        with open (summary_file_path, 'w') as summary_file_handle:
            print ("\t".join(header_line))
            summary_file_handle.write("\t".join(header_line)+"\n")
            for bin_ID in bin_IDs:
                #print ("EXAMINING BIN SUMMARY INFO FOR BIN_ID: "+bin_ID)  # DEBUG
                bin_summary_info_line = [ bin_basename+'.'+str(bin_ID)+'.'+fasta_extension,
                                          str(bin_summary_info[bin_ID]['cov'])+'%',
                                          str(bin_summary_info[bin_ID]['sum_contig_len']),
                                          str(bin_summary_info[bin_ID]['gc'])
                                      ]
                print ("\t".join(bin_summary_info_line))
                summary_file_handle.write("\t".join(bin_summary_info_line)+"\n")

        return summary_file_path
Example #15
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.ws = Workspace(self.ws_url, token=self.token)
     self.dfu = DataFileUtil(self.callback_url)
     self.scratch = config['scratch']
Example #16
    def set_up_test_env(self):

        self.logger.info('setting up test environment...')
        token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        suffix = int(time.time() * 1000)

        self.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('Templatomatic'):
            self.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = self.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        self.ctx = MethodContext(None)
        self.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'kb_Msuite',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = self.cfg['scratch']
        self.appdir = self.cfg['appdir']

        self.wsURL = self.cfg['workspace-url']
        self.wsClient = Workspace(self.wsURL)
        self.wsName = "test_Templatomatic_" + str(suffix)
        self.ws_info = self.wsClient.create_workspace(
            {'workspace': self.wsName})
        self.ws_id = self.ws_info[0]  # the workspace id is the first element of the info tuple
        self.dfu = DataFileUtil(self.callback_url)

        self.logger.info('set up new workspace: ' + self.wsName)

        self.env_set_up = True
        self.logger.info('Finished test environment set up')
Example #17
 def __init__(self, config):
     self.cfg = config
     self.gi = GenomeInterface(config)
     self.dfu = DataFileUtil(config.callbackURL)
     self.aUtil = AssemblyUtil(config.callbackURL)
     self.ws = Workspace(config.workspaceURL)
     self._messages = []
     self.time_string = str(
         datetime.datetime.fromtimestamp(
             time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
     yml_text = open('/kb/module/kbase.yml').read()
     self.version = re.search(r"module-version:\n\W+(.+)\n",
                              yml_text).group(1)
     self.generate_parents = False
     self.generate_ids = False
     self.genes = OrderedDict()
     self.mrnas = OrderedDict()
     self.cdss = OrderedDict()
     self.noncoding = []
     self.ontologies_present = defaultdict(dict)
     self.ontology_events = list()
     self.skiped_features = Counter()
     self.feature_counts = Counter()
     self.orphan_types = Counter()
     self.contig_seq = {}
     self.circ_contigs = set()
     self.features_spaning_zero = set()
     self.genome_warnings = []
     self.genome_suspect = False
     self.defects = Counter()
     self.spoofed_genes = 0
     self.excluded_features = ('source', 'exon', 'fasta_record')
     self.ont_mappings = load_ontology_mappings('/kb/module/data')
     self.code_table = 11
     self.re_api_url = config.re_api_url
     # dict with feature 'id's that have been used more than once.
     self.used_twice_identifiers = {}
     self.default_params = {
         'source':
         'Genbank',
         'taxon_wsname':
         self.cfg.raw['taxon-workspace-name'],
         'taxon_lookup_obj_name':
         self.cfg.raw['taxon-lookup-object-name'],
         'ontology_wsname':
         self.cfg.raw['ontology-workspace-name'],
         'ontology_GO_obj_name':
         self.cfg.raw['ontology-gene-ontology-obj-name'],
         'ontology_PO_obj_name':
         self.cfg.raw['ontology-plant-ontology-obj-name'],
         'release':
         None,
         'genetic_code':
         11,
         'generate_ids_if_needed':
         0,
         'metadata': {}
     }
Example #18
    def check_assembly_cache(self, ref, token):
        ws = Workspace(self.ws_url, token=token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(self.assembly_index_dir, inner_chsum + self.ASSEMBLY_SUFFIX + ".tsv.gz")
        if not os.path.isfile(index_file):
            if self.debug:
                print("    Loading WS object...")
                t1 = time.time()

            if 'KBaseGenomeAnnotations.Assembly' in info[2]:
                included = ["/contigs"]
                assembly_data = ws.get_objects2(
                    {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
                contigs = list(assembly_data['contigs'].values())
                self.save_assembly_tsv(contigs, inner_chsum)

            elif 'KBaseGenomes.ContigSet' in info[2]:
                included = ["/contigs/[*]/id",
                            "/contigs/[*]/length",
                            "/contigs/[*]/md5",
                            "/contigs/[*]/description"]
                cs_data = ws.get_objects2(
                    {'objects': [{'ref': ref, 'included': included}]})['data'][0]['data']
                contigs = []
                for c in cs_data['contigs']:
                    this_contig_data = {'contig_id': ''}
                    if 'id' in c:
                        this_contig_data['contig_id'] = c['id']
                    if 'md5' in c:
                        this_contig_data['md5'] = c['md5']
                    if 'length' in c:
                        this_contig_data['length'] = c['length']
                    if 'description' in c:
                        this_contig_data['description'] = c['description']
                    contigs.append(this_contig_data)

                self.save_assembly_tsv(contigs, inner_chsum)
            else:
                raise ValueError('The "ref" is not an Assembly or ContigSet data object. '
                                 'It was a ' + info[2])

            if self.debug:
                print(f"    (time={time.time() - t1})")
        return inner_chsum
Example #19
    def create_variation_report(self, params):
        '''
        Create a small HTML table report with the number of
        strains/genotypes and the number of variants.
        :param params: dict with a 'variation_ref' key pointing to the
            Variation object to report on
        '''
        ws = Workspace(self.ws_url)

        subset = ws.get_object_subset([{
            'included': ['/numgenotypes', 'numvariants'],
            'ref':
            params['variation_ref']
        }])

        numgenotypes = subset[0]['data']['numgenotypes']
        numvariants = subset[0]['data']['numvariants']

        variation_table = """
        <table>
           <thead>
               <tr>
                   <td>Number of strains/genotypes</td>
                   <td> ##numgenotypes##</td>
               </tr>
            </thead>
            <tbody>
                <tr>
                    <td>Number of variants</td>
                    <td>##numvariants##</td>
                </tr>
            </tbody>
        </table>
        """
        variation_table = variation_table.replace("##numgenotypes##",
                                                  str(numgenotypes))
        variation_table = variation_table.replace("##numvariants##",
                                                  str(numvariants))

        session = str(uuid.uuid4())
        htmlreport_dir = (os.path.join(self.scratch, session))
        os.mkdir(htmlreport_dir)
        index_html_path = os.path.join(htmlreport_dir, "index.html")
        with open(index_html_path, "w") as f:
            f.write(variation_table)
        return htmlreport_dir
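# Hedged usage sketch (assumption): the method above needs only a
# 'variation_ref' in params and returns the directory holding index.html.
#
# report_dir = impl.create_variation_report({'variation_ref': '12345/8/1'})
# os.path.join(report_dir, 'index.html')  # the genotype/variant table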
Example #20
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.token = config['KB_AUTH_TOKEN']
     self.user_id = config['USER_ID']
     self.dfu = DataFileUtil(self.callback_url)
     self.hs = AbstractHandle(config['handle-service-url'])
     self.ws_client = Workspace(config['workspace-url'])
     self.shock_url = config['shock-url']
Example #21
 def test_read_narrative_bad_client(self, rqm):
     ws_id = 908
     mock_ws_bad(rqm, "Can't fetch object")
     with self.assertRaises(WorkspaceError) as e:
         read_narrative(
             NarrativeRef.parse("908/1/1"),
             Workspace(url=self.cfg["workspace-url"], token=self.token))
     self.assertIn(str(ws_id), str(e.exception))
     self.assertIn("Can't fetch object", str(e.exception))
Example #22
 def __init__(self, config):
     self.ws_url = config['workspace-url']
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.scratch = config['scratch']
     self.dfu = DataFileUtil(self.callback_url)
     self.gsu = GenomeSearchUtil(self.callback_url)
     self.ws = Workspace(self.ws_url, token=self.token)
Example #23
    def export_genome_features_protein_to_fasta(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_features_protein_to_fasta
        print('export_genome_features_protein_to_fasta -- parameters = ')

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot run export_genome_features_protein_to_fasta - no "input_ref" field defined.'
            )

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({
            'objects': [{
                'ref': params['input_ref']
            }],
            'includeMetadata': 0,
            'ignoreErrors': 0
        })[0]

        genome_to_protein_fasta_params = {'genome_ref': params['input_ref']}

        # export to file (building from KBase Genome Object); assign to
        # 'output' so the type check below passes -- the original snippet
        # bound the result to an unused 'result' variable, leaving 'output'
        # undefined
        output = self.genome_to_genbank(
            ctx, genome_to_protein_fasta_params)[0]['genbank_file']

        #END export_genome_features_protein_to_fasta

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method export_genome_features_protein_to_fasta return value '
                + 'output is not type dict as required.')
        # return the results
        return [output]
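# Hedged usage sketch (assumption): ExportParams per the docstring above;
# the impl instance, ctx, and ref are placeholders.
#
# output = impl.export_genome_features_protein_to_fasta(
#     ctx, {'input_ref': '12345/6/7'})[0]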
Example #24
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_functional_enrichment_1'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'kb_functional_enrichment_1',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(cls.wsURL)
        cls.serviceImpl = kb_functional_enrichment_1(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.fe1_runner = FunctionalEnrichmentUtil(cls.cfg)
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.gaa = GenomeAnnotationAPI(cls.callback_url)
        cls.ws = Workspace(cls.wsURL, token=token)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_functional_enrichment_1_" + str(suffix)
        cls.wsClient.create_workspace({'workspace': cls.wsName})

        cls.prepare_data()
Example #25
 def __init__(self, config):
     self.endpoint = config['kbase-endpoint']
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.ws_client = Workspace(config['workspace-url'])
     self.auth_client = KBaseAuth(config['auth-service-url'])
     logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                         level=logging.INFO)
Example #26
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('KBaseReport'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         'token':
         token,
         'user_id':
         user_id,
         'provenance': [{
             'service': 'KBaseReport',
             'method': 'please_never_use_it_in_production',
             'method_params': []
         }],
         'authenticated':
         1
     })
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = Workspace(cls.wsURL)
     cls.serviceImpl = KBaseReport(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     # Custom stuff below
     dirname = os.path.dirname(__file__)
     cls.dfu = DataFileUtil(cls.callback_url)
     cls.a_html_path = os.path.join(cls.scratch, 'a_html')
     cls.b_html_path = os.path.join(cls.scratch, 'b_html')
     shutil.copytree(os.path.join(dirname, 'data', 'a_html'),
                     cls.a_html_path)
     shutil.copytree(os.path.join(dirname, 'data', 'b_html'),
                     cls.b_html_path)
     cls.a_file_path = os.path.join(cls.scratch, 'a.txt')
     cls.b_file_path = os.path.join(cls.scratch, 'b.txt')
     shutil.copy2(os.path.join(dirname, 'data/a.txt'), cls.a_file_path)
     shutil.copy2(os.path.join(dirname, 'data/b.txt'), cls.b_file_path)
     # Upload files to shock
     cls.a_file_shock = cls.dfu.file_to_shock({
         'file_path': cls.a_file_path,
         'make_handle': 0
     })
     cls.b_file_shock = cls.dfu.file_to_shock({
         'file_path': cls.b_file_path,
         'make_handle': 0
     })
Example #27
 def test_read_narrative_not_narrative(self, rqm):
     ref = "43666/3/1"
     ref_to_file = {ref: "data/43666/report-43666.3.1.json"}
     set_up_ok_mocks(rqm, ref_to_file=ref_to_file)
     with self.assertRaises(ValueError) as e:
         read_narrative(
             NarrativeRef.parse(ref),
             Workspace(url=self.cfg["workspace-url"], token=self.token))
     self.assertIn(
         f"Expected a Narrative object with reference {ref}, got a KBaseReport.Report-3.0",
         str(e.exception))
Example #28
    def __init__(self, scratch_dir, workspace_url, callback_url, srv_wiz_url,
                 provenance):
        self.scratch_dir = scratch_dir
        self.workspace_url = workspace_url
        self.callback_url = callback_url
        self.srv_wiz_url = srv_wiz_url
        self.provenance = provenance

        # from the provenance, extract out the version to run by exact hash if possible
        self.my_version = 'release'
        if len(provenance) > 0:
            if 'subactions' in provenance[0]:
                self.my_version = self.get_version_from_subactions(
                    'kb_Bwa', provenance[0]['subactions'])
        print('Running kb_Bwa version = ' + self.my_version)

        self.ws = Workspace(self.workspace_url)
        self.bwa = BwaRunner(self.scratch_dir)
        self.parallel_runner = KBParallel(self.callback_url)
        self.qualimap = kb_QualiMap(self.callback_url)
Example #29
 def test_read_narrative_ok(self, rqm):
     ref = "43666/1/18"
     ref_to_file = {ref: "data/43666/narrative-43666.1.18.json"}
     set_up_ok_mocks(rqm, ref_to_file=ref_to_file)
     nar = read_narrative(
         NarrativeRef.parse("43666/1/18"),
         Workspace(url=self.cfg["workspace-url"], token=self.token))
     # spot check that it's loaded and formatted
     self.assertIsNotNone(nar)
     self.assertIn("cells", nar)
     self.assertEqual(len(nar["cells"]), 9)
Example #30
    def setUpClass(cls):
        cls.token = environ.get('KB_AUTH_TOKEN')
        cls.callbackURL = environ.get('SDK_CALLBACK_URL')
        print('CB URL: ' + cls.callbackURL)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            cls.token,
            'provenance': [{
                'service': 'kb_unicycler',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_unicycler'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.cfg["SDK_CALLBACK_URL"] = cls.callbackURL
        cls.cfg["KB_AUTH_TOKEN"] = cls.token
        cls.wsURL = cls.cfg['workspace-url']
        cls.shockURL = cls.cfg['shock-url']
        cls.hs = HandleService(url=cls.cfg['handle-service-url'],
                               token=cls.token)
        # cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
        cls.wsClient = Workspace(cls.wsURL, token=cls.token)
        wssuffix = int(time.time() * 1000)
        wsName = "test_kb_unicycler_" + str(wssuffix)
        cls.wsinfo = cls.wsClient.create_workspace({'workspace': wsName})
        print('created workspace ' + cls.getWsName())

        cls.PROJECT_DIR = 'unicycler_outputs'
        cls.scratch = cls.cfg['scratch']
        if not os.path.exists(cls.scratch):
            os.makedirs(cls.scratch)
        cls.prjdir = os.path.join(cls.scratch, cls.PROJECT_DIR)
        if not os.path.exists(cls.prjdir):
            os.makedirs(cls.prjdir)
        cls.serviceImpl = kb_unicycler(cls.cfg)

        cls.readUtilsImpl = ReadsUtils(cls.callbackURL, token=cls.token)
        cls.dfuClient = DataFileUtil(url=cls.callbackURL, token=cls.token)
        cls.staged = {}
        cls.nodes_to_delete = []
        cls.handles_to_delete = []
        cls.setupTestData()
        print(
            '\n\n=============== Starting Unicycler tests ==================')
Example #31
    def check_object_cache(self, ref, search_object, info_included,
                           index_dir, object_suffix, debug):
        ws = Workspace(self.ws_url, token=self.token)
        info = ws.get_object_info3({"objects": [{"ref": ref}]})['infos'][0]
        inner_chsum = info[8]
        index_file = os.path.join(index_dir,
                                  inner_chsum + object_suffix + ".tsv.gz")
        if not os.path.isfile(index_file):
            if debug:
                print("    Loading WS object...")
                t1 = time.time()

            included = self.build_info_included(search_object, info_included)
            obj = ws.get_objects2({'objects': [{'ref': ref,
                                                'included': included}]})['data'][0]['data']
            self.save_object_tsv(obj[search_object], inner_chsum, info_included,
                                 index_dir, object_suffix)
            if debug:
                print("    (time=" + str(time.time() - t1) + ")")
        return inner_chsum
Example #32
def verify_public_narrative(workspace_url: str, ws_id: int) -> None:
    """
    Raises a PermissionError if the workspace is not public (i.e. user '*' has 'r' access).
    Creating a Static Narrative is only permitted on public Narratives.
    If the Narrative is public, this returns None.

    Raises a WorkspaceError if anything goes wrong with the lookup.

    :param workspace_url: str - the workspace endpoint url
    :param ws_id: int - the workspace to check
    """
    ws_client = Workspace(url=workspace_url)
    try:
        perms = ws_client.get_permissions({"id": ws_id})
    except ServerError as err:
        raise WorkspaceError(err, ws_id)
    if perms.get("*", "n") not in ["r", "w", "a"]:
        err = f"Workspace {ws_id} must be publicly readable to make a Static Narrative"
        logging.getLogger("StaticNarrative").error(err)
        raise PermissionError(err)
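# Hedged usage sketch (assumption): gating Static Narrative creation on
# workspace visibility; the endpoint and workspace id are placeholders.
#
# try:
#     verify_public_narrative('https://kbase.us/services/ws', 43666)
# except PermissionError:
#     pass  # ask the owner to make the Narrative public first
# except WorkspaceError:
#     pass  # the workspace lookup itself failed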
Example #33
 def __init__(self, config):
     # NOTE: assumes 'workdir' is defined as a class attribute elsewhere
     os.makedirs(self.workdir, exist_ok=True)
     self.config = config
     self.timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.genome_api = GenomeAnnotationAPI(self.callback_url)
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url)
     self.kbr = KBaseReport(self.callback_url)
     self.ws_client = Workspace(config["workspace-url"])
Example #34
 def __init__(self, config):
     self.cfg = config
     self.scratch = config['scratch']
     self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
     self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     self.ws = Workspace(config["workspace-url"])
Example #35
class AveExpressionMatrixBuilder:

    def _validate_calculate_average_expression_matrix_params(self, params):
        """
        _validate_calculate_average_expression_matrix_params:
                validates params passed to calculate_average_expression_matrix method
        """

        log('start validating calculate_average_expression_matrix params')

        # check for required parameters
        for p in ['expression_matrix_ref', 'output_suffix', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _generate_report(self, expression_matrix_ref, workspace_name):
        """
        _generate_report: generate report
        """

        objects_created = [{'ref': expression_matrix_ref,
                            'description': 'Average ExpressionMatrix'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         # 'html_links': output_html_files,
                         # 'direct_html_link_index': 0,
                         'html_window_height': 366,
                         'report_object_name': 'kb_ave_expr_matrix_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _save_expression_matrix(self, em_data, em_obj_name, workspace_name):
        """
        _save_expression_matrix: saving ExpressionMatrix
        """

        try:
            log('saving ExpressionMatrix [{}]'.format(em_obj_name))
        
            data_type = 'KBaseFeatureValues.ExpressionMatrix'
            obj_info = self.dfu.save_objects({'id': self.dfu.ws_name_to_id(workspace_name),
                                              'objects': [{'type': data_type,
                                                           'data': em_data,
                                                           'name': em_obj_name}]})[0]
        except Exception as e:
            log(e)
            raise Exception('Failed Saving ExpressionMatrix to Workspace')

        expression_matrix_ref = str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])

        return expression_matrix_ref

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.scratch = config['scratch']

    def calculate_average_expression_matrix(self, params):
        """
        calculate_average_expression_matrix: create an average ExpressionMatrix object
                                             from an ExpressionMatrix object

        required params:
        expression_matrix_ref: ExpressionMatrix object reference
        output_suffix: output average ExpressionMatrix name suffix
        workspace_name: the name of the workspace it gets saved to
        
        return:
        average_expression_matrix_ref: generated average ExpressionMatrix object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning AveExpressionMatrixBuilder.calculate_average_expression_matrix\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_calculate_average_expression_matrix_params(params)

        expression_matrix_ref = params.get('expression_matrix_ref')
        expression_matrix = self.ws.get_objects2({'objects':
                                                  [{'ref': 
                                                    expression_matrix_ref}]})['data'][0]

        expression_matrix_data = expression_matrix['data']
        expression_matrix_info = expression_matrix['info']

        condition_map = expression_matrix_data['condition_mapping']

        ori_data = expression_matrix_data['data']
        ori_col_ids = ori_data['col_ids']
        ori_row_ids = ori_data['row_ids']
        ori_values = ori_data['values']

        labels = list(condition_map.keys())

        if set(labels) != set(ori_col_ids):
            error_msg = 'labels in condition_mapping do not match the '
            error_msg += 'expression matrix column ids\n'
            error_msg += 'column ids: {}\n'.format(ori_col_ids)
            error_msg += 'labels: {}'.format(labels)
            raise ValueError(error_msg)

        condition_pos = {}

        for label, condition in condition_map.items():
            if condition not in condition_pos:
                condition_pos.update({condition: [ori_col_ids.index(label)]})
            else:
                condition_list = condition_pos[condition]
                condition_list.append(ori_col_ids.index(label))
                condition_pos.update({condition: condition_list})

        conditions = list(condition_pos.keys())

        ave_values = []
        for ori_value in ori_values:
            ave_value = [None] * len(conditions)
            for condition, poss in condition_pos.items():
                ave_pos = conditions.index(condition)
                sum_value = 0.0
                for pos in poss:
                    sum_value += round(float(ori_value[pos]), 3) 
                average = sum_value / len(poss)
                ave_value[ave_pos] = round(average, 2)

            ave_values.append(ave_value)

        average_data = {}
        average_data.update({'row_ids': ori_row_ids})
        average_data.update({'col_ids': conditions})
        average_data.update({'values': ave_values})

        em_data = {}
        genome_ref = expression_matrix_data.get('genome_ref')
        if genome_ref:
            em_data.update({'genome_ref': genome_ref})
        em_data.update({'scale': expression_matrix_data.get('scale')})
        em_data.update({'type': expression_matrix_data.get('type')})
        em_data.update({'feature_mapping': expression_matrix_data.get('feature_mapping')})
        em_data.update({'condition_mapping': expression_matrix_data.get('condition_mapping')})
        em_data.update({'data': average_data})

        expression_matrix_name = expression_matrix_info[1]
        ave_expression_matrix_name = expression_matrix_name + params.get('output_suffix')

        workspace_name = params.get('workspace_name')

        ave_expression_matrix_ref = self._save_expression_matrix(em_data, 
                                                                 ave_expression_matrix_name, 
                                                                 workspace_name)

        returnVal = {'average_expression_matrix_ref': ave_expression_matrix_ref}

        report_output = self._generate_report(ave_expression_matrix_ref,
                                              workspace_name)
        returnVal.update(report_output)

        return returnVal
Example #36
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))
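    # Hedged usage sketch (assumption): required keys raise, unexpected keys
    # only produce a warning.
    #
    # validate_params({'ref': '1/2/3', 'extra': 1}, expected={'ref'})
    # -> logs "Unexpected parameter extra supplied"
    # validate_params({}, expected={'ref'})  # -> ValueError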

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                  [{'ref':
                                                   diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            # assumes at least one set item matches condition_label_pair;
            # otherwise genome_id and selected_diff_expression_ref below
            # would be unbound
            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change_cutoff = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change_cutoff}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            (float(row_fold_change_cutoff) >=
                                             comp_fold_change_cutoff))

                    down_matches_condition = (p_value_condition and q_value_condition and
                                              (float(row_fold_change_cutoff) <=
                                               -comp_fold_change_cutoff))

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))
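
    # A worked example of the cutoff logic above (hypothetical numbers): with
    # comp_p_value=0.05, comp_q_value=0.05 and comp_fold_change_cutoff=1.5, a row
    # ('geneA', log2_fold_change=-2.3, p_value=0.01, q_value=0.02) passes both
    # significance tests and satisfies -2.3 <= -1.5, so 'geneA' lands in
    # down_feature_ids; a row with log2_fold_change=0.8 matches neither direction
    # and is dropped.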

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generate filtered ExpressionMatrix object
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                     [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2],
                          'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # save the DifferentialExpressionMatrix used for filtering in an
        # ExpressionMatrix field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)
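
    # Truth table for the helper above: _xor(a, b) is True exactly when one of
    # the two is truthy, e.g. _xor(1, None) -> True, _xor(None, ['WT,mut']) ->
    # True, _xor(1, ['WT,mut']) -> False, _xor(None, None) -> False. It is used
    # below to require exactly one of run_all_combinations / condition_pairs.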

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            if len(label_list) != 2:
                raise ValueError(
                    'Each condition pair must contain exactly two labels: {}'.format(label_string))
            first_label, second_label = label_list

            for label in (first_label, second_label):
                if label not in available_condition_labels:
                    error_msg = 'Condition: {} is not available. '.format(label)
                    error_msg += 'Available conditions: {}'.format(available_condition_labels)
                    raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                       [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels
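
    # For example (hypothetical data), set items labelled 'WT,mutantA' and
    # 'WT,mutantB' yield condition_label_pairs=[['WT', 'mutantA'],
    # ['WT', 'mutantB']] and available_condition_labels={'WT', 'mutantA',
    # 'mutantB'}.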

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']
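        # structured_query is a Mongo-style $or over exact feature_id matches,
        # so the search returns only features whose ids appear in `ids`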

        feature_ids = {feature.get('feature_id') for feature in genome_features}

        return feature_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set
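
    # Merge-semantics sketch for the builder above (hypothetical values): given a
    # base set with elements={'geneA': ['1/2/3']} and params with genome='4/5/6'
    # and feature_ids=['geneA', 'geneB'], the result has
    # element_ordering=['geneA', 'geneB'] and
    # elements={'geneA': ['1/2/3', '4/5/6'], 'geneB': ['4/5/6']}.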

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                            [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_em_name = _sanitize_name(condition_string) + params.get('filtered_expression_matrix_suffix')
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'), "",
                                                diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name'},
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
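
# A minimal end-to-end sketch of the class above (hypothetical values; the
# class name FeatureSetUtils is assumed from the report object names above):
#
#     fsu = FeatureSetUtils({'workspace-url': ws_url,
#                            'SDK_CALLBACK_URL': callback_url,
#                            'KB_AUTH_TOKEN': token,
#                            'shock-url': shock_url,
#                            'scratch': '/kb/module/work/tmp'})
#     result = fsu.upload_featureset_from_diff_expr({
#         'diff_expression_ref': '123/4/5',
#         'expression_matrix_ref': '123/6/7',
#         'p_cutoff': 0.05,
#         'q_cutoff': 0.05,
#         'fold_change_cutoff': 1.0,
#         'feature_set_suffix': '_feature_set',
#         'filtered_expression_matrix_suffix': '_filtered_expression_matrix',
#         'run_all_combinations': 1,
#         'workspace_name': 'my_workspace'})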
Example #37
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected=("workspace_name", "featureset_name")):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
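
# A minimal usage sketch for the downloader above (hypothetical refs; assumes a
# running KBase job environment with SDK_CALLBACK_URL set):
#
#     fsd = FeatureSetDownload({'scratch': '/kb/module/work/tmp',
#                               'workspace-url': 'https://kbase.us/services/ws'})
#     fs_name, files = fsd.to_tsv({'featureset_ref': '123/8/9'})
#     result = fsd.export([files['file_path']], fs_name,
#                         {'featureset_ref': '123/8/9'})
#     # result['shock_id'] points at the packaged download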