def test_handles(self):
     wsName = self.generatePesudoRandomWorkspaceName()
     self.ws.set_permissions({'workspace': wsName, 'new_permission': 'w',
                              'users': [self.ctx2['user_id']]})
     temp_shock_file = "/kb/module/work/tmp/shock1.txt"
     with open(temp_shock_file, "w") as f1:
         f1.write("Test Shock Handle")
     token1 = self.ctx['token']
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token1)
     handle1 = dfu.file_to_shock({'file_path': temp_shock_file, 'make_handle': 1})['handle']
     hid1 = handle1['hid']
     genome_name = "Genome.1"
     self.impl.save_one_genome_v1(self.ctx, {
         'workspace': wsName, 'name': genome_name, 'data': {
             'id': "qwerty", 'scientific_name': "Qwerty",
             'domain': "Bacteria", 'genetic_code': 11,
             'genbank_handle_ref': hid1}
         })
     genome = self.impl.get_genome_v1(self.ctx2, {
         'genomes': [{'ref': wsName + '/' + genome_name}]})[0]['genomes'][0]['data']
     self.impl.save_one_genome_v1(self.ctx2, {'workspace': wsName, 'name': genome_name,
                                              'data': genome})[0]
     genome = self.impl.get_genome_v1(self.ctx2, {
         'genomes': [{'ref': wsName + '/' + genome_name}]})[0]['genomes'][0]['data']
     self.assertTrue('genbank_handle_ref' in genome)
     hid2 = genome['genbank_handle_ref']
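     # re-saving the genome as a second user is expected to copy the underlying
     # handle, so the re-saved object should carry a different handle id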
     self.assertNotEqual(hid1, hid2)
Example #2
    def setUpClass(cls):
        cls.token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('GenericsAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': cls.token,
            'user_id': user_id,
            'provenance': [{
                'service': 'GenericsAPI',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = GenericsAPI(cls.cfg)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.shockURL = cls.cfg['shock-url']
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.sample_uploader = sample_uploader(cls.callback_url,
                                              service_ver="dev")
        cls.sample_url = cls.cfg.get('kbase-endpoint') + '/sampleservice'
        cls.sample_ser = SampleService(cls.sample_url)
        cls.hs = HandleService(url=cls.cfg['handle-service-url'],
                               token=cls.token)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_GenericsAPI_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.wsId = ret[0]

        small_file = os.path.join(cls.scratch, 'test.txt')
        with open(small_file, "w") as f:
            f.write("empty content")
        cls.test_shock = cls.dfu.file_to_shock({
            'file_path': small_file,
            'make_handle': True
        })
        cls.handles_to_delete = []
        cls.nodes_to_delete = []
        cls.handles_to_delete.append(cls.test_shock['handle']['hid'])
        cls.nodes_to_delete.append(cls.test_shock['shock_id'])

        cls.prepare_data()
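A matching tearDownClass normally accompanies this fixture. Here is a minimal sketch, assuming the usual KBase test-cleanup pattern: requests is imported at module level, and the HandleService client exposes hids_to_handles and delete_handles:

    @classmethod
    def tearDownClass(cls):
        # delete the throwaway workspace created in setUpClass
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
        # delete the handles first, then the Shock nodes they pointed at
        if cls.handles_to_delete:
            cls.hs.delete_handles(cls.hs.hids_to_handles(cls.handles_to_delete))
        for node in cls.nodes_to_delete:
            requests.delete(cls.shockURL + '/node/' + node,
                            headers={'Authorization': 'OAuth ' + cls.token})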
Example #3
 def setUpClass(cls):
     cls.token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)  # type: ignore
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     authServiceUrl = cls.cfg.get(
         'auth-service-url',
         "https://kbase.us/services/authorization/Sessions/Login")
     auth_client = _KBaseAuth(authServiceUrl)
     cls.user_id = auth_client.get_user(cls.token)
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         'token': cls.token,
         'user_id': cls.user_id,
         'provenance': [{
             'service': 'GenomeFileUtil',
             'method': 'please_never_use_it_in_production',
             'method_params': []
         }],
         'authenticated': 1
     })
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
     cls.serviceImpl = GenomeFileUtil(cls.cfg)
     cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
     cls.scratch = cls.cfg['scratch']
     cls.shockURL = cls.cfg['shock-url']
     cls.gfu_cfg = SDKConfig(cls.cfg)
     cls.prepare_data()
Example #4
    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None
Example #5
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.sp_uploader = sample_uploader(self.callback_url,
                                           service_ver='beta')
        self.dotfu = KBaseDataObjectToFileUtils(self.callback_url,
                                                token=self.token,
                                                service_ver='beta')
Example #6
    def __init__(self, config):

        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
     self.dfu = DataFileUtil(self.callback_url)
     self.working_dir = self.scratch

     self.data_util = DataUtil(config)
        self.output_dir = os.path.join(self.working_dir, self.MDS_OUT_DIR)
        self._mkdir_p(self.output_dir)

        # If input is from files, then pd.DataFrame needs to be transposed in run_metaMDS_with_file method
        self.need_to_transpose = True
Example #7
class geneminerutils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        return geneset['element_ordering']

    def generate_query(self, genomenetmine_dyn_url, genelistref, species, pheno):
        #pheno = ["disease"]
        #species = "potatoknet"
        #genes = ["PGSC0003DMG400006345", "PGSC0003DMG400012792", "PGSC0003DMG400033029", "PGSC0003DMG400016390",
        #         "PGSC0003DMG400039594", "PGSC0003DMG400028153"]
        #genomenetmine_dyn_url = 'http://ec2-18-236-212-118.us-west-2.compute.amazonaws.com:5000/networkquery/api'
        genes = self.download_genelist(genelistref)
        gsp = genescoreparser()
        x = gsp.summary(genomenetmine_dyn_url, genes, species, pheno)
        return (x)

    def get_evidence(self,genomenetmine_dyn_url, genelistref, species, pheno ):
        genes = self.download_genelist(genelistref)
        ep = evidenceparser()
        x = ep.summary(genomenetmine_dyn_url, genes, species, pheno)
        return (x)
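A minimal usage sketch for the class above; the service URL, FeatureSet ref, species tag, and phenotype list are placeholder values, not real endpoints or objects:

gm = geneminerutils()
scores = gm.generate_query('http://example.org:5000/networkquery/api',
                           '1234/5/6', 'potatoknet', ['disease'])
evidence = gm.get_evidence('http://example.org:5000/networkquery/api',
                           '1234/5/6', 'potatoknet', ['disease'])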
Example #8
 def setUpClass(cls):
     token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('kb_orthofinder'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'user_id': user_id,
                     'provenance': [
                         {'service': 'kb_orthofinder',
                          'method': 'annotate_plant_transcripts',
                          'method_params': []
                          }],
                     'authenticated': 1})
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL)
     cls.serviceImpl = kb_orthofinder(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.test_data = cls.cfg['test_data']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.gfu = GenomeFileUtil(cls.callback_url)
     cls.dfu = DataFileUtil(cls.callback_url)
     cls.genome = "Test_Genome"
     cls.prepare_data()
Example #9
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('ProteinStructureUtils'):
         cls.cfg[nameval[0]] = nameval[1]
     # Getting username from Auth profile for token
     authServiceUrl = cls.cfg['auth-service-url']
     auth_client = _KBaseAuth(authServiceUrl)
     user_id = auth_client.get_user(token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({'token': token,
                     'user_id': user_id,
                     'provenance': [
                         {'service': 'ProteinStructureUtils',
                          'method': 'please_never_use_it_in_production',
                          'method_params': []
                          }],
                     'authenticated': 1})
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = Workspace(cls.wsURL)
     cls.serviceImpl = ProteinStructureUtils(cls.cfg)
     cls.scratch = cls.cfg['scratch']
     cls.callback_url = os.environ['SDK_CALLBACK_URL']
     cls.dfu = DataFileUtil(cls.callback_url)
     suffix = int(time.time() * 1000)
     cls.wsName = "test_ProteinStructureUtils_" + str(suffix)
     cls.ws_id = cls.wsClient.create_workspace({'workspace': cls.wsName})[0]
     cls.prepareData()
Example #10
 def setUpClass(cls):
     cls.token = environ.get('KB_AUTH_TOKEN', None)
     config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('kb_uploadmethods'):
         cls.cfg[nameval[0]] = nameval[1]
     authServiceUrl = cls.cfg.get(
         'auth-service-url',
         "https://kbase.us/services/authorization/Sessions/Login")
     auth_client = _KBaseAuth(authServiceUrl)
     cls.user_id = auth_client.get_user(cls.token)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         'token': cls.token,
         'user_id': cls.user_id,
         'provenance': [{
             'service': 'kb_uploadmethods',
             'method': 'please_never_use_it_in_production',
             'method_params': []
         }],
         'authenticated': 1
     })
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=cls.token)
     cls.serviceImpl = kb_uploadmethods(cls.cfg)
     cls.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=cls.token)
     cls.scratch = cls.cfg['scratch']
     cls.shockURL = cls.cfg['shock-url']
Example #11
    def test_basic_upload_and_download(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'MyNewAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name,
                                                        'taxon_ref': 'ReferenceTaxons/unknown_taxon',
                                                        })
        pprint(result)
        self.check_fasta_file(ws_obj_name, fasta_path)

        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        shock_id = data_file_cli.file_to_shock({'file_path': fasta_path})['shock_id']
        ws_obj_name2 = 'MyNewAssembly.2'
        result2 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                        {'shock_id': shock_id,
                                                         'workspace_name': self.getWsName(),
                                                         'assembly_name': ws_obj_name2
                                                         })
        pprint(result2)
        self.check_fasta_file(ws_obj_name2, fasta_path)

        print('attempting upload via ftp url')
        ftp_url = 'ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/bacteria_8_collection/acaryochloris_marina_mbic11017/dna/Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz'
        ws_obj_name3 = 'MyNewAssembly.3'
        result3 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                        {'ftp_url': ftp_url,
                                                         'workspace_name': self.getWsName(),
                                                         'assembly_name': ws_obj_name3
                                                         })
        pprint(result3)
        # todo: add checks here on ws object

        result4 = assemblyUtil.export_assembly_as_fasta(self.getContext(),
                                                        {'input_ref': self.getWsName() + '/' + ws_obj_name3})
        pprint(result4)
Example #12
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
        Create an HTML report from the files in output_dir.
        :param callback_url: SDK callback service URL
        :param output_dir: directory that holds index.html and supporting files
        :param workspace_name: workspace in which to save the report object
        :return: dict with 'report_name' and 'report_ref'
        '''

        dfu = DataFileUtil(callback_url)
        report_name = 'kb_variant_report_' + str(uuid.uuid4())
        report = KBaseReport(callback_url)
        index_file_path = output_dir + "/index.html"
        htmlstring = self.create_enrichment_report("snpEff_genes.txt",
                                                   output_dir)

        try:
            with open(index_file_path, "w") as html_file:
                html_file.write(htmlstring + "\n")
        except IOError:
            print("Unable to write " + index_file_path + " file on disk.")

        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'HTML report for GSEA'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
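A hypothetical call site for the helper above, from inside the same class; the directory and workspace name are placeholders:

output = self.create_html_report(os.environ['SDK_CALLBACK_URL'],
                                 '/kb/module/work/tmp/report_dir',
                                 'my_workspace')
print(output['report_name'], output['report_ref'])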
Example #13
    def test_AssemblySet_input(self):

        # Initiate empty data dictionaries and get data_util
        dfu = DataFileUtil(self.callback_url)
        assembly_dict = dict()
        assembly_set_dict = dict()
        dfu_dict = dict()
        dfu_dict_2 = dict()
        # Get workspace id and name
        wsName = self.getWsName()
        ws_id = dfu.ws_name_to_id(wsName)

        # FASTA to assembly object
        Fasta_assembly_dict = {
            "path": "/kb/module/work/tmp/NC_021490.fasta",
            "assembly_name": "test_assembly"
        }
        params = {
            "file": Fasta_assembly_dict,
            "workspace_name": wsName,
            "assembly_name": "test_assembly"
        }
        ref = self.getImpl().save_assembly_from_fasta(self.ctx, params)

        # Create assembly data dictionaries
        assembly_dict.update({"label": "assemblySetTest", "ref": ref[0]})
        assembly_set_dict.update({
            "description": " ",
            "items": [assembly_dict]
        })
        # Create DataFileUtil dictionaries
        dfu_dict.update({
            "type": "KBaseSets.AssemblySet",
            "data": assembly_set_dict,
            "name": "Assembly_Test"
        })
        dfu_dict_2.update({'id': ws_id, 'objects': [dfu_dict]})

        # Create assembly set object
        assembly_set_obj = dfu.save_objects(dfu_dict_2)
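        # object info tuple layout: [0]=object id, [4]=version, [6]=workspace id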
        assembly_set_ref = [
            str(assembly_set_obj[0][6]) + '/' + str(assembly_set_obj[0][0]) +
            '/' + str(assembly_set_obj[0][4])
        ]

        # Get FASTA
        ret = self.getImpl().get_fastas(self.callback_url, assembly_set_ref)
Example #14
 def setUpClass(cls):
     token = os.environ.get('KB_AUTH_TOKEN', None)
     # WARNING: don't call any logging methods on the context object,
     # it'll result in a NoneType error
     cls.ctx = MethodContext(None)
     cls.ctx.update({
         'token': token,
         'provenance': [{
             'service': 'GenomeFileUtil',
             'method': 'please_never_use_it_in_production',
             'method_params': []
         }],
         'authenticated': 1
     })
     config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
     cls.cfg = {}
     config = ConfigParser()
     config.read(config_file)
     for nameval in config.items('GenomeFileUtil'):
         cls.cfg[nameval[0]] = nameval[1]
     cls.wsURL = cls.cfg['workspace-url']
     cls.wsClient = workspaceService(cls.wsURL, token=token)
     cls.serviceImpl = GenomeFileUtil(cls.cfg)
     gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
     ws_obj_name = 'ecoli_genome'
     suffix = int(time.time() * 1000)
     cls.wsName = "test_GenomeFileUtil_" + str(suffix)
     ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
     result = cls.serviceImpl.genbank_to_genome(
         cls.ctx, {
             'file': {
                 'path': gbk_path
             },
             'workspace_name': cls.wsName,
             'genome_name': ws_obj_name,
             'generate_ids_if_needed': 1,
             'source': "RefSeq Reference"
         })[0]
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                                  token=cls.ctx['token'],
                                  service_ver='dev')
     genome = data_file_cli.get_objects(
         {'object_refs': [result['genome_ref']]})['data'][0]['data']
     cls.assembly_ref = genome["assembly_ref"]
Example #15
    def setUpClass(cls):
        cls.maxDiff = 70000
        cls.token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('GenericsAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(cls.token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': cls.token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'GenericsAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL)
        cls.serviceImpl = GenericsAPI(cls.cfg)
        cls.serviceUtils = AttributesUtil(cls.cfg)
        cls.shockURL = cls.cfg['shock-url']
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.hs = HandleService(url=cls.cfg['handle-service-url'],
                               token=cls.token)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_CompoundSetUtils_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.wsId = ret[0]
        with open('data/AM1.json') as f:
            cls.attribute_mapping = json.load(f)
        info = cls.dfu.save_objects({
            "id": cls.wsId,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": cls.attribute_mapping,
                "name": "test_cond_set"
            }]
        })[0]
        cls.attribute_mapping_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        with open('data/AM2.json') as f:
            cls.attribute_mapping_2 = json.load(f)

        small_file = os.path.join(cls.scratch, 'test.txt')
        with open(small_file, "w") as f:
            f.write("empty content")
        cls.test_shock = cls.dfu.file_to_shock({'file_path': small_file, 'make_handle': True})
        cls.handles_to_delete = []
        cls.nodes_to_delete = []
        cls.handles_to_delete.append(cls.test_shock['handle']['hid'])
        cls.nodes_to_delete.append(cls.test_shock['shock_id'])
Example #16
class SampleServiceUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.srv_wiz_url = config['srv-wiz-url']
        self.sample_url = config.get('kbase-endpoint') + '/sampleservice'
        self.dfu = DataFileUtil(self.callback_url)
        self.sample_ser = SampleService(self.sample_url)

        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)

    def get_sample_service_url(self):
        return self.sample_url

    def get_sample(self, sample_id, version=None):

        sample_url = self.get_sample_service_url()
        headers = {"Authorization": self.token}
        params = {"id": sample_id, "version": version}
        payload = {
            "method": "SampleService.get_sample",
            "id": str(uuid.uuid4()),
            "params": [params],
            "version": "1.1"
        }
        resp = requests.post(url=sample_url,
                             headers=headers,
                             data=json.dumps(payload))
        resp_json = resp.json()
        if resp_json.get('error'):
            raise RuntimeError(
                f"Error from SampleService - {resp_json['error']}")
        sample = resp_json['result'][0]

        # sample = self.sample_ser.get_sample(params)[0]

        return sample

    def get_ids_from_samples(self, sample_set_ref):
        logging.info('start retrieving sample ids from sample set')

        sample_set = self.dfu.get_objects(
            {"object_refs": [sample_set_ref]})['data'][0]['data']

        samples = sample_set['samples']

        data_ids = []
        for sample in samples:
            sample_id = sample.get('id')
            version = sample.get('version')

            sample_data = self.get_sample(sample_id, version=version)

            data_id = sample_data['name']
            data_ids.append(data_id)

        return data_ids
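A sketch of driving this utility end to end; every config value and the SampleSet ref below are illustrative placeholders:

config = {
    'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
    'scratch': '/kb/module/work/tmp',
    'KB_AUTH_TOKEN': os.environ.get('KB_AUTH_TOKEN'),
    'srv-wiz-url': 'https://example.kbase.us/services/service_wizard',
    'kbase-endpoint': 'https://example.kbase.us/services',
}
ss_util = SampleServiceUtil(config)
print(ss_util.get_ids_from_samples('1234/5/6'))  # prints the sample names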
Example #17
 def __init__(self, config, ctx):
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.re_api = RE_API(config['re-url'], ctx['token'])
     self.dfu = DataFileUtil(self.callback_url)
     self.kbse = KBaseSearchEngine(config['search-url'])
     self.kbr = KBaseReport(self.callback_url)
     self.object_categories = ['Narrative', 'Genome', 'FBAModel', 'Tree']
Example #18
 def test_gff_and_metagenome_to_metagenome(self):
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     result = self.serviceImpl.ws_obj_gff_to_metagenome(self.ctx, {
         'workspace_name': self.wsName,
         'genome_name': 'MyGenome',
         'gff_file': {'path': self.gff_path},
         'ws_ref': self.metagenome_ref,
         'source': 'GFF',
         'type': 'Reference',
         'genome_type': 'Metagenome',
         'is_metagenome': True,
         'generate_missing_genes': True,
         'taxon_id': '3702',
     })[0]
     metagenome = dfu.get_objects({'object_refs': [result['metagenome_ref']]})['data'][0]['data']
     # make sure it's the same as the original
     self._compare_features(self.genome_orig, metagenome)
Example #19
def update_clients():
    callback_url = os.environ['SDK_CALLBACK_URL']
    Var.update(
        dfu=DataFileUtil(callback_url),
        kbr=KBaseReport(callback_url),
        fpu=FunctionalProfileUtil(callback_url, service_ver='dev'),
        gapi=GenericsAPI(callback_url, service_ver='dev'),
    )
Example #20
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.report_util = kb_GenericsReport(self.callback_url)
     self.data_util = DataUtil(config)
     self.sampleservice_util = SampleServiceUtil(config)
     self.attr_util = AttributesUtil(config)
     self.matrix_util = MatrixUtil(config)
     self.matrix_types = [
         x.split(".")[1].split('-')[0]
         for x in self.data_util.list_generic_types()
     ]
     self.taxon_wsname = config['taxon-workspace-name']
     self.kbse = KBaseSearchEngine(config['search-url'])
     self.taxon_cache = dict()
Example #21
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.shock_url = config['shock-url']
     self.dfu = DataFileUtil(self.callback_url)
     self.ru = ReadsUtils(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.mgu = MetagenomeUtils(self.callback_url)
Example #22
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.ws = Workspace(self.ws_url, token=self.token)
     self.dfu = DataFileUtil(self.callback_url)
     self.scratch = config['scratch']
Example #23
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     # self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.report_util = kb_GenericsReport(self.callback_url)
     logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                         level=logging.INFO)
Example #24
    def create_html_report(self, callback_url, output_dir, workspace_name, objects_created):
        '''
        Create an HTML report from the files in output_dir and attach the
        created objects to it.
        '''

        dfu = DataFileUtil(callback_url)
        report = KBaseReport(callback_url)
        report_name = 'VariationAnnotationReport' + str(uuid.uuid4())

        report_shock_id = dfu.file_to_shock({'file_path': output_dir,
                                            'pack': 'zip'})['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name':  "index.html",
            'label': 'index.html',
            'description': 'HTML report for VariationAnnotation'
            }

        report_info = report.create_extended_report({
                        'objects_created': objects_created,
                        'direct_html_link_index': 0,
                        'html_links': [html_file],
                        'report_object_name': report_name,
                        'workspace_name': workspace_name
                    })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #25
class genelistutil:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)

    def download_genelist(self, genelistref, genesetfile):
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(genesetfile, 'w') as filehandle:
            for item in geneset['element_ordering']:
                filehandle.write('%s\n' % item)

    def listToString(self, s):
        # concatenate the elements of s into a single string
        return "".join(s)

    # function to print the unique values of a list
    def unique(self, list1):
        unique_list = list(set(list1))
        for x in unique_list:
            print(x)

    def get_genomeid_from_featuresetid(self, genelistref):
        genome = {}
        get_objects_params = {'object_refs': [genelistref]}
        geneset = self.dfu.get_objects(get_objects_params)['data'][0]['data']
        for v in geneset['elements'].values():
            genome[self.listToString(v)] = 1
        if len(genome) != 1:
            raise ValueError("source genome of the FeatureSet is not unique")
        # the dict keys hold the genome refs; the values are just markers
        return list(genome.keys())[0]
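Usage sketch for genelistutil; the FeatureSet ref and the output path are placeholders:

gl = genelistutil()
gl.download_genelist('1234/5/6', '/kb/module/work/tmp/geneset.txt')
genome_ref = gl.get_genomeid_from_featuresetid('1234/5/6')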
Example #26
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.data_util = DataUtil(config)
     self.matrix_types = [x.split(".")[1].split('-')[0]
                          for x in self.data_util.list_generic_types()]
Example #27
    def UploadFromMdscan(self, callback_url, params):
        """
          :param params: instance of type "UploadmfmdInParams" -> structure:
             parameter "path" of String, parameter "ws_name" of String,
             parameter "obj_name" of String
          :returns: instance of type "UploadOutput" -> structure: parameter
             "obj_ref" of String
          """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFromMdscan
        print('Extracting motifs')
        motifList = self.parse_mdscan_output(params['path'])
        print(motifList)

        MSO = motifList

        dfu = DataFileUtil(callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #END UploadFromMdscan

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFromMdscan return value ' +
                             'output is not type dict as required.')

        # return the results
        return [output]
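A hypothetical invocation of the uploader above; the path, workspace name, and object name are made up:

output = self.UploadFromMdscan(os.environ['SDK_CALLBACK_URL'],
                               {'path': '/kb/module/work/tmp/mdscan_output.txt',
                                'ws_name': 'my_workspace',
                                'obj_name': 'my_motif_set'})[0]
print(output['obj_ref'])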
Example #28
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_deseq'):
            cls.cfg[nameval[0]] = nameval[1]
        # Getting username from Auth profile for token
        authServiceUrl = cls.cfg['auth-service-url']
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token': token,
            'user_id': user_id,
            'provenance': [{
                'service': 'kb_deseq',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated': 1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = Workspace(cls.wsURL)
        cls.ws = Workspace(cls.wsURL, token=token)
        cls.serviceImpl = kb_deseq(cls.cfg)
        cls.serviceImpl.status(cls.ctx)
        cls.scratch = cls.cfg['scratch']
        cls.callback_url = os.environ['SDK_CALLBACK_URL']

        cls.gfu = GenomeFileUtil(cls.callback_url, service_ver='dev')
        cls.dfu = DataFileUtil(cls.callback_url)
        cls.ru = ReadsUtils(cls.callback_url)
        cls.rau = ReadsAlignmentUtils(cls.callback_url)
        cls.stringtie = kb_stringtie(cls.callback_url)
        cls.eu = ExpressionUtils(cls.callback_url)
        cls.deseq_runner = DESeqUtil(cls.cfg)

        suffix = int(time.time() * 1000)
        cls.wsName = "test_kb_stringtie_" + str(suffix)
        cls.wsClient.create_workspace({'workspace': cls.wsName})
        cls.dfu.ws_name_to_id(cls.wsName)
        # public on CI
        # cls.expressionset_ref = '30957/52/41'
        # cls.condition_1 = 'Ecoli_WT'
        # cls.condition_2 = 'Ecoli_ydcR'

        # public on Appdev (these are the values actually used)
        cls.expressionset_ref = '60454/19'
        cls.condition_1 = 'WT'
        cls.condition_2 = 'Hy5'
Example #29
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)

        # set up directory for files folder
        self.output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        os.mkdir(self.output_dir)
        self.files_folder = os.path.join(self.output_dir, 'files')
        os.mkdir(self.files_folder)

        self.file_paths = []
        self.html_paths = []

        self.GenAPI = GenericsAPI(self.callback_url)
Example #30
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.ws = Workspace(self.ws_url, token=self.token)
     self.dfu = DataFileUtil(self.callback_url)
     self.gsu = GenomeSearchUtil(self.callback_url)
     self.scratch = config['scratch']
Example #31
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.workspaceURL = config['workspace-url']
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.dfu = DataFileUtil(self.callback_url)
     self.scratch = os.path.abspath(config['scratch'])
     self.config = config
     #END_CONSTRUCTOR
     pass
Example #32
def upload_pangenome(cb_url, scratch, Pangenome, workspace_name,
                     pangenome_name):
    """
    params:
        cb_url         : callback url
        scratch        : folder path to Pangenome object 
        pangenome      : KBaseGenomes.Pangenome like object
        workspace_name : workspace name
        pangenome_name : Pangenome display name
    Returns:
        pangenome_ref: Pangenome workspace reference
        pangenome_info: info on pangenome object
    """
    dfu = DataFileUtil(cb_url)
    meta = {}
    hidden = 0

    # dump pangenome to scratch for upload
    # data_path = os.path.join(scratch, pangenome_name + '.json')
    # json.dump(pangenome, open(data_path, 'w'))

    if isinstance(workspace_name, int) or workspace_name.isdigit():
        workspace_id = workspace_name
    else:
        workspace_id = dfu.ws_name_to_id(workspace_name)

    save_params = {
        'id': workspace_id,
        'objects': [{
            'type': 'KBaseGenomes.Pangenome',
            'data': Pangenome,
            'name': pangenome_name,
            'meta': meta,
            'hidden': hidden
        }]
    }

    info = dfu.save_objects(save_params)[0]

    ref = "{}/{}/{}".format(info[6], info[0], info[4])
    print("Pangenome saved to {}".format(ref))

    return {'pangenome_ref': ref, 'pangenome_info': info}
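A sketch of calling upload_pangenome; pangenome_data stands in for a dict matching the KBaseGenomes.Pangenome type, and the other values are placeholders:

result = upload_pangenome(os.environ['SDK_CALLBACK_URL'],
                          '/kb/module/work/tmp',
                          pangenome_data,
                          'my_workspace',
                          'my_pangenome')
print(result['pangenome_ref'])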
Example #33
    def run_Gblocks(self, ctx, params):
        """
        Method for trimming MSAs of either DNA or PROTEIN sequences
        **
        **        input_type: MSA
        **        output_type: MSA
        :param params: instance of type "Gblocks_Params" (Gblocks Input
           Params) -> structure: parameter "workspace_name" of type
           "workspace_name" (** The workspace object refs are of form: ** ** 
           objects = ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "desc" of String, parameter "input_ref" of type
           "data_obj_ref", parameter "output_name" of type "data_obj_name",
           parameter "trim_level" of Long, parameter "min_seqs_for_conserved"
           of Long, parameter "min_seqs_for_flank" of Long, parameter
           "max_pos_contig_nonconserved" of Long, parameter "min_block_len"
           of Long, parameter "remove_mask_positions_flag" of Long
        :returns: instance of type "Gblocks_Output" (Gblocks Output) ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN run_Gblocks
        console = []
        invalid_msgs = []
        self.log(console,'Running run_Gblocks with params=')
        self.log(console, "\n"+pformat(params))
        report = ''


        #### do some basic checks
        #
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'input_ref' not in params:
            raise ValueError('input_ref parameter is required')
        if 'output_name' not in params:
            raise ValueError('output_name parameter is required')


        #### Get the input_ref MSA object
        ##
        try:
            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            objects = ws.get_objects([{'ref': params['input_ref']}])
            data = objects[0]['data']
            info = objects[0]['info']
            input_name = info[1]
            input_type_name = info[2].split('.')[1].split('-')[0]

        except Exception as e:
            raise ValueError('Unable to fetch input_ref object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()

        if input_type_name == 'MSA':
            MSA_in = data
            row_order = []
            default_row_labels = dict()
            if 'row_order' in MSA_in.keys():
                row_order = MSA_in['row_order']
            else:
                row_order = sorted(MSA_in['alignment'].keys())

            if 'default_row_labels' in MSA_in.keys():
                default_row_labels = MSA_in['default_row_labels']
            else:
                for row_id in row_order:
                    default_row_labels[row_id] = row_id
            if len(row_order) < 2:
                self.log(invalid_msgs,"must have multiple records in MSA: "+params['input_ref'])

            # export features to FASTA file
            input_MSA_file_path = os.path.join(self.scratch, input_name+".fasta")
            self.log(console, 'writing fasta file: '+input_MSA_file_path)
            records = []
            for row_id in row_order:
                # Write each record on a single line: SeqIO would wrap sequences
                # across multiple lines, which Gblocks tolerates but FastTree does not.
                records.extend(['>'+row_id,
                                MSA_in['alignment'][row_id]
                               ])
            with open(input_MSA_file_path, 'w') as input_MSA_file_handle:
                input_MSA_file_handle.write("\n".join(records)+"\n")


            # Determine whether nuc or protein sequences
            #
            NUC_MSA_pattern = re.compile(r"^[\.\-_ACGTUXNRYSWKMBDHVacgtuxnryswkmbdhv \t\n]+$")
            all_seqs_nuc = True
            for row_id in row_order:
                #self.log(console, row_id+": '"+MSA_in['alignment'][row_id]+"'")
                if NUC_MSA_pattern.match(MSA_in['alignment'][row_id]) is None:
                    all_seqs_nuc = False
                    break

        # Missing proper input_type
        #
        else:
            raise ValueError('Cannot yet handle input_ref type of: '+input_type_name)




        # validate input data
        #
        N_seqs = 0
        L_first_seq = 0
        with open(input_MSA_file_path, 'r') as input_MSA_file_handle:
            for line in input_MSA_file_handle:
                if line.startswith('>'):
                    N_seqs += 1
                    continue
                if L_first_seq == 0:
                    for c in line:
                        if c != '-' and c != ' ' and c != "\n":
                            L_first_seq += 1
        # min_seqs_for_conserved
        if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0:
            if int(params['min_seqs_for_conserved']) < int(0.5*N_seqs)+1:
                self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be >= N/2+1 (N="+str(N_seqs)+", N/2+1="+str(int(0.5*N_seqs)+1)+")\n")
            if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and \
                    int(params['min_seqs_for_conserved']) > int(params['min_seqs_for_flank']):
                self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be <= Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+")\n")

        # min_seqs_for_flank
        if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0:
            if int(params['min_seqs_for_flank']) > N_seqs:
                self.log(invalid_msgs,"Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+") must be <= N (N="+str(N_seqs)+")\n")

        # max_pos_contig_nonconserved
        if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) != 0:
            if int(params['max_pos_contig_nonconserved']) < 0:
                self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be >= 0"+"\n")
            if int(params['max_pos_contig_nonconserved']) > L_first_seq or int(params['max_pos_contig_nonconserved']) >= 32000:
                self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n")

        # min_block_len
        if 'min_block_len' in params and params['min_block_len'] != None and int(params['min_block_len']) != 0:
            if int(params['min_block_len']) < 2:
                self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be >= 2"+"\n")
            if int(params['min_block_len']) > L_first_seq or int(params['min_block_len']) >= 32000:
                self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n")

        # trim_level
        if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0:
            if int(params['trim_level']) < 0 or int(params['trim_level']) > 2:
                self.log(invalid_msgs,"Trim Level ("+str(params['trim_level'])+") must be >= 0 and <= 2"+"\n")


        if len(invalid_msgs) > 0:

            # load the method provenance from the context object
            self.log(console,"SETTING PROVENANCE")  # DEBUG
            provenance = [{}]
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            # add additional info to provenance here, in this case the input data object reference
            provenance[0]['input_ws_objects'] = []
            provenance[0]['input_ws_objects'].append(params['input_ref'])
            provenance[0]['service'] = 'kb_gblocks'
            provenance[0]['method'] = 'run_Gblocks'

            # report
            report += "FAILURE\n\n"+"\n".join(invalid_msgs)+"\n"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            reportName = 'gblocks_report_'+str(uuid.uuid4())
            report_obj_info = ws.save_objects({
#                'id':info[6],
                'workspace':params['workspace_name'],
                'objects':[
                    {
                        'type':'KBaseReport.Report',
                        'data':reportObj,
                        'name':reportName,
                        'meta':{},
                        'hidden':1,
                        'provenance':provenance
                    }
                ]
            })[0]


            self.log(console,"BUILDING RETURN OBJECT")
            returnVal = { 'report_name': reportName,
                          'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4])
#                          'output_ref': None
                          }
            self.log(console,"run_Gblocks DONE")
            return [returnVal]


        ### Construct the command
        #
        #  e.g.
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks
        #
        gblocks_cmd = [self.GBLOCKS_bin]

        # check for necessary files
        if not os.path.isfile(self.GBLOCKS_bin):
            raise ValueError("no such file '"+self.GBLOCKS_bin+"'")
        if not os.path.isfile(input_MSA_file_path):
            raise ValueError("no such file '"+input_MSA_file_path+"'")
        if not os.path.getsize(input_MSA_file_path) > 0:
            raise ValueError("empty file '"+input_MSA_file_path+"'")



        # set the output path
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)
        output_dir = os.path.join(self.scratch,'output.'+str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Gblocks names output blocks MSA by appending "-gb" to input file
        #output_GBLOCKS_file_path = os.path.join(output_dir, input_name+'-gb')
        output_GBLOCKS_file_path = input_MSA_file_path+'-gb'
        output_aln_file_path = output_GBLOCKS_file_path

        # Gblocks is interactive and only accepts args from pipe input


        # Run GBLOCKS, capture output as it happens
        #
        self.log(console, 'RUNNING GBLOCKS:')
        self.log(console, '    '+' '.join(gblocks_cmd))
        # Gblocks is interactive: it reads its menu commands from stdin, so run it
        # with text-mode pipes and feed the menu keystrokes below.
        env = os.environ.copy()
        p = subprocess.Popen(gblocks_cmd,
                             cwd=self.scratch,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True,
                             universal_newlines=True,
                             env=env)

        
        # write commands to process
        #
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks

        p.stdin.write("o"+"\n")  # open MSA file
        p.stdin.write(input_MSA_file_path+"\n")

        if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0:
            p.stdin.write("b"+"\n")
            if int(params['trim_level']) >= 1:
                self.log (console,"changing trim level")
                p.stdin.write("5"+"\n")  # set to "half"
                if int(params['trim_level']) == 2:
                    self.log (console,"changing trim level")
                    p.stdin.write("5"+"\n")  # set to "all"
                elif int(params['trim_level']) > 2:
                    raise ValueError ("trim_level ("+str(params['trim_level'])+") was not between 0-2")
                p.stdin.write("m"+"\n")

        # flank must precede conserved because it acts as an upper bound for acceptable conserved values
        if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0:
            self.log (console,"changing min_seqs_for_flank")
            p.stdin.write("b"+"\n")
            p.stdin.write("2"+"\n")
            p.stdin.write(str(params['min_seqs_for_flank'])+"\n")
            p.stdin.write("m"+"\n")

        if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0:
            self.log (console,"changing min_seqs_for_conserved")
            p.stdin.write("b"+"\n")
            p.stdin.write("1"+"\n")
            p.stdin.write(str(params['min_seqs_for_conserved'])+"\n")
            p.stdin.write("m"+"\n")

        if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) > -1:
            self.log (console,"changing max_pos_contig_nonconserved")
            p.stdin.write("b"+"\n")
            p.stdin.write("3"+"\n")
            p.stdin.write(str(params['max_pos_contig_nonconserved'])+"\n")
            p.stdin.write("m"+"\n")

        if 'min_block_len' in params and params['min_block_len'] != None and params['min_block_len'] != 0:
            self.log (console,"changing min_block_len")
            p.stdin.write("b"+"\n")
            p.stdin.write("4"+"\n")
            p.stdin.write(str(params['min_block_len'])+"\n")
            p.stdin.write("m"+"\n")
        
        p.stdin.write("g"+"\n")  # get blocks
        p.stdin.write("q"+"\n")  # quit
        p.stdin.close()
        p.wait()


        # Read output
        #
        while True:
            line = p.stdout.readline()
            #line = p.stderr.readline()
            if not line: break
            self.log(console, line.replace('\n', ''))

        p.stdout.close()
        #p.stderr.close()
        p.wait()
        self.log(console, 'return code: ' + str(p.returncode))
        # Gblocks signals success with exit status 1, so treat any other code as an error
        if p.returncode != 1:
            raise ValueError('Error running GBLOCKS, return code: '+str(p.returncode) + 
                '\n\n'+ '\n'.join(console))

        # Check that GBLOCKS produced output
        #
        if not os.path.isfile(output_GBLOCKS_file_path):
            raise ValueError("failed to create GBLOCKS output: "+output_GBLOCKS_file_path)
        elif not os.path.getsize(output_GBLOCKS_file_path) > 0:
            raise ValueError("created empty file for GBLOCKS output: "+output_GBLOCKS_file_path)


        # load the method provenance from the context object
        #
        self.log(console,"SETTING PROVENANCE")  # DEBUG
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = []
        provenance[0]['input_ws_objects'].append(params['input_ref'])
        provenance[0]['service'] = 'kb_gblocks'
        provenance[0]['method'] = 'run_Gblocks'


        # reformat output to single-line FASTA MSA and check that the output is not
        # empty (which often happens when parameter combinations produce no viable blocks)
        #
        output_fasta_buf = []
        id_order = []
        this_id = None
        ids = dict()
        alignment = dict()
        L_alignment = 0
        L_alignment_set = False
        with open(output_GBLOCKS_file_path, 'r') as output_GBLOCKS_file_handle:
            for line in output_GBLOCKS_file_handle:
                line = line.rstrip()
                if line.startswith('>'):
                    this_id = line[1:]
                    output_fasta_buf.append('>'+re.sub(r'\s', '_', default_row_labels[this_id]))
                    id_order.append(this_id)
                    alignment[this_id] = ''
                    if L_alignment != 0 and not L_alignment_set:
                        L_alignment_set = True
                    continue
                output_fasta_buf.append (line)
                for c in line:
                    if c != ' ' and c != "\n":
                        alignment[this_id] += c
                        if not L_alignment_set:
                            L_alignment += 1
        if L_alignment == 0:
            self.log(invalid_msgs, "parameters produced no blocks; consider less stringent values")
        else:
            if params.get('remove_mask_positions_flag') == 1:
                self.log (console,"removing mask positions")
                mask = []
                new_alignment = dict()
                for i in range(0, L_alignment):
                    # keep a column unless the first row has a gap or masked
                    # residue ('-', 'X', or 'x') at that position
                    if alignment[id_order[0]][i] in ('-', 'X', 'x'):
                        mask.append('-')
                    else:
                        mask.append('+')
                for row_id in id_order:
                    new_alignment[row_id] = ''
                    for i,c in enumerate(alignment[row_id]):
                        if mask[i] == '+':
                            new_alignment[row_id] += c
                alignment = new_alignment

            L_alignment = len(alignment[id_order[0]])

            # write fasta with tidied ids
            output_MSA_file_path = os.path.join(output_dir, params['output_name']+'.fasta')
            with open(output_MSA_file_path, 'w') as output_MSA_file_handle:
                output_MSA_file_handle.write("\n".join(output_fasta_buf)+"\n")


        # Upload results
        #
        if len(invalid_msgs) == 0:
            self.log(console,"UPLOADING RESULTS")  # DEBUG

        
            # Build output_MSA structure
            #   first extract old info from MSA (labels, ws_refs, etc.)
            #
            MSA_out = dict()
            for key in MSA_in.keys():
                 MSA_out[key] = MSA_in[key]

            # then replace with new info
            #
            MSA_out['alignment'] = alignment
            MSA_out['name'] = params['output_name']
            MSA_out['alignment_length'] = alignment_length = L_alignment
            MSA_name = params['output_name']
            MSA_description = ''
            if params.get('desc'):
                MSA_out['desc'] = MSA_description = params['desc']

            # Store MSA_out
            #
            new_obj_info = ws.save_objects({
                            'workspace': params['workspace_name'],
                            'objects':[{
                                    'type': 'KBaseTrees.MSA',
                                    'data': MSA_out,
                                    'name': params['output_name'],
                                    'meta': {},
                                    'provenance': provenance
                                }]
                        })[0]


            # create CLW formatted output file
            max_row_width = 60
            id_aln_gap_width = 1
            gap_chars = ' ' * id_aln_gap_width
            # DNA
            if all_seqs_nuc:
                strong_groups = { 'AG': True,
                                  'CTU': True
                                  }
                weak_groups = None
            # PROTEINS
            else:
                strong_groups = { 'AST':  True,
                                  'EKNQ': True,
                                  'HKNQ': True,
                                  'DENQ': True,
                                  'HKQR': True,
                                  'ILMV': True,
                                  'FILM': True,
                                  'HY':   True,
                                  'FWY':  True
                                  }
                weak_groups = { 'ACS':    True,
                                'ATV':    True,
                                'AGS':    True,
                                'KNST':   True,
                                'APST':   True,
                                'DGNS':   True,
                                'DEKNQS': True,
                                'DEHKNQ': True,
                                'EHKNQR': True,
                                'FILMV':  True,
                                'FHY':    True
                                }
                
            clw_buf = []
            clw_buf.append ('CLUSTALW format of GBLOCKS trimmed MSA '+MSA_name+': '+MSA_description)
            clw_buf.append ('')

            long_id_len = 0
            aln_pos_by_id = dict()
            for row_id in row_order:
                aln_pos_by_id[row_id] = 0
                row_id_disp = default_row_labels[row_id]
                if long_id_len < len(row_id_disp):
                    long_id_len = len(row_id_disp)

            full_row_cnt = alignment_length // max_row_width
            if alignment_length % max_row_width == 0:
                full_row_cnt -= 1
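            # e.g. alignment_length 120 with max_row_width 60: 120 // 60 = 2,
            # minus 1 for the exact multiple, so chunk_i runs over two chunks
            # covering columns [0:60) and [60:120)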
            for chunk_i in range (full_row_cnt + 1):
                for row_id in row_order:
                    row_id_disp = re.sub(r'\s', '_', default_row_labels[row_id])
                    for sp_i in range (long_id_len-len(row_id_disp)):
                        row_id_disp += ' '

                    aln_chunk_upper_bound = (chunk_i+1)*max_row_width
                    if aln_chunk_upper_bound > alignment_length:
                        aln_chunk_upper_bound = alignment_length
                    aln_chunk = alignment[row_id][chunk_i*max_row_width:aln_chunk_upper_bound]
                    for c in aln_chunk:
                        if c != '-':
                            aln_pos_by_id[row_id] += 1

                    clw_buf.append (row_id_disp+gap_chars+aln_chunk+' '+str(aln_pos_by_id[row_id]))

                # conservation line
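                # (ClustalW symbols: '*' = fully conserved column, ':' = all
                # residues in a strong group, '.' = all in a weak group,
                # ' ' = unconserved)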
                cons_line = ''
                for pos_i in range(chunk_i*max_row_width, aln_chunk_upper_bound):
                    col_chars = dict()
                    seq_cnt = 0
                    for row_id in row_order:
                        char = alignment[row_id][pos_i]
                        if char != '-':
                            seq_cnt += 1
                            col_chars[char] = True
                    if seq_cnt <= 1:
                        cons_char = ' '
                    elif len(col_chars.keys()) == 1:
                        cons_char = '*'
                    else:
                        strong = False
                        for strong_group in strong_groups.keys():
                            this_strong_group = True
                            for seen_char in col_chars.keys():
                                if seen_char not in strong_group:
                                    this_strong_group = False
                                    break
                            if this_strong_group:
                                strong = True
                                break
                        if not strong:
                            weak = False
                            if weak_groups != None:
                                for weak_group in weak_groups.keys():
                                    this_weak_group = True
                                    for seen_char in col_chars.keys():
                                        if seen_char not in weak_group:
                                            this_weak_group = False
                                            break
                                    if this_weak_group:
                                        weak = True
                                        break
                        if strong:
                            cons_char = ':'
                        elif weak:
                            cons_char = '.'
                        else:
                            cons_char = ' '
                    cons_line += cons_char

                lead_space = ' ' * long_id_len + gap_chars

                clw_buf.append(lead_space+cons_line)
                clw_buf.append('')

            # write clw to file
            clw_buf_str = "\n".join(clw_buf)+"\n"
            output_clw_file_path = os.path.join(output_dir, input_name+'-MSA.clw')
            with open(output_clw_file_path, 'w') as output_clw_file_handle:
                output_clw_file_handle.write(clw_buf_str)


            # upload GBLOCKS FASTA output to SHOCK for file_links
            dfu = DFUClient(self.callbackURL)
            try:
                output_upload_ret = dfu.file_to_shock({'file_path': output_aln_file_path,
                                                       'make_handle': 0})
            except Exception as e:
                raise ValueError('error loading aln_out file to shock: '+str(e))

            # upload GBLOCKS CLW output to SHOCK for file_links
            try:
                output_clw_upload_ret = dfu.file_to_shock({'file_path': output_clw_file_path,
                                                           'make_handle': 0})
            except Exception as e:
                raise ValueError('error loading clw_out file to shock: '+str(e))


            # TODO: make HTML reports


            # build output report object
            #
            self.log(console,"BUILDING REPORT")  # DEBUG

            reportName = 'gblocks_report_'+str(uuid.uuid4())
            reportObj = {
                'objects_created':[{'ref':params['workspace_name']+'/'+params['output_name'],
                                    'description':'GBLOCKS MSA'}],
                #'message': '',
                'message': clw_buf_str,
                'direct_html': '',
                #'direct_html_link_index': 0,
                'file_links': [],
                'html_links': [],
                'workspace_name': params['workspace_name'],
                'report_object_name': reportName
                }
            reportObj['file_links'] = [{'shock_id': output_upload_ret['shock_id'],
                                        'name': params['output_name']+'-GBLOCKS.FASTA',
                                        'label': 'GBLOCKS-trimmed MSA FASTA'
                                        },
                                       {'shock_id': output_clw_upload_ret['shock_id'],
                                        'name': params['output_name']+'-GBLOCKS.CLW',
                                        'label': 'GBLOCKS-trimmed MSA CLUSTALW'
                                        }]

            # save report object
            #
            SERVICE_VER = 'release'
            reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
            report_info = reportClient.create_extended_report(reportObj)

        else:  # len(invalid_msgs) > 0
            reportName = 'gblocks_report_'+str(uuid.uuid4())
            report += "FAILURE:\n\n"+"\n".join(invalid_msgs)+"\n"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            report_obj_info = ws.save_objects({
                    #'id':info[6],
                    'workspace':params['workspace_name'],
                    'objects':[
                        {
                            'type':'KBaseReport.Report',
                            'data':reportObj,
                            'name':reportName,
                            'meta':{},
                            'hidden':1,
                            'provenance':provenance
                            }
                        ]
                    })[0]

            report_info = dict()
            report_info['name'] = report_obj_info[1]
            report_info['ref'] = str(report_obj_info[6])+'/'+str(report_obj_info[0])+'/'+str(report_obj_info[4])


        # done
        returnVal = { 'report_name': report_info['name'],
                      'report_ref': report_info['ref']
                      }

        self.log(console,"run_Gblocks DONE")
        #END run_Gblocks

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method run_Gblocks return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
class FeatureSetBuilder:

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_upload_featureset_from_diff_expr_params(self, params):
        """
        _validate_upload_featureset_from_diff_expr_params:
                validates params passed to upload_featureset_from_diff_expr method
        """

        log('start validating upload_featureset_from_diff_expr params')

        # check for required parameters
        for p in ['diff_expression_ref', 'workspace_name',
                  'p_cutoff', 'q_cutoff', 'fold_change_cutoff']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        p = params.get('fold_scale_type')
        if p and p != 'logarithm':
            raise ValueError('"fold_scale_type" parameter must be set to "logarithm", if used')

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))
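    # usage sketch (hypothetical keys): validate_params({'a': 1, 'b': 2}, {'a'}, {'c'})
    # passes but warns about the unexpected 'b'; validate_params({}, {'a'}) raises
    # ValueError because the required key 'a' is missing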

    def _generate_report(self, up_feature_set_ref_list, down_feature_set_ref_list,
                         filtered_expression_matrix_ref_list, workspace_name):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_html_files = self._generate_html_report(up_feature_set_ref_list,
                                                       down_feature_set_ref_list)

        objects_created = list()
        for up_feature_set_ref in up_feature_set_ref_list:
            objects_created += [{'ref': up_feature_set_ref,
                                 'description': 'Upper FeatureSet Object'}]
        for down_feature_set_ref in down_feature_set_ref_list:
            objects_created += [{'ref': down_feature_set_ref,
                                 'description': 'Lower FeatureSet Object'}]

        for filtered_expression_matrix_ref in filtered_expression_matrix_ref_list:
            objects_created += [{'ref': filtered_expression_matrix_ref,
                                 'description': 'Filtered ExpressionMatrix Object'}]

        report_params = {'message': '',
                         'workspace_name': workspace_name,
                         'objects_created': objects_created,
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _generate_html_report(self, up_feature_set_ref_list, down_feature_set_ref_list):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        upper_feature_content = ''
        for up_feature_set_ref in up_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     up_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            upper_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        lower_feature_content = ''
        for down_feature_set_ref in down_feature_set_ref_list:
            feature_set_obj = self.ws.get_objects2({'objects':
                                                    [{'ref':
                                                     down_feature_set_ref}]})['data'][0]
            feature_set_data = feature_set_obj['data']
            feature_set_info = feature_set_obj['info']

            feature_set_name = feature_set_info[1]

            elements = feature_set_data.get('elements')
            feature_ids = list(elements.keys())

            lower_feature_content += '<tr><td>{}</td><td>{}</td></tr>'.format(feature_set_name,
                                                                              len(feature_ids))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<tr><td>Upper_FeatureSet</td></tr>',
                                                          upper_feature_content)

                report_template = report_template.replace('<tr><td>Lower_FeatureSet</td></tr>',
                                                          lower_feature_content)

                result_file.write(report_template)

        html_report.append({'path': result_file_path,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report'})
        return html_report

    def _process_diff_expression(self, diff_expression_set_ref, result_directory,
                                 condition_label_pair):
        """
        _process_diff_expression: process differential expression object info
        """

        log('start processing differential expression object')

        diff_expr_set_data = self.ws.get_objects2({'objects':
                                                  [{'ref':
                                                   diff_expression_set_ref}]})['data'][0]['data']

        set_items = diff_expr_set_data['items']

        diff_expr_matrix_file_name = 'gene_results.csv'
        diff_expr_matrix_file = os.path.join(result_directory, diff_expr_matrix_file_name)

        with open(diff_expr_matrix_file, 'w') as csvfile:
            fieldnames = ['gene_id', 'log2_fold_change', 'p_value', 'q_value']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
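        # header only; rows for the matching condition pair are appended below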

        for set_item in set_items:
            diff_expression_ref = set_item['ref']

            diff_expression_data = self.ws.get_objects2({'objects':
                                                        [{'ref':
                                                         diff_expression_ref}]})['data'][0]['data']

            label_string = set_item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_1 = label_list[0]
            condition_2 = label_list[1]

            if condition_1 in condition_label_pair and condition_2 in condition_label_pair:
                genome_id = diff_expression_data['genome_ref']
                matrix_data = diff_expression_data['data']
                selected_diff_expression_ref = diff_expression_ref

                with open(diff_expr_matrix_file, 'a') as csvfile:
                    row_ids = matrix_data.get('row_ids')
                    row_values = matrix_data.get('values')
                    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                    for pos, row_id in enumerate(row_ids):
                        row_value = row_values[pos]
                        writer.writerow({'gene_id': row_id,
                                         'log2_fold_change': row_value[0],
                                         'p_value': row_value[1],
                                         'q_value': row_value[2]})

        return diff_expr_matrix_file, genome_id, selected_diff_expression_ref

    def _generate_feature_set(self, feature_ids, genome_id, workspace_name, feature_set_name):
        """
        _generate_feature_set: generate FeatureSet object

        KBaseCollections.FeatureSet type:
        typedef structure {
            string description;
            list<feature_id> element_ordering;
            mapping<feature_id, list<genome_ref>> elements;
        } FeatureSet;
        """

        log('start saving KBaseCollections.FeatureSet object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        elements = {feature_id: [genome_id] for feature_id in feature_ids}
        feature_set_data = {'description': 'Generated FeatureSet from DifferentialExpression',
                            'element_ordering': feature_ids,
                            'elements': elements}

        object_type = 'KBaseCollections.FeatureSet'
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': object_type,
                         'data': feature_set_data,
                         'name': feature_set_name}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return feature_set_obj_ref

    def _process_matrix_file(self, diff_expr_matrix_file, comp_p_value, comp_q_value,
                             comp_fold_change_cutoff):
        """
        _process_matrix_file: filter matrix file by given cutoffs
        """

        log('start processing matrix file')

        up_feature_ids = []
        down_feature_ids = []

        if comp_fold_change_cutoff < 0:
            comp_fold_change_cutoff = -comp_fold_change_cutoff

        with open(diff_expr_matrix_file, 'r') as file:
            reader = csv.DictReader(file)

            for row in reader:
                feature_id = row['gene_id']
                row_p_value = row['p_value']
                row_q_value = row['q_value']
                row_fold_change = row['log2_fold_change']

                null_value = {'NA', 'null', ''}
                col_value = {row_p_value, row_q_value, row_fold_change}

                if not col_value.intersection(null_value):
                    p_value_condition = float(row_p_value) <= comp_p_value
                    q_value_condition = float(row_q_value) <= comp_q_value

                    up_matches_condition = (p_value_condition and q_value_condition and
                                            float(row_fold_change) >= comp_fold_change_cutoff)

                    down_matches_condition = (p_value_condition and q_value_condition and
                                              float(row_fold_change) <= -comp_fold_change_cutoff)

                    if up_matches_condition:
                        up_feature_ids.append(feature_id)
                    elif down_matches_condition:
                        down_feature_ids.append(feature_id)

        return list(set(up_feature_ids)), list(set(down_feature_ids))

    def _filter_expression_matrix(self, expression_matrix_ref, feature_ids,
                                  workspace_name, filtered_expression_matrix_suffix="",
                                  diff_expression_matrix_ref=None,
                                  filtered_expression_matrix_name=None):
        """
        _filter_expression_matrix: generated filtered expression matrix
        """

        log('start saving ExpressionMatrix object')

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_matrix_obj = self.dfu.get_objects({'object_refs':
                                                     [expression_matrix_ref]})['data'][0]

        expression_matrix_info = expression_matrix_obj['info']
        expression_matrix_data = expression_matrix_obj['data']

        expression_matrix_name = expression_matrix_info[1]

        if not filtered_expression_matrix_name:
            if re.match('.*_*[Ee]xpression_*[Mm]atrix', expression_matrix_name):
                filtered_expression_matrix_name = re.sub('_*[Ee]xpression_*[Mm]atrix',
                                                         filtered_expression_matrix_suffix,
                                                         expression_matrix_name)
            else:
                filtered_expression_matrix_name = expression_matrix_name + \
                    filtered_expression_matrix_suffix

        filtered_expression_matrix_data = expression_matrix_data.copy()

        data = filtered_expression_matrix_data['data']

        row_ids = data['row_ids']
        values = data['values']
        filtered_data = data.copy()

        filtered_row_ids = list()
        filtered_values = list()
        for pos, row_id in enumerate(row_ids):
            if row_id in feature_ids:
                filtered_row_ids.append(row_id)
                filtered_values.append(values[pos])

        filtered_data['row_ids'] = filtered_row_ids
        filtered_data['values'] = filtered_values
        filtered_expression_matrix_data['data'] = filtered_data

        expression_obj = {'type': expression_matrix_info[2], 'data': filtered_expression_matrix_data,
                          'name': filtered_expression_matrix_name}
        # save the ref of the filtering DifferentialExpressionMatrix (DEM) in an
        # ExpressionMatrix field added for this purpose
        if diff_expression_matrix_ref:
            expression_obj['data']['diff_expr_matrix_ref'] = diff_expression_matrix_ref
            expression_obj['extra_provenance_input_refs'] = [diff_expression_matrix_ref]

        save_object_params = {
            'id': workspace_id,
            'objects': [expression_obj]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        filtered_expression_matrix_ref = "{}/{}/{}".format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        return filtered_expression_matrix_ref

    def _xor(self, a, b):
        return bool(a) != bool(b)
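    # logical xor: true when exactly one argument is truthy; used in
    # upload_featureset_from_diff_expr to require either run_all_combinations
    # or condition_pairs, but not both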

    def _check_input_labels(self, condition_pairs, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        for condition_pair in condition_pairs:

            label_string = condition_pair['label_string'][0].strip()
            label_list = [x.strip() for x in label_string.split(',')]
            first_label = label_list[0]
            second_label = label_list[1]

            if first_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(first_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if second_label not in available_condition_labels:
                error_msg = 'Condition: {} is not available. '.format(second_label)
                error_msg += 'Available conditions: {}'.format(available_condition_labels)
                raise ValueError(error_msg)

            if first_label == second_label:
                raise ValueError('Input conditions are the same')

        return checked

    def _get_condition_labels(self, diff_expression_set_ref):
        """
        _get_condition_labels: get all possible condition label pairs
        """
        log('getting all possible condition pairs')

        condition_label_pairs = list()
        available_condition_labels = set()
        diff_expression_set_obj = self.ws.get_objects2({'objects':
                                                       [{'ref': diff_expression_set_ref}]
                                                        })['data'][0]
        diff_expression_set_data = diff_expression_set_obj['data']
        items = diff_expression_set_data.get('items')
        for item in items:
            label_string = item['label']
            label_list = [x.strip() for x in label_string.split(',')]
            condition_label_pairs.append(label_list)
            available_condition_labels |= set(label_list)

        log('all possible condition pairs:\n{}'.format(condition_label_pairs))

        return condition_label_pairs, available_condition_labels

    def _get_feature_ids(self, genome_ref, ids):
        """
        _get_feature_ids: get feature ids from genome
        """

        genome_features = self.gsu.search({'ref': genome_ref,
                                           'limit': len(ids),
                                           'structured_query': {"$or": [{"feature_id": x}
                                                                        for x in ids]},
                                           'sort_by': [['feature_id', True]]})['features']

        feature_ids = set(feature.get('feature_id') for feature in genome_features)

        return feature_ids

    def _build_fs_obj(self, params):
        new_feature_set = {
            'description': '',
            'element_ordering': [],
            'elements': {}
        }
        genome_ref = params['genome']
        if params.get('base_feature_sets', []) and None not in params['base_feature_sets']:
            base_feature_sets = self.dfu.get_objects(
                {'object_refs': params['base_feature_sets']}
            )['data']
            for ret in base_feature_sets:
                base_set = ret['data']
                base_set_name = ret['info'][1]

                new_feature_set['element_ordering'] += [x for x in base_set['element_ordering']
                                                        if x not in new_feature_set['elements']]
                for element, genome_refs in base_set['elements'].items():
                    if element in new_feature_set['elements']:
                        new_feature_set['elements'][element] += [x for x in genome_refs if x not in
                                                                 new_feature_set['elements'][
                                                                     element]]
                    else:
                        new_feature_set['elements'][element] = genome_refs
                new_feature_set['description'] += 'From FeatureSet {}: {}\n'.format(
                    base_set_name, base_set.get('description'))
        new_feature_ids = []
        if params.get('feature_ids'):
            if isinstance(params['feature_ids'], str):
                new_feature_ids += params['feature_ids'].split(',')
            else:
                new_feature_ids += params['feature_ids']
        if params.get('feature_ids_custom'):
            new_feature_ids += params['feature_ids_custom'].split(',')
        if new_feature_ids:
            genome_feature_ids = self._get_feature_ids(genome_ref, new_feature_ids)
        for new_feature in new_feature_ids:
            if new_feature not in genome_feature_ids:
                raise ValueError('Feature ID {} does not exist in the supplied genome {}'.format(
                    new_feature, genome_ref))
            if new_feature in new_feature_set['elements']:
                if genome_ref not in new_feature_set['elements'][new_feature]:
                    new_feature_set['elements'][new_feature].append(genome_ref)
            else:
                new_feature_set['elements'][new_feature] = [genome_ref]
                new_feature_set['element_ordering'].append(new_feature)

        if params.get('description'):
            new_feature_set['description'] = params['description']

        return new_feature_set

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.ws = Workspace(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.scratch = config['scratch']

    def upload_featureset_from_diff_expr(self, params):
        """
        upload_featureset_from_diff_expr: create FeatureSet from RNASeqDifferentialExpression
                                          based on given threshold cutoffs

        required params:
        diff_expression_ref: DifferentialExpressionMatrixSet object reference
        expression_matrix_ref: ExpressionMatrix object reference
        p_cutoff: p value cutoff
        q_cutoff: q value cutoff
        fold_scale_type: one of ["linear", "log2+1", "log10+1"]
        fold_change_cutoff: fold change cutoff
        feature_set_suffix: Result FeatureSet object name suffix
        filtered_expression_matrix_suffix: Result ExpressionMatrix object name suffix
        workspace_name: the name of the workspace it gets saved to

        return:
        result_directory: folder path that holds all files generated
        up_feature_set_ref_list: list of generated up-regulated FeatureSet object references
        down_feature_set_ref_list: list of generated down-regulated FeatureSet object references
        filtered_expression_matrix_ref_list: list of generated filtered ExpressionMatrix object ref
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        self._validate_upload_featureset_from_diff_expr_params(params)

        diff_expression_set_ref = params.get('diff_expression_ref')
        diff_expression_set_info = self.ws.get_object_info3({"objects":
                                                            [{"ref": diff_expression_set_ref}]}
                                                            )['infos'][0]
        diff_expression_set_name = diff_expression_set_info[1]

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        (available_condition_label_pairs,
         available_condition_labels) = self._get_condition_labels(diff_expression_set_ref)

        run_all_combinations = params.get('run_all_combinations')
        condition_pairs = params.get('condition_pairs')
        if not self._xor(run_all_combinations, condition_pairs):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide partial condition pairs. Don't do both or neither"
            raise ValueError(error_msg)

        if run_all_combinations:
            condition_label_pairs = available_condition_label_pairs
        else:
            if self._check_input_labels(condition_pairs, available_condition_labels):
                condition_label_pairs = list()
                for condition_pair in condition_pairs:
                    label_string = condition_pair['label_string'][0].strip()
                    condition_labels = [x.strip() for x in label_string.split(',')]
                    condition_label_pairs.append(condition_labels)

        up_feature_set_ref_list = list()
        down_feature_set_ref_list = list()
        filtered_expression_matrix_ref_list = list()

        for condition_label_pair in condition_label_pairs:
            condition_string = '-'.join(reversed(condition_label_pair))
            diff_expr_matrix_file, genome_id, diff_expr_matrix_ref = self._process_diff_expression(
                                                                diff_expression_set_ref,
                                                                result_directory,
                                                                condition_label_pair)
            up_feature_ids, down_feature_ids = self._process_matrix_file(
                                                                diff_expr_matrix_file,
                                                                params.get('p_cutoff'),
                                                                params.get('q_cutoff'),
                                                                params.get('fold_change_cutoff'))
            filtered_em_name = (_sanitize_name(condition_string) +
                                params.get('filtered_expression_matrix_suffix', ''))
            if params.get('expression_matrix_ref'):
                filtered_expression_matrix_ref = self._filter_expression_matrix(
                                                params.get('expression_matrix_ref'),
                                                up_feature_ids + down_feature_ids,
                                                params.get('workspace_name'), "",
                                                diff_expr_matrix_ref, filtered_em_name)
                filtered_expression_matrix_ref_list.append(filtered_expression_matrix_ref)

            feature_set_suffix = params.get('feature_set_suffix', "")
            up_feature_set_name = "{}_{}_up{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            up_feature_set_ref = self._generate_feature_set(up_feature_ids,
                                                            genome_id,
                                                            params.get('workspace_name'),
                                                            up_feature_set_name)
            up_feature_set_ref_list.append(up_feature_set_ref)

            down_feature_set_name = "{}_{}_down{}".format(
                diff_expression_set_name, _sanitize_name(condition_string), feature_set_suffix)
            down_feature_set_ref = self._generate_feature_set(down_feature_ids,
                                                              genome_id,
                                                              params.get('workspace_name'),
                                                              down_feature_set_name)
            down_feature_set_ref_list.append(down_feature_set_ref)

        returnVal = {'result_directory': result_directory,
                     'up_feature_set_ref_list': up_feature_set_ref_list,
                     'down_feature_set_ref_list': down_feature_set_ref_list,
                     'filtered_expression_matrix_ref_list': filtered_expression_matrix_ref_list}

        report_output = self._generate_report(up_feature_set_ref_list, down_feature_set_ref_list,
                                              filtered_expression_matrix_ref_list,
                                              params.get('workspace_name'))
        returnVal.update(report_output)

        return returnVal

    def filter_matrix_with_fs(self, params):
        self.validate_params(params, ('feature_set_ref', 'workspace_name',
                                      'expression_matrix_ref', 'filtered_expression_matrix_suffix'))
        ret = self.dfu.get_objects(
            {'object_refs': [params['feature_set_ref']]}
        )['data'][0]
        feature_set = ret['data']
        feature_set_name = ret['info'][1]
        feature_ids = set(feature_set['elements'].keys())
        filtered_matrix_ref = self._filter_expression_matrix(
            params['expression_matrix_ref'], feature_ids, params['workspace_name'],
            params['filtered_expression_matrix_suffix'])

        objects_created = [{'ref': filtered_matrix_ref,
                            'description': 'Filtered ExpressionMatrix Object'}]
        message = "Filtered Expression Matrix based of the {} feature ids present in {}"\
            .format(len(feature_ids), feature_set_name)

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'filtered_expression_matrix_ref': filtered_matrix_ref,
                'report_name': output['name'], 'report_ref': output['ref']}

    def build_feature_set(self, params):
        self.validate_params(params, {'output_feature_set', 'workspace_name'},
                             {'genome', 'feature_ids', 'feature_ids_custom', 'base_feature_sets',
                              'description'})
        feature_sources = ('feature_ids', 'feature_ids_custom', 'base_feature_sets')
        if not any([params.get(x) for x in feature_sources]):
            raise ValueError("You must supply at least one feature source: {}".format(
                ", ".join(feature_sources)))
        workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])

        new_feature_set = self._build_fs_obj(params)
        save_object_params = {
            'id': workspace_id,
            'objects': [{'type': 'KBaseCollections.FeatureSet',
                         'data': new_feature_set,
                         'name': params['output_feature_set']}]}

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        feature_set_obj_ref = '{}/{}/{}'.format(dfu_oi[6], dfu_oi[0], dfu_oi[4])

        objects_created = [{'ref': feature_set_obj_ref,
                            'description': 'Feature Set'}]
        message = 'A new feature set containing {} features was created.'.format(
            len(new_feature_set['elements']))

        report_params = {'message': message,
                         'workspace_name': params['workspace_name'],
                         'objects_created': objects_created,
                         'report_object_name': 'kb_FeatureSetUtils_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        return {'feature_set_ref': feature_set_obj_ref,
                'report_name': output['name'], 'report_ref': output['ref']}
Beispiel #35
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        # use a context manager so the TSV is flushed and closed before returning
        with open(files['file_path'], 'w') as tsv_file:
            writer = csv.DictWriter(tsv_file, header, delimiter='\t',
                                    lineterminator='\n')
            writer.writeheader()
            for feat in fs_dicts:
                writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
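        # group feature ids by genome ref so each genome is searched only once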
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(sorted(feat['aliases'].keys())),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}