Example #1
def download_workspace_data(ws_url, source_ws, source_obj, working_dir,
                            logger):
    ws = Workspace(ws_url, token=TOKEN)
    objdata = ws.get_objects([{'ref': source_ws + '/' + source_obj}])[0]
    info = objdata['info']
    if info[2].split('-')[0] != 'KBaseFile.AssemblyFile':
        raise ValueError(
            'This method only works on the KBaseFile.AssemblyFile type')
    shock_url = objdata['data']['assembly_file']['file']['url']
    shock_id = objdata['data']['assembly_file']['file']['id']
    ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    source = objdata['data'].get('source')

    outfile = os.path.join(working_dir, source_obj)
    shock_node = shock_url + '/node/' + shock_id + '/?download'
    headers = {'Authorization': 'OAuth ' + TOKEN}
    with open(outfile, 'wb') as f:  # binary mode: the download is raw bytes
        response = requests.get(shock_node, stream=True, headers=headers)
        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                logger.error("Couldn't parse response error content: " +
                             response.content)
                response.raise_for_status()
            raise Exception(str(err))
        for block in response.iter_content(1024):
            if not block:
                break
            f.write(block)

    return shock_url, shock_id, ref, source
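A minimal usage sketch for the function above (the service URL, workspace, object name and directory are placeholders; TOKEN is assumed to be set at module level, as the function itself requires):

import logging

logger = logging.getLogger(__name__)
shock_url, shock_id, ref, source = download_workspace_data(
    'https://kbase.us/services/ws/',  # hypothetical workspace URL
    'MyWorkspace', 'my_assembly', '/tmp/work', logger)
print(ref)  # e.g. '123/45/6'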
Example #2
def get_object_uid(name):
    WS_URL = 'https://ci.kbase.us/services/ws/'
    from biokbase.workspace.client import Workspace
    ws = Workspace(WS_URL)
    info = ws.get_objects(
        [dict(workspace=os.environ['KB_WORKSPACE_ID'], name=name)])[0]['info']
    return '%s/%s/%s' % (info[6], info[0], info[4])
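For orientation: info is the standard eleven-item KBase object-info tuple, so index 6 is the numeric workspace id, index 0 the object id, and index 4 the version, giving the ws/obj/ver reference that several examples below parse. A purely hypothetical illustration:

# info = [45, 'my_object', 'KBaseGenomes.Genome-1.0', '2015-01-01T00:00:00+0000',
#         6, 'someuser', 123, 'my_ws', '<md5>', 1000, {}]
# get_object_uid('my_object')  ->  '123/45/6'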
Example #3
    def TophatCall(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        # BEGIN TophatCall
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        hs = HandleService(url=self.__HS_URL, token=user_token)
        try:
            ### Make a function to download the workspace object and prepare a dict of genome, lib_type

            self.__LOGGER.info("Downloading RNASeq Sample file")
            try:
                ret = ws_client.get_objects(
                    [
                        {"name": params["sample_id"], "workspace": params["ws_id"]},
                        {"name": params["reference"], "workspace": params["ws_id"]},
                        {"name": params["bowtie_index"], "workspace": params["ws_id"]},
                        {"name": params["annotation_gtf"], "workspace": params["ws_id"]},
                    ]
                )
            except Exception, e:
                raise KBaseRNASeqException("Error downloading objects from the workspace: {0}".format(e))

            # Download reads from the JSON object
            genome = params["reference"]
            reads = ret[0]  # assumed: the first object fetched above is the RNASeq sample
            if "data" in reads:
                # if 'metadata' in reads['data']:
                # genome = reads['data']['metadata']['ref_genome']
                if "singleend_sample" in reads["data"]:
                    lib_type = "SingleEnd"
                    # cmdstring =
                elif "pairedend_sample" in reads["data"]:
                    lib_type = "PairedEnd"
Example #4
def get_object_uid(name):
    WS_URL = "https://ci.kbase.us/services/ws/"
    from biokbase.workspace.client import Workspace

    ws = Workspace(WS_URL)
    info = ws.get_objects([dict(workspace=os.environ["KB_WORKSPACE_ID"], name=name)])[0]["info"]
    return "%s/%s/%s" % (info[6], info[0], info[4])
Example #5
def get_object_from_ref(ref):
    objid = int(ref.split("/")[1])
    WS_URL = "https://ci.kbase.us/services/ws/"
    from biokbase.workspace.client import Workspace

    ws = Workspace(WS_URL)
    return ws.get_objects([dict(workspace=os.environ["KB_WORKSPACE_ID"], objid=objid)])[0]["data"]
Example #6
def download_workspace_data(ws_url, source_ws, source_obj, working_dir,
                            logger):
    ws = Workspace(ws_url, token=TOKEN)
    objdata = ws.get_objects([{'ref': source_ws + '/' + source_obj}])[0]
    info = objdata['info']
    if info[2].split('-')[0] != 'KBaseFile.AssemblyFile':
        raise ValueError(
            'This method only works on the KBaseFile.AssemblyFile type')
    shock_url = objdata['data']['assembly_file']['file']['url']
    shock_id = objdata['data']['assembly_file']['file']['id']
    ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    source = objdata['data'].get('source')

    outfile = os.path.join(working_dir, source_obj)
    shock_node = shock_url + '/node/' + shock_id + '/?download'
    headers = {'Authorization': 'OAuth ' + TOKEN}
    with open(outfile, 'wb') as f:  # binary mode: the download is raw bytes
        response = requests.get(shock_node, stream=True, headers=headers)
        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                logger.error("Couldn't parse response error content: " +
                             response.content)
                response.raise_for_status()
            raise Exception(str(err))
        for block in response.iter_content(1024):
            if not block:
                break
            f.write(block)

    return shock_url, shock_id, ref, source
Example #7
    def get_probanno(self, ctx, input):
        # ctx is the context object
        # return variables are: output
        #BEGIN get_probanno
        ''' Convert a probabilistic annotation object into a human-readable table.

            @param ctx Current context object
            @param input Dictionary with input parameters for function
            @return Dictionary keyed by gene to a list of tuples with roleset and likelihood
            @raise WrongVersionError when ProbAnno object version number is invalid
        '''

        input = self._checkInputArguments(ctx, input,
                                          ['probanno', 'probanno_workspace'],
                                          { 'probanno_version': None }
                                          )

        wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])
        probAnnoObjectId = make_object_identity(input["probanno_workspace"], input["probanno"], input['probanno_version'])
        objectList = wsClient.get_objects( [ probAnnoObjectId ] )
        probAnnoObject = objectList[0]
        if probAnnoObject['info'][2] != ProbAnnoType:
            message = 'ProbAnno object type %s is not %s for object %s' %(probAnnoObject['info'][2], ProbAnnoType, probAnnoObject['info'][1])
            ctx.log_err(message)
            raise WrongVersionError(message)
        output = probAnnoObject["data"]["roleset_probabilities"]

        #END get_probanno

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method get_probanno return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
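Per the docstring, the returned output maps each gene to a list of (roleset, likelihood) tuples; a purely hypothetical slice of that structure:

# output = {
#     'kb|g.0.peg.123': [('TIGR00001', 0.85), ('TIGR00002', 0.10)],
#     'kb|g.0.peg.456': [('TIGR00099', 0.42)],
# }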
Example #8
    def associateReads(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        # BEGIN associateReads
        user_token = ctx["token"]
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        out = dict()
        out["metadata"] = {
            k: v
            for k, v in params.iteritems()
            if k not in ("ws_id", "analysis_id", "genome_id", "singleend_sample", "pairedend_sample") and v is not None
        }
        self.__LOGGER.info("Uploading RNASeqSample {0}".format(out["metadata"]["sample_id"]))
        if "genome_id" in params and params["genome_id"] is not None:
            out["metadata"]["genome_id"] = script_util.get_obj_info(
                self.__LOGGER, self.__WS_URL, [params["genome_id"]], params["ws_id"], user_token
            )[0]
        if "analysis_id" in params and params["analysis_id"] is not None:
            g_ref = script_util.get_obj_info(
                self.__LOGGER, self.__WS_URL, [params["analysis_id"]], params["ws_id"], user_token
            )[0]
            out["analysis_id"] = g_ref
        if "singleend_sample" in params and params["singleend_sample"] is not None:
            try:
                s_res = ws_client.get_objects([{"name": params["singleend_sample"], "workspace": params["ws_id"]}])
                out["singleend_sample"] = s_res[0]["data"]
                print out["singleend_sample"]
            except Exception, e:
                raise KBaseRNASeqException(
                    "Error Downloading SingleEndlibrary object from the workspace {0},{1}".format(
                        params["singleend_sample"], e
                    )
                )
Example #9
def get_object_from_ref(ref):
    objid = int(ref.split('/')[1])
    WS_URL = 'https://ci.kbase.us/services/ws/'
    from biokbase.workspace.client import Workspace
    ws = Workspace(WS_URL)
    return ws.get_objects(
        [dict(workspace=os.environ['KB_WORKSPACE_ID'],
              objid=objid)])[0]['data']
Example #10
    def test_annotate(self):
        ''' Run pa-annotate on a valid Genome object and verify that the job runs and returns a valid ProbAnno object in the expected time.'''

        # Run the annotate() function to generate a ProbAnno object.
        paClient = ProbabilisticAnnotation(self._config["probanno_url"],
                                           token=self._token)
        jobid = paClient.annotate({
            "genome": self._config["genomeid"],
            "genome_workspace": self._config["test_ws"],
            "probanno": self._config["probannoid"],
            "probanno_workspace": self._config["test_ws"]
        })

        # Allow time for the command to run.
        time.sleep(float(self._config["runtime"]))

        # Make sure the job has completed.
        ujsClient = UserAndJobState(self._config['ujs_url'], token=self._token)
        jobList = ujsClient.list_jobs([self._config['test_user']], 'CE')
        jobCompleted = False
        for job in jobList:
            if jobid == job[0]:
                jobCompleted = True
                jobInfo = job
        self.assertTrue(
            jobCompleted, 'Job did not complete before timeout of %s seconds' %
            (self._config['runtime']))

        # See if the job ended in error.
        details = ''
        if jobInfo[11] == 1:
            details = ujsClient.get_detailed_error(jobInfo[0])
        self.assertEqual(jobInfo[11], 0, 'Job ended in error: %s' % (details))

        # Look for the ProbAnno object in the test workspace.
        wsClient = Workspace(self._config["workspace_url"], token=self._token)
        try:
            probannoObjectId = {
                'workspace': self._config['test_ws'],
                'name': self._config['probannoid']
            }
            objectList = wsClient.get_objects([probannoObjectId])
            probannoObject = objectList[0]
            self.assertEqual(
                probannoObject['info'][1], self._config['probannoid'],
                'ProbAnno object id %s is not %s' %
                (probannoObject['info'][1], self._config['probannoid']))
        except WorkspaceServerError as e:
            traceback.print_exc(file=sys.stderr)
            self.fail(
                msg=
                "The expected object %s did not get created in the workspace %s!\n"
                % (self._config["probannoid"], self._config["test_ws"]))
Example #11
def fetch_narrative(nar_id, auth_token, url=ci_ws, file_name=None):
    """
    Fetches a Narrative object with the given reference id (of the form ##/##).
    If a file_name is given, then it is printed to that file.
    If the narrative is found, the jsonized string of it is returned.

    If nothing is found, an empty Dict is returned.
    """
    ws_client = Workspace(url=url, token=auth_token)
    nar_data = ws_client.get_objects([{'ref':nar_id}])
    if len(nar_data) > 0:
        nar_json = json.dumps(nar_data[0])
        if file_name is not None:
            f = open(file_name, 'w')
            f.write(nar_json)
            f.close()
        return nar_json
    return {}
Example #12
def fetch_narrative(nar_id, auth_token, url=ci_ws, file_name=None):
    """
    Fetches a Narrative object with the given reference id (of the form ##/##).
    If a file_name is given, then it is printed to that file.
    If the narrative is found, the jsonized string of it is returned.

    If nothing is found, an empty Dict is returned.
    """
    ws_client = Workspace(url=url, token=auth_token)
    nar_data = ws_client.get_objects([{"ref": nar_id}])
    if len(nar_data) > 0:
        nar_json = json.dumps(nar_data[0])
        if file_name is not None:
            f = open(file_name, "w")
            f.write(nar_json)
            f.close()
        return nar_json
    return {}
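Note that fetch_narrative returns a JSON string on success but an empty dict when nothing is found, so callers should not assume a single return type; a small sketch (the narrative id and token are placeholder values):

nar = fetch_narrative('123/45', 'my-auth-token')
if nar:  # the empty dict and an empty string are both falsy
    narrative = json.loads(nar)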
Example #13
    def test_annotate(self):
        ''' Run pa-annotate on a valid Genome object and verify that the job runs and returns a valid ProbAnno object in the expected time.'''

        # Run the annotate() function to generate a ProbAnno object.
        paClient = ProbabilisticAnnotation(self._config["probanno_url"], token=self._token)
        jobid = paClient.annotate( {
            "genome": self._config["genomeid"],
            "genome_workspace": self._config["test_ws"],
            "probanno": self._config["probannoid"],
            "probanno_workspace": self._config["test_ws"] } )
        
        # Allow time for the command to run.
        time.sleep(float(self._config["runtime"]))
        
        # Make sure the job has completed.
        ujsClient = UserAndJobState(self._config['ujs_url'], token=self._token)
        jobList = ujsClient.list_jobs([ self._config['test_user'] ], 'CE')
        jobCompleted = False
        for job in jobList:
            if jobid == job[0]:
                jobCompleted = True
                jobInfo = job
        self.assertTrue(jobCompleted, 'Job did not complete before timeout of %s seconds' %(self._config['runtime']))
        
        # See if the job ended in error.
        details = ''
        if jobInfo[11] == 1:
            details = ujsClient.get_detailed_error(jobInfo[0])
        self.assertEqual(jobInfo[11], 0, 'Job ended in error: %s' %(details))

        # Look for the ProbAnno object in the test workspace.
        wsClient = Workspace(self._config["workspace_url"], token=self._token)
        try:
            probannoObjectId = { 'workspace': self._config['test_ws'], 'name': self._config['probannoid'] }
            objectList = wsClient.get_objects( [ probannoObjectId ] )
            probannoObject = objectList[0]
            self.assertEqual(probannoObject['info'][1], self._config['probannoid'], 'ProbAnno object id %s is not %s' %(probannoObject['info'][1], self._config['probannoid']))
        except WorkspaceServerError as e:
            traceback.print_exc(file=sys.stderr)
            self.fail(msg = "The expected object %s did not get created in the workspace %s!\n" %(self._config["probannoid"], self._config["test_ws"]))
Example #14
    def test_calculate(self):
        ''' Run pa-calculate on a valid ProbAnno object and verify that the job runs and returns a valid RxnProbs object.'''

        # Run the calculate() function to generate a RxnProbs object.
        paClient = ProbabilisticAnnotation(self._config["probanno_url"], token=self._token)
        rxnprobsMetadata = paClient.calculate( {
            "probanno":           self._config["probannoid"],
            "probanno_workspace": self._config["test_ws"],
            "rxnprobs":           self._config["rxnprobsid"],
            "rxnprobs_workspace": self._config["test_ws"]
            } )

        # Look for the RxnProbs object in the test workspace.
        wsClient = Workspace(self._config["workspace_url"], token=self._token)
        try:
            rxnprobsObjectId = { 'workspace': self._config['test_ws'], 'name': self._config['rxnprobsid'] }
            objectList = wsClient.get_objects( [ rxnprobsObjectId ] )
            rxnprobsObject = objectList[0]
            self.assertEqual(rxnprobsObject['info'][1], self._config['rxnprobsid'], 'RxnProbs object id %s is not %s' %(rxnprobsObject['info'][1], self._config['rxnprobsid']))
        except WorkspaceServerError as e:
            traceback.print_exc(file=sys.stderr)
            self.fail(msg = "The expected object %s did not get created in the workspace %s!\n" %(self._config["rxnprobsid"], self._config["test_ws"]))
Example #15
    def get_rxnprobs(self, ctx, input):
        # ctx is the context object
        # return variables are: output
        #BEGIN get_rxnprobs
        ''' Convert a reaction probability object into a human-readable table.

            @param ctx Current context object
            @param input Dictionary with input parameters for function
            @return List of reaction_probability tuples
            @raise WrongVersionError when RxnProbs object version number is invalid
        '''

        # Sanity check on input arguments
        input = self._checkInputArguments(ctx, input, 
                                          [ "rxnprobs", "rxnprobs_workspace" ], 
                                          { 'rxnprobs_version': None, 'sort_field': 'rxnid' }
                                          )

        wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])
        rxnProbsObjectId = make_object_identity(input["rxnprobs_workspace"], input["rxnprobs"], input['rxnprobs_version'])
        objectList = wsClient.get_objects( [ rxnProbsObjectId ] )
        rxnProbsObject = objectList[0]
        if rxnProbsObject['info'][2] != RxnProbsType:
            message = 'RxnProbs object type %s is not %s for object %s' %(rxnProbsObject['info'][2], RxnProbsType, rxnProbsObject['info'][1])
            ctx.log_err(message)
            raise WrongVersionError(message)
        output = rxnProbsObject["data"]["reaction_probabilities"]
        if input['sort_field'] == 'rxnid':
            output.sort(key=lambda tup: tup[0])
        elif input['sort_field'] == 'probability':
            output.sort(key=lambda tup: tup[1], reverse=True)
        #END get_rxnprobs

        # At some point might do deeper type checking...
        if not isinstance(output, list):
            raise ValueError('Method get_rxnprobs return value ' +
                             'output is not type list as required.')
        # return the results
        return [output]
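The input dictionary get_rxnprobs expects, reconstructed from the checks above (key names come from the code; the values are illustrative):

# input = {
#     'rxnprobs': 'MyRxnProbs',              # object name (required)
#     'rxnprobs_workspace': 'MyWorkspace',   # workspace name (required)
#     'rxnprobs_version': None,              # optional; None means the latest version
#     'sort_field': 'probability',           # 'rxnid' (default) or 'probability'
# }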
Example #16
    def test_calculate(self):
        ''' Run pa-calculate on a valid ProbAnno object and verify that the job runs and returns a valid RxnProbs object.'''

        # Run the calculate() function to generate a RxnProbs object.
        paClient = ProbabilisticAnnotation(self._config["probanno_url"],
                                           token=self._token)
        rxnprobsMetadata = paClient.calculate({
            "probanno":
            self._config["probannoid"],
            "probanno_workspace":
            self._config["test_ws"],
            "rxnprobs":
            self._config["rxnprobsid"],
            "rxnprobs_workspace":
            self._config["test_ws"]
        })

        # Look for the RxnProbs object in the test workspace.
        wsClient = Workspace(self._config["workspace_url"], token=self._token)
        try:
            rxnprobsObjectId = {
                'workspace': self._config['test_ws'],
                'name': self._config['rxnprobsid']
            }
            objectList = wsClient.get_objects([rxnprobsObjectId])
            rxnprobsObject = objectList[0]
            self.assertEqual(
                rxnprobsObject['info'][1], self._config['rxnprobsid'],
                'RxnProbs object id %s is not %s' %
                (rxnprobsObject['info'][1], self._config['rxnprobsid']))
        except WorkspaceServerError as e:
            traceback.print_exc(file=sys.stderr)
            self.fail(
                msg=
                "The expected object %s did not get created in the workspace %s!\n"
                % (self._config["rxnprobsid"], self._config["test_ws"]))
Example #17
    def diff_p_distribution(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN diff_p_distribution
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        cmd_download_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        # force ANOVA when there are only two sample columns
        if ncol == 3:
            param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-n', '10', '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y', '-j', self.PVFDT_FN]
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## loading pvalue distribution FDT
        pvfdt = {'row_labels': [], 'column_labels': [], "data": [[]]}
        pvfdt = OrderedDict(pvfdt)
        with open(self.PVFDT_FN, 'r') as myfile:
           pvfdt = json.load(myfile)
        data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
        pvfdt['id'] = data_obj_name
 
 
        fig_properties = {"xlabel" : "-log2(p-value)", "ylabel" : "Number of features", "xlog_mode" : "-log2", "ylog_mode" : "none", "title" : "Histogram of P-values", "plot_type" : "histogram"}
        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : pvfdt,
                                                                              'name' : data_obj_name}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              'name' : (param['out_figure_object_name'])}]})
        result = fig_properties
        #END diff_p_distribution

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method diff_p_distribution return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
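The args dictionary diff_p_distribution reads, collected from the accesses above (key names come from the code; the values are illustrative):

# args = {
#     'workspace_name': 'MyWorkspace',
#     'object_name': 'my_expression_matrix',
#     'method': 'anova',            # overridden to 'anova' when only two sample columns exist
#     'num_features': 100,          # optional
#     'p_value': 0.05,              # optional
#     'out_figure_object_name': 'my_pvalue_histogram',
# }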
Example #18
    def const_coex_net_clust(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN const_coex_net_clust
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.CLSTR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']

        param = args
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token
        cmd_download_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
            #raise Exception(stderr)
 
        self.logger.info("Coexpression clustering analysis")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_cluster
        cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                           '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 
                           '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN) ]
 
        for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight']:
           if p in param:
             cmd_coex_cluster.append("--{0}".format(p))
             cmd_coex_cluster.append(str(param[p]))
  
 
        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
              self.logger.info(stderr)
            else:
              self.logger.error(stderr)
              raise Exception(stderr)
 
        
        # build index for gene list
        pos_index = {expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids']))}
 
 
        # parse clustering results
        cid2genelist = {}
        cid2stat = {}
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                cluster, mcor, msec = line.rstrip().replace('"','').split("\t")
                cid2stat[cluster]= [mcor, msec]
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                gene, cluster = line.rstrip().replace('"','').split("\t")
                if cluster not in cid2genelist:
                    cid2genelist[cluster] = []
                cid2genelist[cluster].append(gene)
 
        if len(cid2genelist) < 1:
            self.logger.error("Clustering failed")
            return empty_results("Error: No cluster output", expr, self.__WS_URL, param, self.logger, ws)
            #sys.exit(4)
 
        self.logger.info("Uploading the results onto WS")
        feature_clusters = []
        for cluster in cid2genelist:
            feature_clusters.append({"meancor": float(cid2stat[cluster][0]),
                                     "msec": float(cid2stat[cluster][1]),  # second statistic; the original indexed [0] twice
                                     "id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]}})

        ## Upload Clusters
        feature_clusters ={"original_data": "{0}/{1}".format(param['workspace_name'],param['object_name']),
                           "feature_clusters": feature_clusters}
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.FeatureClusters',
                                                                          'data' : feature_clusters,
                                                                          'name' : (param['out_object_name'])}]})
        result = {'workspace_name' : param['workspace_name'], 'out_object_name' : param['out_object_name']}
        #END const_coex_net_clust

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method const_coex_net_clust return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
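Likewise, the keys const_coex_net_clust reads from args (names taken from the code; the optional tuning keys are forwarded verbatim to coex_cluster):

# args = {
#     'workspace_name': 'MyWorkspace',
#     'object_name': 'my_expression_matrix',
#     'out_object_name': 'my_clusters',
#     # optional, passed through as --<name> flags:
#     # 'net_method', 'minRsq', 'maxmediank', 'maxpower',
#     # 'clust_method', 'minModuleSize', 'detectCutHeight'
# }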
Example #19
def run_filter_genes(workspace_service_url=None, param_file = None, level=logging.INFO, logger = None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url: A url for the KBase Workspace service
        param_file: JSON parameter file (includes workspace_name and object_name)
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """ 

    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
      param = json.load(paramh)

    cmd_download_cvt_tsv = [FVE_2_TSV, '--workspace_service_url', workspace_service_url,
                                      '--workspace_name', param['workspace_name'],
                                      '--object_name', param['object_name'],
                                      '--working_directory', RAWEXPR_DIR,
                                      '--output_file_name', EXPRESS_FN
                          ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
      fl = f.readline()
    ncol = len(fl.split('\t'))
    
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
      s.write("0")
      for j in range(1,ncol-1):
        s.write("\t{0}".format(j))
      s.write("\n")


    ## Run coex_filter
    cmd_coex_filter = [COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN),
                       '-m', param['method'], '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN),
                       '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y']
    if 'num_features' in param:
      cmd_coex_filter.append("-n")
      cmd_coex_filter.append(str(param['num_features']))  # Popen arguments must be strings

    if 'num_features' not in param and 'p_value' in param:
      cmd_coex_filter.append("-p")
      cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
      logger.error("One of p_value or num_features must be defined")
      sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(fl) # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)
    

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
    
    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [TSV_2_FVE, '--workspace_service_url', workspace_service_url, 
                                      '--object_name', param['out_expr_object_name'],
                                      '--working_directory', FINAL_DIR,
                                      '--input_directory', FLTRD_DIR,
                                      '--output_file_name', FINAL_FN
                          ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws, obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    
    with open("{0}/{1}".format(FINAL_DIR,FINAL_FN),'r') as et:
      eo = json.load(et)

    if 'description' in expr: expr['description'] = "{0}, coex_filter by {1}".format(expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                          'data' : expr,
                                                                          'name' : (param['out_expr_object_name'])}]})

    ## Upload FeatureSet
    fs ={'description':'Differentially expressed genes generated by {0}'.format(" ".join(cmd_coex_filter)),
         'elements': {}}
    
    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN),'r') as glh:
      gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
      if 'genome_ref' in expr:
        fs['elements'][g] = [expr['genome_ref']]
      else:
        fs['elements'][g] = []

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                          'data' : fs,
                                                                          'name' : (param['out_fs_object_name'])}]})
Example #20
    def filter_BlastOutput(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_BlastOutput
        user_token=ctx['token']
        ws_client=Workspace(url=self.__WS_URL, token=user_token)
        blast_outputs=ws_client.get_objects([{'name':params['in_id'], 
                                              'workspace': params['ws_id']}])

        fs ={'elements': {}}
        fs['description'] = "FeatureSet from BlastOutput by "
        printedEvalue = False
        printedEntries = False
        if 'evalue' in params and params['evalue'] != "":
            fs['description'] += " E-value:{0}".format(params['evalue'])
            printedEvalue = True
        if 'entries' in params and (params['entries'] != "" or params['entries'] > 0):
            if(printedEvalue): fs['description'] += ","
            fs['description'] += " # of entries :{0}".format(params['entries'])
            printedEntries = True
        if not printedEvalue and not printedEntries:
            fs['description'] += "no filtering"
        
        if len(blast_outputs) != 1:
            fs['description'] = "No such blast output object was found : {0}/{1}".format(param['workspace_name'], param['object_name'])
        else:
            fm = {}
            f2g = {}
            for boid in blast_outputs[0]['data']['BlastOutput_iterations']['Iteration']:
                for hitd in boid['Iteration_hits']['Hit']:
                    print hitd['Hit_def']
                    ali = hitd['Hit_def'].find('#')
                    if ali < 0: continue  # no '#' separator in Hit_def; skip this hit
                    fid = hitd['Hit_def'][0:ali]
                    gri = hitd['Hit_def'].find('#', ali+1)
                    if fid not in f2g: f2g[fid] = {}
                    if (gri >=  0 and not gri == (ali+1)): 
                        grid = hitd['Hit_def'][(ali+1):gri]
                        f2g[fid][grid] = 1
                    for hspd in hitd['Hit_hsps']['Hsp']:
                        if fid in fm:
                            if float(hspd['Hsp_evalue']) < fm[fid]:
                                fm[fid] = float(hspd['Hsp_evalue'])
                        else: fm[fid] = float(hspd['Hsp_evalue'])
           
            fms = sorted(fm.items(), key=lambda x: x[1], reverse=False)
            bol = len(fms)
            if params['entries'] != "" or int(params['entries']) > 0:
                if(int(params['entries']) < bol):
                    bol = int(params['entries'])
            for i in range(bol):
                if(fms[i][1] > float(params['evalue'])): break
                if fms[i][0] in f2g:
                    fs['elements'][fms[i][0]] = f2g[fms[i][0]].keys()
                else:
                    fs['elements'][fms[i][0]] = []

        ws_client.save_objects(
            {"workspace":params['ws_id'],
            "objects": [{
                "type":"KBaseCollections.FeatureSet",
                "data":fs,
                "name":params['out_id']}
            ]})

        #pprint(fs)
        returnVal = {'obj_name' : params['out_id'], 'ws_id' : params['ws_id']}

        #END filter_BlastOutput

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_BlastOutput return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
Example #21
    def generate_cummerbund_plots(self, ctx, cummerbundParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plots

        params    = cummerbundParams
        returnVal = params['ws_cummerbund_output']

        #Set up workspace client
        user_token = ctx['token']
        ws_client  = Workspace(url=self.__WS_URL, token=user_token)

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name' : params['ws_cuffdiff_id'],
            'workspace' : params['workspace_name']
            }])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        # Get input data Shock Id and Filename.
        cuffdiff_shock_id = s_res[0]['data']['file']['id']
        cuffdiff_file_name = s_res[0]['data']['file']['file_name']

        #cuffdiff_file_name =None 
        filesize = None

        # Download tar file
        dx = script_util.download_file_from_shock( self.__LOGGER, 
            self.__SHOCK_URL, cuffdiff_shock_id, cuffdiff_file_name,
            self.__SCRATCH, filesize, user_token)
    
        #Decompress tar file and keep it in a directory
        tarfile = join(self.__SCRATCH, cuffdiff_file_name)
        dstnExtractFolder = join(self.__SCRATCH, "cuffdiffData")
        if not os.path.exists(dstnExtractFolder):
            os.makedirs(dstnExtractFolder)

        untarStatus = script_util2.untar_files(self.__LOGGER, tarfile, dstnExtractFolder)
        if untarStatus == False:
            self.__LOGGER.info("Problem extracting the archive")
            return returnVal

        foldersinExtractFolder = os.listdir(dstnExtractFolder)

        if len(foldersinExtractFolder) == 0:
            self.__LOGGER.info("Problem extracting the archive")
            return returnVal

        # Run R script to run cummerbund json and update the cummerbund output json file
        cuffdiff_dir = join(dstnExtractFolder, foldersinExtractFolder[0])
	self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        # Prepare output object.
        outputobject=dict()

        # Prepare output plot list
        cummerbundplotset=[]

        # List of plots to generate
        plotlist = [
                { 'file': "dispersionplot.R",
                  'title': "Dispersion plot",
                  'description': "Dispersion plot" },
                { 'file': "pcaplot.R",
                  'title': "PCA plot",
                  'description': "PCA plot" },
                { 'file': "fpkmscvplot.R",
                  'title': "FPKM SCV plot",
                  'description': "FPKM SCV plot" }
            ]

        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token,
                cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir)
            if status == False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["file"])


        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        #TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        res = ws_client.save_objects({
            "workspace":params['workspace_name'],
            "objects": [{
                "type":"KBaseRNASeq.cummerbund_output",
                "data":outputobject,
                "name":params["ws_cummerbund_output"]}]
            })

        #END generate_cummerbund_plots

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plots return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
Example #22
    def create_interactive_heatmap_de_genes(self, ctx, interactiveHeatmapParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_interactive_heatmap_de_genes
        fparams    = interactiveHeatmapParams
        returnVal  = fparams['ws_expression_matrix_id']  # default; the early return below needs this defined
        #Set up workspace client
        user_token = ctx['token']
        workspace = fparams['workspace_name']
        ws_client  = Workspace(url=self.__WS_URL, token=user_token)
        system_params = {}
        system_params['token'] = user_token
        system_params['ws_url'] =  self.__WS_URL
        system_params['logger'] =  self.__LOGGER
        system_params['shock_url'] =  self.__SHOCK_URL
        system_params['hs_url'] =  self.__HS_URL
        system_params['scratch'] =  self.__SCRATCH
        system_params['rscripts'] =  self.__RSCRIPTS
        system_params['workspace'] = workspace

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name' : fparams['ws_cuffdiff_id'],
            'workspace' : fparams['workspace_name']
            }])

         #Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal
        cuffdiff_dir = join (self.__SCRATCH , "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        #cuffdiff_dir = "/kb/module/work/nnc/cuffdiff"
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)


        #if (cuffdiff_dir is False):
        #    return returnVal
        fparams['cuffdiff_dir'] = cuffdiff_dir
        fparams['infile'] = join (cuffdiff_dir, "gene_exp.diff")
        fparams['outfile'] = join(system_params['scratch'],  "gene_exp.diff.filter")

        filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
        self.__LOGGER.info("matrix is " + filtered_matrix)

        fparams['infile'] = join (system_params['scratch'], "gene_exp.diff.filter")
        fparams['outfile'] = join(system_params['scratch'],  "gene_exp.diff.filter.genelist")

        genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)


        # Prepare output object.
        outjson = False
 

        rparams = {}
        rparams['genelist'] = filtered_matrix
        rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
        rparams['outpng'] = join (system_params['scratch'], "heatmap.png")
        rparams['imageheight'] = 1600
        rparams['imagewidth'] = 800
        rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R")
        rparams['include_replicates'] = 1
        rparams['outmatrix'] = join (system_params['scratch'], "outmatrix")

        roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic (rparams)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject=dict()

        # Prepare output plot list
        cummerbundplotset=[]

        # List of plots to generate
        plotlist = [
                { 'roptstr': roptstr_basic_heatmap_rep,
                  'title': "Heatmap",
                  'description': "Heatmap", 
                  'exp' : fparams['ws_expression_matrix_id']
                  }

            ]
        fparams['cummerbundplotset'] = cummerbundplotset
        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            fparams['title'] = plot['title']
            fparams['description'] = plot['description']


            status = script_util2.rplotanduploadinteractive(system_params,fparams, rparams, plot['roptstr'])
            if status == False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"])
            else:
                self.__LOGGER.info(status)

                outjson = status
                with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2:
                    eo2 = json.load(et2)
                    genome_ref = s_res[0]['data']['genome_id']
                    eo2['type'] = 'untransformed'
                    #eo2['genome_ref'] = genome_ref
                    self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp'])
                    ws_client.save_objects({'workspace': workspace,
                                            'objects': [{'type': 'KBaseFeatureValues.ExpressionMatrix',
                                                         'data': eo2,
                                                         'name': plot['exp']
                                                         }]})

        returnVal = fparams['ws_expression_matrix_id']

        #END create_interactive_heatmap_de_genes

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method create_interactive_heatmap_de_genes return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
Example #23
    def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
        """
        :param cummerbundstatParams: instance of type "cummerbundstatParams"
           -> structure: parameter "workspace" of String, parameter
           "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
           type "ws_diffstat_output" (Differential stat workspace id)
        :returns: instance of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plot2
        params    = cummerbundstatParams
        returnVal = params['ws_cummerbund_output']

        #Set up workspace client
        user_token = ctx['token']
        ws_client  = Workspace(url=self.__WS_URL, token=user_token)


        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name' : params['ws_cuffdiff_id'],
            'workspace' : params['workspace']
            }])
        print "Getting genome info"

        genome_ref = s_res[0]['data']['genome_id']
        #genome_ref = '2702/6/2'
        #genome_ref = '2702/26/1'
        #genome_ref = '2229/21/10'
        print genome_ref
        gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
        genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                          "included_fields": ["scientific_name"],
                                          "included_feature_fields": ["id", "function", "type"
                                                                      ]})["genomes"][0]["data"]
        genome_dict = {}
        features = genome['features']
        for feature in features:
          id = feature['id']
          try: 
            function = feature['function']
            if not function:
              function = 'Unknown'
          except:
             function = 'Unknown'
          genome_dict[id] = function


        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if (cuffdiff_dir is False):
            return returnVal

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject=dict()

        # Prepare output plot list
        cummerbundplotset=[]
        # List of plots to generate
        plotlist = [
                { 'file': "dispersionplot.R",
                  'title': "Dispersion plot",
                  'description': "The dispersion plot is a quality measure of the data; it plots the estimated deviation from the threshold against counts in FPKM." },

                { 'file': "fpkmscvplot.R",
                  'title': "Genes CV plot",
                  'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data." },

                { 'file': "isoformscvplot.R",
                  'title': "Isoform CV plot",
                  'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data. Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate FPKM estimates." },

                { 'file': "densityplot.R",
                  'title': "Density plot",
                  'description': "The density plot shows the distribution of FPKM scores across samples." },

                { 'file': "csdensityrepplot.R",
                  'title': "Replicates density plot",
                  'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates." },

                { 'file': "boxplot.R",
                  'title': "Box plots",
                  'description': "The box plots show the FPKM distribution across samples." },

                { 'file': "boxrepplot.R",
                  'title': "Box plots of replicates",
                  'description': "The box plots of replicates show the FPKM distribution across sample replicates." },

                { 'file': "pairwisescatterplots.R",
                  'title': "Pairwise scatter plots",
                  'description': "The scatter plots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line." },

                 { 'file': "volcanomatrixplot.R",
                  'title': "Volcano matrix plots",
                  'description': "The volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on a log2 fold-change cutoff." },

                { 'file': "pcaplot.R",
                  'title': "PCA plot",
                  'description': "Principal Component Analysis (PCA) is an informative approach to dimensionality reduction for exploring the relationship between sample conditions." },

                { 'file': "pcarepplot.R",
                  'title': "PCA plot including replicates",
                  'description': "Principal Component Analysis (PCA) is an informative approach to dimensionality reduction for exploring the relationship between sample conditions, including replicates." },

                { 'file': "mdsplot.R",
                  'title': "Multi-dimensional scaling plot",
                  'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset." },

                { 'file': "mdsrepplot.R",
                  'title': "Multi-dimensional scaling plot including replicates",
                  'description': "Multi-dimensional scaling plots including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions." }
            ]


        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token,
                cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir)
            if status is False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["file"])


        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        #TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        res = ws_client.save_objects({
            "workspace":params['workspace'],
            "objects": [{
                "type":"KBaseRNASeq.cummerbund_output",
                "data":outputobject,
                "name":params["ws_cummerbund_output"]}]
            })

        infile = join(cuffdiff_dir, "gene_exp.diff")
        outfile = join(cuffdiff_dir, "gene_exp_diff.out")
        x = v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
        with open(outfile) as f:
            statdata = json.load(f)
        res = ws_client.save_objects({
            "workspace":params['workspace'],
            "objects": [{
                "type":"KBaseRNASeq.DifferentialExpressionStat",
                "data":statdata,
                "name":params["ws_diffstat_output"]}]
            })

        #END generate_cummerbund_plot2

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plot2 return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
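A minimal sketch of the save-and-reference pattern used throughout these methods, assuming a biokbase.workspace.client.Workspace client and the standard object-info tuple (object id at index 0, version at index 4, workspace id at index 6); the helper name is hypothetical:

def save_and_get_ref(ws_client, workspace, obj_type, data, name):
    # Save one typed object and return its permanent 'wsid/objid/version' reference.
    info = ws_client.save_objects({
        'workspace': workspace,
        'objects': [{'type': obj_type, 'data': data, 'name': name}]
    })[0]
    return '%s/%s/%s' % (info[6], info[0], info[4])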
    def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams):
        """
        :param heatmapParams: instance of type "heatmapParams" -> structure:
           parameter "workspace" of String, parameter "sample1" of String,
           parameter "sample2" of String, parameter "q_value_cutoff" of
           Double, parameter "log2_fold_change_cutoff" of Double, parameter
           "num_genes" of Long, parameter "ws_cuffdiff_id" of type
           "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_expression_matrix_id" of type "ws_expression_matrix_id" (@id
           ws KBaseFeatureValues.ExpressionMatrix), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        :returns: instance of type "ResultsToReport" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_interactive_heatmap_de_genes_old
        fparams = heatmapParams
        returnVal = None  # default return if no data is found
        #Set up workspace client
        user_token = ctx['token']
        workspace = fparams['workspace']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        system_params = {}
        system_params['token'] = user_token
        system_params['ws_url'] = self.__WS_URL
        system_params['logger'] = self.__LOGGER
        system_params['shock_url'] = self.__SHOCK_URL
        system_params['hs_url'] = self.__HS_URL
        system_params['scratch'] = self.__SCRATCH
        system_params['rscripts'] = self.__RSCRIPTS
        system_params['workspace'] = workspace

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': fparams['ws_cuffdiff_id'],
            'workspace': fparams['workspace']
        }])

        #Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal
        cuffdiff_dir = join (self.__SCRATCH , "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        #cuffdiff_dir = "/kb/module/work/cuffdiffData/cuffdiff"
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)


        #if (cuffdiff_dir is False):
        #    return returnVal
        fparams['cuffdiff_dir'] = cuffdiff_dir
        fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff")
        fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter")

        fparams['pairs'] = 1
        fparams['logModetmp'] = 2

        rparams = {}
        rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
        rparams['outpng'] = join(system_params['scratch'], "heatmap.png")
        rparams['imageheight'] = 1600
        rparams['imagewidth'] = 800
        rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R")
        rparams['include_replicates'] = 1
        rparams['pairs'] = fparams['pairs']
        rparams['logMode'] = fparams['logModetmp']
        rparams['removezeroes'] = 1
        rparams['outmatrix'] = join(system_params['scratch'], "outmatrix")
        reportObj = {}

        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [workspace + '/' + fparams['ws_cuffdiff_id']]

        report = ""
        if (fparams['pairs'] != 0):
            try:
                filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
                self.__LOGGER.info("matrix is " + filtered_matrix)
                fparams['infile'] = join(system_params['scratch'], "gene_exp.diff.filter")
                fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist")
                genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)
                rparams['genelist'] = filtered_matrix
            except:
                report += "There was an error in creating the expression matrix. "
                report += "No differentially expressed genes were found. "
                report += "Please change or double-check your filtering criteria."

                reportObj = {
                    'objects_created': [],
                    'text_message': report
                }

                reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
                report_info = ws_client.save_objects({
                    'workspace': fparams['workspace'],
                    'objects': [
                        {
                            'type': 'KBaseReport.Report',
                            'data': reportObj,
                            'name': reportName,
                            'meta': {},
                            'hidden': 1,  # important!  make sure the report is hidden
                            'provenance': provenance
                        }
                    ]})[0]
                print('saved Report: ' + pformat(report_info))

                returnVal = {"report_name": reportName,
                             "report_ref": str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4])}

                return [returnVal]


        try:
            # Prepare output object.
            outjson = False

            roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams)

            # Run R script to generate the cummerbund json and update the cummerbund output json file
            outputobject = dict()

            # Prepare output plot list
            cummerbundplotset = []

            # List of plots to generate
            plotlist = [
                {'roptstr': roptstr_basic_heatmap_rep,
                 'title': "Heatmap",
                 'description': "Heatmap",
                 'exp': fparams['ws_expression_matrix_id']
                 }
            ]
            fparams['cummerbundplotset'] = cummerbundplotset
            # Iterate through the plotlist and generate the images and json files.
            for plot in plotlist:
                fparams['title'] = plot['title']
                fparams['description'] = plot['description']

                status = script_util2.rplotanduploadinteractive(system_params, fparams, rparams, plot['roptstr'])
                if status is False:
                    self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"])
                    report = "Error: Please select different cutoff criteria. None of the genes passed the fold-change and q-value cutoffs. "
                    report += "Failed to create an expression matrix with differentially expressed genes (" + fparams['ws_expression_matrix_id'] + "). No genes to show on the heatmap."
                    reportObj = {
                        'objects_created': [],
                        'text_message': report
                    }
                    reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
                    report_info = ws_client.save_objects({
                        'workspace': fparams['workspace'],
                        'objects': [
                            {
                                'type': 'KBaseReport.Report',
                                'data': reportObj,
                                'name': reportName,
                                'meta': {},
                                'hidden': 1,  # important!  make sure the report is hidden
                                'provenance': provenance
                            }
                        ]})[0]
                    print('saved Report: ' + pformat(report_info))

                    returnVal = {"report_name": reportName,
                                 "report_ref": str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4])}

                    return [returnVal]

                else:
                    self.__LOGGER.info(status)

                    outjson = status
                    with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2:
                        eo2 = json.load(et2)
                        genome_ref = s_res[0]['data']['genome_id']
                        eo2['type'] = 'log2_level'
                        eo2['genome_ref'] = genome_ref
                        self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp'])
                        try:
                            res = ws_client.save_objects({
                                'workspace': workspace,
                                'objects': [{
                                    'type': 'KBaseFeatureValues.ExpressionMatrix',
                                    'data': eo2,
                                    'name': plot['exp']
                                }]})
                        except:
                            self.__LOGGER.error("Failed to save the expression matrix object")

        except:
            self.__LOGGER.error("Unexpected error while generating the interactive heatmap")
        report = "Successfully created expression matrix"
        reportObj = {
            'objects_created': [],
            'text_message': report
        }

        reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
        report_info = ws_client.save_objects({
            'workspace': fparams['workspace'],
            'objects': [
                {
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1,  # important!  make sure the report is hidden
                    'provenance': provenance
                }
            ]})[0]
        print('saved Report: ' + pformat(report_info))

        returnVal = {"report_name": reportName,
                     "report_ref": str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4])}


        #END create_interactive_heatmap_de_genes_old

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method create_interactive_heatmap_de_genes_old return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
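The failure branches above repeat the same report-saving boilerplate. A minimal sketch of a hypothetical helper that factors it out, assuming the same KBaseReport.Report conventions and the uuid/pformat imports used above:

import uuid
from pprint import pformat

def save_hidden_report(ws_client, workspace, provenance, message):
    # Build a minimal KBaseReport.Report, save it hidden, and return the
    # (report_name, report_ref) pair the callers package into returnVal.
    report_obj = {'objects_created': [], 'text_message': message}
    report_name = 'report_' + str(hex(uuid.getnode()))
    info = ws_client.save_objects({
        'workspace': workspace,
        'objects': [{
            'type': 'KBaseReport.Report',
            'data': report_obj,
            'name': report_name,
            'meta': {},
            'hidden': 1,  # keep the report out of the data panel
            'provenance': provenance
        }]
    })[0]
    print('saved Report: ' + pformat(info))
    return report_name, '%s/%s/%s' % (info[6], info[0], info[4])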
    def create_interactive_heatmap_de_genes(self, ctx,
                                            interactiveHeatmapParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_interactive_heatmap_de_genes
        fparams = interactiveHeatmapParams
        returnVal = None  # default return if no data is found
        #Set up workspace client
        user_token = ctx['token']
        workspace = fparams['workspace_name']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        system_params = {}
        system_params['token'] = user_token
        system_params['ws_url'] = self.__WS_URL
        system_params['logger'] = self.__LOGGER
        system_params['shock_url'] = self.__SHOCK_URL
        system_params['hs_url'] = self.__HS_URL
        system_params['scratch'] = self.__SCRATCH
        system_params['rscripts'] = self.__RSCRIPTS
        system_params['workspace'] = workspace

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': fparams['ws_cuffdiff_id'],
            'workspace': fparams['workspace_name']
        }])

        #Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal
        cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data(
            self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        #cuffdiff_dir = "/kb/module/work/nnc/cuffdiff"
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        #if (cuffdiff_dir is False):
        #    return returnVal
        fparams['cuffdiff_dir'] = cuffdiff_dir
        fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff")
        fparams['outfile'] = join(system_params['scratch'],
                                  "gene_exp.diff.filter")

        filtered_matrix = script_util2.filter_expression_matrix(
            fparams, system_params)
        self.__LOGGER.info("matrix is " + filtered_matrix)

        fparams['infile'] = join(system_params['scratch'],
                                 "gene_exp.diff.filter")
        fparams['outfile'] = join(system_params['scratch'],
                                  "gene_exp.diff.filter.genelist")

        genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(
            fparams)

        # Prepare output object.
        outjson = False

        rparams = {}
        rparams['genelist'] = filtered_matrix
        rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
        rparams['outpng'] = join(system_params['scratch'], "heatmap.png")
        rparams['imageheight'] = 1600
        rparams['imagewidth'] = 800
        rparams['plotscript'] = join(system_params['rscripts'],
                                     "heatmapplotinteractive.R")
        rparams['include_replicates'] = 1
        rparams['outmatrix'] = join(system_params['scratch'], "outmatrix")

        roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(
            rparams)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []

        # List of plots to generate
        plotlist = [{
            'roptstr': roptstr_basic_heatmap_rep,
            'title': "Heatmap",
            'description': "Heatmap",
            'exp': fparams['ws_expression_matrix_id']
        }]
        fparams['cummerbundplotset'] = cummerbundplotset
        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            fparams['title'] = plot['title']
            fparams['description'] = plot['description']

            status = script_util2.rplotanduploadinteractive(
                system_params, fparams, rparams, plot['roptstr'])
            if status is False:
                self.__LOGGER.info(
                    "Problem generating image and json file - " +
                    plot["roptstr"])
            else:
                self.__LOGGER.info(status)

                outjson = status
                with open("{0}/{1}".format(self.__SCRATCH, outjson),
                          'r') as et2:
                    eo2 = json.load(et2)
                    genome_ref = s_res[0]['data']['genome_id']
                    eo2['type'] = 'untransformed'
                    #eo2['genome_ref'] = genome_ref
                    self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                                       plot['exp'])
                    ws_client.save_objects({
                        'workspace':
                        workspace,
                        'objects': [{
                            'type': 'KBaseFeatureValues.ExpressionMatrix',
                            'data': eo2,
                            'name': plot['exp']
                        }]
                    })

        returnVal = fparams['ws_expression_matrix_id']

        #END create_interactive_heatmap_de_genes

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError(
                'Method create_interactive_heatmap_de_genes return value ' +
                'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
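The load-tag-save step above (read the JSON the R script wrote, set the matrix type, save as KBaseFeatureValues.ExpressionMatrix) recurs in several methods. A minimal sketch of that pattern as a hypothetical standalone helper:

import json
from os.path import join

def save_expression_matrix(ws_client, workspace, scratch, outjson, name,
                           matrix_type='untransformed', genome_ref=None):
    # Load the matrix JSON produced by the R script, tag it, and save it
    # as a KBaseFeatureValues.ExpressionMatrix object.
    with open(join(scratch, outjson), 'r') as fh:
        matrix = json.load(fh)
    matrix['type'] = matrix_type
    if genome_ref is not None:
        matrix['genome_ref'] = genome_ref
    ws_client.save_objects({
        'workspace': workspace,
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': matrix,
            'name': name
        }]
    })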
    def generate_cummerbund_plots(self, ctx, cummerbundParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plots

        params = cummerbundParams
        returnVal = params['ws_cummerbund_output']

        #Set up workspace client
        user_token = ctx['token']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': params['ws_cuffdiff_id'],
            'workspace': params['workspace_name']
        }])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        cuffdiff_dir = script_util2.extract_cuffdiff_data(
            self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if (cuffdiff_dir is False):
            return returnVal

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []

        # List of plots to generate
        plotlist = [
            {'file': "dispersionplot.R",
             'title': "Dispersion plot",
             'description': "The dispersion plot is a quality measure of the data; it plots the estimated deviation from the threshold against counts in FPKM."},
            {'file': "fpkmscvplot.R",
             'title': "Genes CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},
            {'file': "isoformscvplot.R",
             'title': "Isoform CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data. Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate FPKM estimates."},
            {'file': "densityplot.R",
             'title': "Density plot",
             'description': "The density plot shows the distribution of FPKM scores across samples."},
            {'file': "csdensityrepplot.R",
             'title': "Replicates density plot",
             'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates."},
            {'file': "boxplot.R",
             'title': "Box plots",
             'description': "The box plots show the FPKM distribution across samples."},
            {'file': "boxrepplot.R",
             'title': "Box plots of replicates",
             'description': "The box plots of replicates show the FPKM distribution across sample replicates."},
            {'file': "pairwisescatterplots.R",
             'title': "Pairwise scatter plots",
             'description': "The scatter plots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},
            {'file': "volcanomatrixplot.R",
             'title': "Volcano matrix plots",
             'description': "The volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on a log2 fold-change cutoff."},
            {'file': "pcaplot.R",
             'title': "PCA plot",
             'description': "Principal Component Analysis (PCA) is an informative approach to dimensionality reduction for exploring the relationship between sample conditions."},
            {'file': "pcarepplot.R",
             'title': "PCA plot including replicates",
             'description': "Principal Component Analysis (PCA) is an informative approach to dimensionality reduction for exploring the relationship between sample conditions, including replicates."},
            {'file': "mdsplot.R",
             'title': "Multi-dimensional scaling plot",
             'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset."},
            {'file': "mdsrepplot.R",
             'title': "Multi-dimensional scaling plot including replicates",
             'description': "Multi-dimensional scaling plots including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
        ]

        #TODO.. Giving Rplot.pdf
        #                { 'file': "dendrogramplot.R",
        #                  'title': "Dendrogram",
        #                  'description': "Dendrogram  based on the JS (Jensen-Shannon divergence) distance" },
        #
        #                { 'file': "dendrogramrepplot.R",
        #                  'title': "Dendrogram including replicates",
        #                  'description': "Dendrogram including replicates based on the JS (Jensen-Shannon divergence) distance" },

        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            status = script_util2.rplotandupload(
                self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'],
                self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset,
                plot['title'], plot['description'], cuffdiff_dir)
            if status is False:
                self.__LOGGER.info(
                    "Problem generating image and json file - " + plot["file"])

        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        #TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        res = ws_client.save_objects({
            "workspace":
            params['workspace_name'],
            "objects": [{
                "type": "KBaseRNASeq.cummerbund_output",
                "data": outputobject,
                "name": params["ws_cummerbund_output"]
            }]
        })

        #END generate_cummerbund_plots

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plots return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
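The loop above only logs each failed plot script. A minimal sketch of a variant that also collects the failures for the caller, assuming the same script_util2.rplotandupload signature used above (the helper name is hypothetical):

import script_util2

def run_plots(logger, scratch, rscripts, shock_url, hs_url, token,
              cummerbundplotset, cuffdiff_dir, plotlist):
    # Run each R plot script; return the list of scripts that failed so the
    # caller can surface them in a report instead of only logging them.
    failed = []
    for plot in plotlist:
        status = script_util2.rplotandupload(
            logger, scratch, rscripts, plot['file'], shock_url, hs_url,
            token, cummerbundplotset, plot['title'], plot['description'],
            cuffdiff_dir)
        if status is False:
            failed.append(plot['file'])
    return failed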
    def create_expression_matrix(self, ctx, expressionMatrixParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_expression_matrix

        params = expressionMatrixParams
        returnVal = params['ws_expression_matrix_id']
        #Set up workspace client
        user_token = ctx['token']
        workspace = params['workspace_name']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': params['ws_cuffdiff_id'],
            'workspace': params['workspace_name']
        }])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data(
            self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if (cuffdiff_dir is False):
            return returnVal

        # Run R script to get fpkmgenematrix.R

        # Prepare output object.
        outjson = False
        #outjson = "repfpkmgenematrix.R.matrix.txt.json";

        if params['include_replicates'] == 0:
            scriptfile = "fpkmgenematrix.R"
            outjson = script_util2.generate_and_upload_expression_matrix(
                self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
                self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
                self.__WS_URL, workspace)

        else:
            scriptfile = "repfpkmgenematrix.R"
            outjson = script_util2.generate_and_upload_expression_matrix(
                self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
                self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
                self.__WS_URL, workspace)

        if outjson is False:
            self.__LOGGER.info("Creation of expression matrix failed")
            return returnVal
        with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et:
            eo = json.load(et)
        eo['type'] = 'untransformed'
        genome_ref = s_res[0]['data']['genome_id']
        #eo['genome_ref'] = genome_ref

        self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                           params['ws_expression_matrix_id'])
        ws_client.save_objects({
            'workspace':
            workspace,
            'objects': [{
                'type': 'KBaseFeatureValues.ExpressionMatrix',
                'data': eo,
                'name': params['ws_expression_matrix_id']
            }]
        })

        #END create_expression_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method create_expression_matrix return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
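The two branches above differ only in the R script name. A sketch of the same choice as a drop-in conditional expression, assuming the include_replicates semantics from the type spec (false <= 0, true >= 1):

scriptfile = ("fpkmgenematrix.R" if params['include_replicates'] == 0
              else "repfpkmgenematrix.R")
outjson = script_util2.generate_and_upload_expression_matrix(
    self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
    self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
    self.__WS_URL, workspace)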
    def load_new_genome_data(self, ctx, params):
        """
        :param params: instance of type "LoadNewGenomeDataParams" ->
           structure: parameter "genome_ref" of String
        :returns: instance of type "GenomeData" (scientific_name - scientific
           name of the organism. taxonomy_id - NCBI taxonomic id of the
           organism. kingdom - taxonomic kingdom of the organism.
           scientific_lineage - scientific lineage of the organism.
           genetic_code - scientific name of the organism. organism_aliases -
           aliases for the organism associated with this GenomeAnnotation.
           assembly_source - source organization for the Assembly.
           assembly_source_id - identifier for the Assembly used by the
           source organization. assembly_source_date - date of origin the
           source indicates for the Assembly. gc_content - GC content for the
           entire Assembly. dna_size - total DNA size for the Assembly.
           num_contigs - number of contigs in the Assembly. contig_ids -
           contig identifier strings for the Assembly. external_source - name
           of the external source. external_source_date - date of origin the
           external source indicates for this GenomeAnnotation. release -
           release version for this GenomeAnnotation data.
           original_source_filename - name of the file used to generate this
           GenomeAnnotation. feature_type_counts - number of features of each
           type.) -> structure: parameter "scientific_name" of String,
           parameter "taxonomy_id" of Long, parameter "kingdom" of String,
           parameter "scientific_lineage" of list of String, parameter
           "genetic_code" of Long, parameter "organism_aliases" of list of
           String, parameter "assembly_source" of String, parameter
           "assembly_source_id" of String, parameter "assembly_source_date"
           of String, parameter "gc_content" of Double, parameter "dna_size"
           of Long, parameter "num_contigs" of Long, parameter "contig_ids"
           of list of String, parameter "external_source" of String,
           parameter "external_source_date" of String, parameter "release" of
           String, parameter "original_source_filename" of String, parameter
           "feature_type_counts" of mapping from String to Long, parameter
           "features" of list of type "FeatureData" (feature_id - identifier
           for this feature feature_type - the Feature type e.g., "mRNA",
           "CDS", "gene", ... feature_function - the functional annotation
           description feature_aliases - dictionary of Alias string to List
           of source string identifiers feature_dna_sequence_length - integer
           representing the length of the DNA sequence for convenience
           feature_dna_sequence - string containing the DNA sequence of the
           Feature feature_md5 - string containing the MD5 of the sequence,
           calculated from the uppercase string feature_locations - list of
           Feature regions, where the Feature bounds are calculated as
           follows: - For "+" strand, [start, start + length) - For "-"
           strand, (start - length, start] feature_publications - ist of any
           known publications related to this Feature
           feature_quality_warnings - list of strings indicating known data
           quality issues (note: not used for Genome type, but is used for
           GenomeAnnotation) feature_quality_score - quality value with
           unknown algorithm for Genomes, not calculated yet for
           GenomeAnnotations. feature_notes - notes recorded about this
           Feature feature_inference - inference information) -> structure:
           parameter "feature_id" of String, parameter "feature_type" of
           String, parameter "feature_function" of String, parameter
           "feature_aliases" of mapping from String to list of String,
           parameter "feature_dna_sequence_length" of Long, parameter
           "feature_dna_sequence" of String, parameter "feature_md5" of
           String, parameter "feature_locations" of list of type "Region"
           (contig_id - the identifier for the contig to which this region
           corresponds. strand - either a "+" or a "-", for the strand on
           which the region is located. start - starting position for this
           region. length - distance from the start position that bounds the
           end of the region.) -> structure: parameter "contig_id" of String,
           parameter "strand" of String, parameter "start" of Long, parameter
           "length" of Long, parameter "feature_publications" of list of
           String, parameter "feature_quality_warnings" of list of String,
           parameter "feature_quality_score" of list of String, parameter
           "feature_notes" of String, parameter "feature_inference" of
           String, parameter "protein" of type "ProteinData" (protein_id -
           protein identifier, which is feature ID plus ".protein"
           protein_amino_acid_sequence - amino acid sequence for this protein
           protein_function - function of protein protein_aliases - list of
           aliases for the protein protein_md5 - MD5 hash of the protein
           translation (uppercase)) -> structure: parameter "protein_id" of
           String, parameter "protein_amino_acid_sequence" of String,
           parameter "protein_function" of String, parameter
           "protein_aliases" of list of String, parameter "protein_md5" of
           String, parameter "protein_domain_locations" of list of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN load_new_genome_data
        genome_ref = params['genome_ref']
        ga = GenomeAnnotationAPI(self.services, ctx['token'], genome_ref)
        feature_types = ga.get_feature_types()
        feature_ids_by_type = ga.get_feature_ids({"type_list": feature_types})
        feature_ids = []
        feature_id_map = feature_ids_by_type['by_type']
        for feature_type in feature_id_map:
            feature_ids.extend(feature_id_map[feature_type])
        feature_map = ga.get_features(feature_ids)
        protein_map = ga.get_proteins()
        features = []
        proteins = []
        for feature_id in feature_map:
            feature = feature_map[feature_id]
            if feature_id in protein_map:
                protein = protein_map[feature_id]
                feature['protein'] = protein
                proteins.append(protein)
            features.append(feature)
        #genome_data = ga.get_summary()    # It returns None! Maybe something wasn't prepared at the end of upload from Genbank?
        # Temporarily load the genome summary directly from the Workspace (some fields are not present)
        ws = Workspace(url=self.workspaceURL)
        genome_data = ws.get_objects([{"ref": genome_ref}])[0]["data"]
        genome_data.pop('publications', None)
        genome_data.pop('feature_lookup', None)
        if 'scientific_name' not in genome_data and 'display_sc_name' in genome_data:
            genome_data['scientific_name'] = genome_data['display_sc_name']
        genome_data['features'] = features
        returnVal = genome_data
        #END load_new_genome_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method load_new_genome_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
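A minimal sketch of the GenomeData shape described by the docstring above; field names follow the type spec, and all values are illustrative placeholders:

genome_data_example = {
    'scientific_name': 'Example organism',  # illustrative placeholder
    'taxonomy_id': 12345,                   # illustrative placeholder
    'kingdom': 'Bacteria',
    'dna_size': 4600000,
    'num_contigs': 1,
    'feature_type_counts': {'gene': 4300, 'CDS': 4200},
    'features': [{
        'feature_id': 'gene_0001',          # hypothetical feature
        'feature_type': 'gene',
        'feature_function': 'hypothetical protein',
        'feature_locations': [{'contig_id': 'contig_1',
                               'strand': '+',
                               'start': 190,
                               'length': 66}],
    }],
}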
Example #29
0
def get_object(name):
    WS_URL = 'https://ci.kbase.us/services/ws/'
    from biokbase.workspace.client import Workspace
    ws = Workspace(WS_URL)
    return ws.get_objects(
        [dict(workspace=os.environ['KB_WORKSPACE_ID'], name=name)])[0]['data']
    def runAnnotate(self, job):

        ''' Run an annotate job to create a ProbAnno typed object.

            A ProbAnno typed object is created in four steps: (1) extract amino acid
            sequences from a Genome typed object to a fasta file, (2) run a BLAST search
            using the amino acid sequences against the subsystem BLAST database,
            (3) calculate annotation likelihood scores for each roleset implied by the
            functions of proteins in subsystems, and (4) save the likelihood scores
            to a ProbAnno typed object.

            The Job dictionary contains three main sections: (1) input parameters to
            the annotate() function, (2) context of server instance running the
            annotate() function, and (3) config variables of server.

            @param job Job dictionary created by server's annotate() function
            @return Nothing (although job is marked as complete)
        '''

        # The input parameters and user context for annotate() were stored in the job data for the job.
        input = job["input"]
        if input['verbose']:
            self.logger.set_log_level(log.DEBUG)
        self.ctx = job["context"]
        self.config = job['config']

        # Create a DataParser object for working with the static database files.
        self.dataParser = DataParser(self.config)

        status = None

        try:
            # Make sure the database files are available.
            self.dataParser.checkIfDatabaseFilesExist()

            # Make sure the job directory exists.
            workFolder = make_job_directory(self.config['work_folder_path'], job['id'])

            # Create a user and job state client and authenticate as the user.
            ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.ctx['token'])
    
            # Get the Genome object from the specified workspace.
            try:
                ujsClient.update_job_progress(job['id'], self.ctx['token'], 'getting genome object', 1, timestamp(3600))
            except:
                pass
            wsClient = Workspace(self.config["workspace_url"], token=self.ctx['token'])
            genomeObjectId = make_object_identity(input["genome_workspace"], input["genome"])
            objectList = wsClient.get_objects( [ genomeObjectId ] )
            genomeObject = objectList[0]
            
            # Convert Genome object to fasta file.
            try:
                ujsClient.update_job_progress(job['id'], self.ctx['token'], 'converting Genome object to fasta file', 1, timestamp(3600))
            except:
                pass
            fastaFile = self._genomeToFasta(input, genomeObject, workFolder)
            
            # Run blast using the fasta file.
            try:
                ujsClient.update_job_progress(job['id'], self.ctx['token'], 'running blast', 1, timestamp(3600))
            except:
                pass
            blastResultFile = self._runBlast(input, fastaFile, workFolder)
            
            # Calculate roleset probabilities.
            try:
                ujsClient.update_job_progress(job['id'], self.ctx['token'], 'calculating roleset probabilities', 1, timestamp(300))
            except:
                pass
            rolestringTuples = self._rolesetProbabilitiesMarble(input, blastResultFile, workFolder)
            
            # Build ProbAnno object and store in the specified workspace.
            try:
                ujsClient.update_job_progress(job['id'], self.ctx['token'], 'building ProbAnno object', 1, timestamp(120))
            except:
                pass
            output = self._buildProbAnnoObject(input, genomeObject, blastResultFile, rolestringTuples, workFolder, wsClient)

            # Mark the job as done.
            status = "done"
            tb = None
            self._log(log.INFO, 'Job '+job['id']+' finished for genome '+input['genome']+' to probanno '+input['probanno'])

        except:
            tb = traceback.format_exc()
            sys.stderr.write('\n'+tb)
            status = "failed"
            self._log(log.ERR, 'Job '+job['id']+' failed for genome '+input['genome']+' to probanno '+input['probanno'])
        
        # Mark the job as complete with the given status.
        ujsClient.complete_job(job['id'], self.ctx['token'], status, tb, { })

        # Remove the temporary work directory.
        if self.logger.get_log_level() < log.DEBUG2 and status == 'done':
            try:
                shutil.rmtree(workFolder)
            except OSError:
                # For some reason deleting the directory was failing in production. Rather than have all
                # jobs look like they failed, catch and log the exception here (the user still gets the
                # same result if the directory remains intact).
                msg = 'Unable to delete temporary directory %s\n' % (workFolder)
                sys.stderr.write('WARNING: '+msg)
                self._log(log.WARNING, msg)

        return
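Each stage of runAnnotate wraps update_job_progress in its own try/except because progress reporting is best-effort. A minimal sketch of a hypothetical helper that centralizes the pattern, assuming the UserAndJobState client and the timestamp helper used above:

def try_update_progress(ujs_client, job_id, token, status, est_seconds):
    # Best-effort progress update: a reporting failure must never
    # abort the annotation job itself.
    try:
        ujs_client.update_job_progress(job_id, token, status, 1,
                                       timestamp(est_seconds))
    except Exception:
        pass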
Example #31
0
def run_filter_genes(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        param_file: parameter file
        object_name: Name of the object in the workspace 
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """

    # Create the working directories if they do not already exist.
    for directory in (RAWEXPR_DIR, FLTRD_DIR, FINAL_DIR):
        try:
            os.makedirs(directory)
        except OSError:
            pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    cmd_download_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s',
        "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x',
        "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y'
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))

    if 'num_features' not in param and 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(
            fl)  # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'], '--working_directory',
        FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name',
        FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new(
            {"objects": [{
                'ref': expr['genome_ref']
            }]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws,
                                              obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' in expr:
        expr['description'] = "{0}, coex_filter by {1}".format(
            expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet
    fs = {
        'description': 'Differentially expressed genes generated by {0}'.format(
            " ".join(cmd_coex_filter)),
        'elements': {}
    }

    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
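The run-and-log subprocess pattern above appears three times in this wrapper. A minimal sketch of a hypothetical helper, assuming the same logger conventions (shell=True is kept for the Java tools that read the KBase token from the environment):

import subprocess

def run_and_log(cmd, logger, shell=False):
    # Run a command, capture stderr, and log whatever the tool produced.
    proc = subprocess.Popen(" ".join(cmd) if shell else cmd,
                            stderr=subprocess.PIPE, shell=shell)
    stdout, stderr = proc.communicate()
    if stdout:
        logger.info(stdout)
    if stderr:
        logger.info(stderr)
    return proc.returncode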
    def create_expression_matrix(self, ctx, expressionMatrixParams):
        """
        :param expressionMatrixParams: instance of type
           "expressionMatrixParams" -> structure: parameter "workspace_name"
           of type "workspace_name" (workspace name of the object), parameter
           "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_expression_matrix_id" of type "ws_expression_matrix_id" (@id
           ws KBaseFeatureValues.ExpressionMatrix), parameter
           "include_replicates" of type "bool" (indicates true or false
           values, false <= 0, true >=1)
        :returns: instance of type "ws_expression_matrix_id" (@id ws
           KBaseFeatureValues.ExpressionMatrix)
        """
        # ctx is the context object
        # return variables are: returnVal
        # BEGIN create_expression_matrix

        params = expressionMatrixParams
        returnVal = params["ws_expression_matrix_id"]
        # Set up workspace client
        user_token = ctx["token"]
        workspace = params["workspace_name"]
        ws_client = Workspace(url=self.__WS_URL, token=user_token)

        # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{"name": params["ws_cuffdiff_id"], "workspace": params["workspace_name"]}])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data(
            self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token
        )
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if cuffdiff_dir is False:
            return returnVal

        # Run R script to get fpkmgenematrix.R

        # Prepare output object.
        outjson = False
        # outjson = "repfpkmgenematrix.R.matrix.txt.json";

        if params["include_replicates"] == 0:
            scriptfile = "fpkmgenematrix.R"
            outjson = script_util2.generate_and_upload_expression_matrix(
                self.__LOGGER,
                self.__SCRATCH,
                self.__RSCRIPTS,
                scriptfile,
                self.__SHOCK_URL,
                self.__HS_URL,
                user_token,
                cuffdiff_dir,
                self.__WS_URL,
                workspace,
            )

        else:
            scriptfile = "repfpkmgenematrix.R"
            outjson = script_util2.generate_and_upload_expression_matrix(
                self.__LOGGER,
                self.__SCRATCH,
                self.__RSCRIPTS,
                scriptfile,
                self.__SHOCK_URL,
                self.__HS_URL,
                user_token,
                cuffdiff_dir,
                self.__WS_URL,
                workspace,
            )

        if outjson is False:
            self.__LOGGER.info("Creation of expression matrix failed")
            return returnVal
        with open("{0}/{1}".format(self.__SCRATCH, outjson), "r") as et:
            eo = json.load(et)
        eo["type"] = "untransformed"
        genome_ref = s_res[0]["data"]["genome_id"]
        eo["genome_ref"] = genome_ref

        self.__LOGGER.info(workspace + self.__SCRATCH + outjson + params["ws_expression_matrix_id"])
        ws_client.save_objects(
            {
                "workspace": workspace,
                "objects": [
                    {
                        "type": "KBaseFeatureValues.ExpressionMatrix",
                        "data": eo,
                        "name": params["ws_expression_matrix_id"],
                    }
                ],
            }
        )

        # END create_expression_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError(
                "Method create_expression_matrix return value " + "returnVal is not type basestring as required."
            )
        # return the results
        return [returnVal]
Example #33
0
    def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams):
        """
        :param heatmapParams: instance of type "heatmapParams" -> structure:
           parameter "workspace" of String, parameter "sample1" of String,
           parameter "sample2" of String, parameter "q_value_cutoff" of
           Double, parameter "log2_fold_change_cutoff" of Double, parameter
           "num_genes" of Long, parameter "ws_cuffdiff_id" of type
           "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_expression_matrix_id" of type "ws_expression_matrix_id" (@id
           ws KBaseFeatureValues.ExpressionMatrix), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        :returns: instance of type "ResultsToReport" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_interactive_heatmap_de_genes_old
        fparams = heatmapParams
        returnVal = "ttt"
        # Set up workspace client
        user_token = ctx['token']
        workspace = fparams['workspace']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        system_params = {}
        system_params['token'] = user_token
        system_params['ws_url'] = self.__WS_URL
        system_params['logger'] = self.__LOGGER
        system_params['shock_url'] = self.__SHOCK_URL
        system_params['hs_url'] = self.__HS_URL
        system_params['scratch'] = self.__SCRATCH
        system_params['rscripts'] = self.__RSCRIPTS
        system_params['workspace'] = workspace

        # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': fparams['ws_cuffdiff_id'],
            'workspace': fparams['workspace']
        }])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal
        cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER, self.__SHOCK_URL,
                                                          self.__SCRATCH, s_res, user_token)
        # cuffdiff_dir = "/kb/module/work/cuffdiffData/cuffdiff"
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        # if (cuffdiff_dir is False):
        #    return returnVal
        fparams['cuffdiff_dir'] = cuffdiff_dir
        fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff")
        fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter")

        fparams['pairs'] = 1
        fparams['logModetmp'] = 2

        rparams = {}

        rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
        rparams['outpng'] = join(system_params['scratch'], "heatmap.png")
        rparams['imageheight'] = 1600
        rparams['imagewidth'] = 800
        rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R")
        rparams['include_replicates'] = 1
        rparams['pairs'] = fparams['pairs']
        rparams['logMode'] = fparams['logModetmp']
        rparams['removezeroes'] = 1
        rparams['outmatrix'] = join(system_params['scratch'], "outmatrix")
        reportObj = {}

        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [workspace + '/' + fparams['ws_cuffdiff_id']]

        report = ""
        if (fparams['pairs'] != 0):

            try:
                filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
                self.__LOGGER.info("matrix is " + filtered_matrix)
                fparams['infile'] = join(system_params['scratch'], "gene_exp.diff.filter")
                fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist")
                genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)
                rparams['genelist'] = filtered_matrix
            except:
                report += "There was an error in creating the expression matrix. "
                report += "No differentially expressed genes were found. "
                report += "Please change or double-check your filtering criteria."

                reportObj = {
                    'objects_created': [],
                    'text_message': report
                }

                reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
                report_info = ws_client.save_objects({
                    'workspace': fparams['workspace'],
                    'objects': [
                        {
                            'type': 'KBaseReport.Report',
                            'data': reportObj,
                            'name': reportName,
                            'meta': {},
                            'hidden': 1,  # important!  make sure the report is hidden
                            'provenance': provenance
                        }
                    ]})[0]
                print('saved Report: ' + pformat(report_info))

                returnVal = {"report_name": reportName,
                             "report_ref": str(report_info[6]) + '/' + str(
                                 report_info[0]) + '/' + str(report_info[4])}

                return [returnVal]

        try:
            # Prepare output object.
            outjson = False

            roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams)

            # Run R script to run cummerbund json and update the cummerbund output json file
            # Prepare output object.
            outputobject = dict()

            # Prepare output plot list
            cummerbundplotset = []

            # List of plots to generate
            plotlist = [

                {'roptstr': roptstr_basic_heatmap_rep,
                 'title': "Heatmap",
                 'description': "Heatmap",
                 'exp': fparams['ws_expression_matrix_id']
                 }

            ]
            fparams['cummerbundplotset'] = cummerbundplotset
            # Iterate through the plotlist and generate the images and json files.
            for plot in plotlist:
                fparams['title'] = plot['title']
                fparams['description'] = plot['description']

                status = script_util2.rplotanduploadinteractive(system_params, fparams, rparams,
                                                                plot['roptstr'])
                if status == False:
                    self.__LOGGER.info(
                        "Problem generating image and json file - " + plot["roptstr"])
                    report = "Error: Please select a different cutoff criteria. None of the genes passed fold change and q-value-cutoff. "
                    report += "Failed to create expression  matrix with differentially expressed genes(" + \
                              fparams['ws_expression_matrix_id'] + "). No genes to show on heatmap."
                    reportObj = {
                        'objects_created': [],
                        'text_message': report
                    }
                    reportName = 'create_interactive_heatmap_de_genes_old_' + str(
                        hex(uuid.getnode()))
                    report_info = ws_client.save_objects({
                        'workspace': fparams['workspace'],
                        'objects': [
                            {
                                'type': 'KBaseReport.Report',
                                'data': reportObj,
                                'name': reportName,
                                'meta': {},
                                'hidden': 1,  # important!  make sure the report is hidden
                                'provenance': provenance
                            }
                        ]})[0]
                    print('saved Report: ' + pformat(report_info))

                    returnVal = {"report_name": reportName,
                                 "report_ref": str(report_info[6]) + '/' + str(
                                     report_info[0]) + '/' + str(report_info[4])}

                    return [returnVal]


                else:

                    self.__LOGGER.info(status)

                    outjson = status
                    self.__LOGGER.info("Loading expression matrix JSON file: " + outjson)
                    with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2:
                        eo2 = json.load(et2)
                        genome_ref = s_res[0]['data']['genome_id']
                        eo2['type'] = 'log2_level'
                        eo2['genome_ref'] = genome_ref
                        self.__LOGGER.info("Saving expression matrix {0}/{1}".format(
                            workspace, plot['exp']))
                        try:
                            res = ws_client.save_objects({'workspace': workspace,
                                                          'objects': [{
                                                              'type': 'KBaseFeatureValues.ExpressionMatrix',
                                                              'data': eo2,
                                                              'name': plot['exp']
                                                          }]})
                        except:
                            self.__LOGGER.error("Failed to save the expression matrix object")

        except:
            self.__LOGGER.error("Failed to generate the interactive heatmap plots")
        report = "Successfully created expression matrix"
        reportObj = {
            'objects_created': [],
            'text_message': report
        }

        self.__LOGGER.info("Saving report object")

        reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
        report_info = ws_client.save_objects({
            'workspace': fparams['workspace'],
            'objects': [
                {
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1,  # important!  make sure the report is hidden
                    'provenance': provenance
                }
            ]})[0]
        print('saved Report: ' + pformat(report_info))

        returnVal = {"report_name": reportName,
                     "report_ref": str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(
                         report_info[4])}

        #END create_interactive_heatmap_de_genes_old

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method create_interactive_heatmap_de_genes_old return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
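    # Hypothetical invocation sketch (not from the source): the KBase SDK server
    # supplies the context, e.g.
    #   ctx = {'token': auth_token, 'provenance': [{}]}
    #   [ret] = impl.create_interactive_heatmap_de_genes_old(ctx, heatmapParams)
    #   # ret == {'report_name': ..., 'report_ref': 'ws_id/obj_id/version'}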
    def create_expression_matrix(self, ctx, expressionMatrixParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_expression_matrix


        params    = expressionMatrixParams
        returnVal = params['ws_expression_matrix_id']
        #Set up workspace client
        user_token = ctx['token']
        workspace = params['workspace_name']
        ws_client  = Workspace(url=self.__WS_URL, token=user_token)

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name' : params['ws_cuffdiff_id'],
            'workspace' : params['workspace_name']
            }])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
        cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if (cuffdiff_dir is False):
            return returnVal

        # Run R script to get fpkmgenematrix.R

        # Prepare output object.
        outjson = False
        # outjson = "repfpkmgenematrix.R.matrix.txt.json"

        if params['include_replicates'] == 0:
            scriptfile = "fpkmgenematrix.R"
        else:
            scriptfile = "repfpkmgenematrix.R"
        outjson = script_util2.generate_and_upload_expression_matrix(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
            self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
            self.__WS_URL, workspace)

        if outjson is False:
            self.__LOGGER.info("Creation of expression matrix failed")
            return returnVal
        with open("{0}/{1}".format(self.__SCRATCH , outjson),'r') as et:
                  eo = json.load(et)
        eo['type']='untransformed'
        genome_ref = s_res[0]['data']['genome_id']
        #eo['genome_ref'] = genome_ref

        self.__LOGGER.info(workspace + self.__SCRATCH + outjson + params['ws_expression_matrix_id'])
        ws_client.save_objects({'workspace' : workspace,
            'objects' : [{ 'type' : 'KBaseFeatureValues.ExpressionMatrix',
                           'data' : eo,
                           'name' : params['ws_expression_matrix_id']
                        }]})


        #END create_expression_matrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method create_expression_matrix return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
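    # Hypothetical usage sketch for create_expression_matrix; object names are
    # illustrative, not from the source:
    #   expressionMatrixParams = {
    #       'workspace_name': 'my_workspace',
    #       'ws_cuffdiff_id': 'my_cuffdiff_output',
    #       'ws_expression_matrix_id': 'my_expression_matrix',
    #       'include_replicates': 1,
    #   }
    #   [matrix_name] = impl.create_expression_matrix(ctx, expressionMatrixParams)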
Example #35
def run_coex_cluster(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """
    Narrative Job Wrapper script to execute coex_cluster2
    
    Args:
        workspace_service_url: URL for the KBase Workspace service
        param_file: JSON parameter file; supplies workspace_name, object_name,
            out_object_name and optional clustering parameters
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """

    # create working directories; they may already exist from a previous run
    try:
        os.makedirs(RAWEXPR_DIR)
    except OSError:
        pass
    try:
        os.makedirs(CLSTR_DIR)
    except OSError:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except OSError:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    cmd_download_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)
        #raise Exception(stderr)

    logger.info("Coexpression clustering analysis")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [
        COEX_CLUSTER, '-t', 'y', '-i',
        "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(CLSTR_DIR, CLSTR_FN)
    ]

    for p in [
            'net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method',
            'minModuleSize', 'detectCutHeight'
    ]:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        if re.search(
                r'^There were \d+ warnings \(use warnings\(\) to see them\)',
                stderr):
            logger.info(stderr)
        else:
            logger.error(stderr)
            raise Exception(stderr)

    # build index for gene list
    pos_index = {
        expr['data']['row_ids'][i]: i
        for i in range(0, len(expr['data']['row_ids']))
    }
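    # e.g. if expr['data']['row_ids'] == ['geneA', 'geneB', 'geneC'] then
    # pos_index == {'geneA': 0, 'geneB': 1, 'geneC': 2} (illustrative values).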

    # parse clustering results
    cid2genelist = {}
    with open("{0}/{1}".format(CLSTR_DIR, CLSTR_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            gene, cluster = line.replace('"', '').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if (len(cid2genelist) < 1):
        logger.error("Clustering failed")
        return empty_results("Error: No cluster output", expr,
                             workspace_service_url, param, logger, ws)
        #sys.exit(4)

    logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
        feature_clusters.append({
            "id_to_pos":
            {gene: pos_index[gene]
             for gene in cid2genelist[cluster]}
        })

    ## Upload Clusters
    feature_clusters = {
        "original_data":
        "{0}/{1}".format(param['workspace_name'], param['object_name']),
        "feature_clusters":
        feature_clusters
    }

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.FeatureClusters',
            'data': feature_clusters,
            'name': (param['out_object_name'])
        }]
    })
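    # A hypothetical param_file for this wrapper (key names taken from the code
    # above; values are illustrative):
    # {
    #     "workspace_name": "my_workspace",
    #     "object_name": "my_expression_matrix",
    #     "out_object_name": "my_feature_clusters",
    #     "net_method": "simple",
    #     "clust_method": "hclust",
    #     "minModuleSize": 30
    # }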
    def generate_cummerbund_plots(self, ctx, cummerbundParams):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plots

        params    = cummerbundParams
        returnVal = params['ws_cummerbund_output']

        #Set up workspace client
        user_token = ctx['token']
        ws_client  = Workspace(url=self.__WS_URL, token=user_token)

        #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name' : params['ws_cuffdiff_id'],
            'workspace' : params['workspace_name']
            }])

        # Check if workspace has data
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            return returnVal

        cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if (cuffdiff_dir is False):
            return returnVal

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject=dict()

        # Prepare output plot list
        cummerbundplotset=[]

        # List of plots to generate
        plotlist = [
                { 'file': "dispersionplot.R",
                  'title': "Dispersion plot",
                  'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM." },


                { 'file': "fpkmscvplot.R",
                  'title': "Genes CV plot",
                  'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data." },

                { 'file': "isoformscvplot.R",
                  'title': "Isoform CV plot",
                  'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates." },

                { 'file': "densityplot.R",
                  'title': "Density plot",
                  'description': "The density plot shows the distribution of FPKM scores across samples" },

                { 'file': "csdensityrepplot.R",
                  'title': "Replicates density plot",
                  'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates" },

                { 'file': "boxplot.R",
                  'title': "Box plots",
                  'description': "The box plots show the FPKM distribution across samples." },

                { 'file': "boxrepplot.R",
                  'title': "Box plots of replicates",
                  'description': "The box plots of replicates show the FPKM distribution across sample replicates." },

                { 'file': "pairwisescatterplots.R",
                  'title': "Pairwise scatter plots",
                  'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line." },

                 { 'file': "volcanomatrixplot.R",
                  'title': "Volcano matrix plots",
                  'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off." },

                { 'file': "pcaplot.R",
                  'title': "PCA plot",
                  'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions." },

                { 'file': "pcarepplot.R",
                  'title': "PCA plot including replicates",
                  'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates." },

                { 'file': "mdsplot.R",
                  'title': "Multi-dimensional scaling plot",
                  'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. " },

                { 'file': "mdsrepplot.R",
                  'title': "Multi-dimensional scaling plot including replicates",
                  'description': "Multi-dimensional scaling plot including replicates are  similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions." }
            ]

#TODO.. Giving Rplot.pdf
#                { 'file': "dendrogramplot.R",
#                  'title': "Dendrogram",
#                  'description': "Dendrogram  based on the JS (Jensen-Shannon divergence) distance" },
#
#                { 'file': "dendrogramrepplot.R",
#                  'title': "Dendrogram including replicates",
#                  'description': "Dendrogram including replicates based on the JS (Jensen-Shannon divergence) distance" },


        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token,
                cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir)
            if status == False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["file"])


        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        #TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        res = ws_client.save_objects({
            "workspace":params['workspace_name'],
            "objects": [{
                "type":"KBaseRNASeq.cummerbund_output",
                "data":outputobject,
                "name":params["ws_cummerbund_output"]}]
            })

        #END generate_cummerbund_plots

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plots return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]
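    # Hypothetical usage sketch; object names are illustrative:
    #   cummerbundParams = {
    #       'workspace_name': 'my_workspace',
    #       'ws_cuffdiff_id': 'my_cuffdiff_output',
    #       'ws_cummerbund_output': 'my_cummerbund_plots',
    #   }
    #   [out_name] = impl.generate_cummerbund_plots(ctx, cummerbundParams)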
Example #37
def run_filter_genes(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url: URL for the KBase Workspace service
        param_file: JSON parameter file; supplies workspace_name, object_name,
            output object names and the filtering parameters
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """

    # create working directories; they may already exist from a previous run
    try:
        os.makedirs(RAWEXPR_DIR)
    except OSError:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except OSError:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except OSError:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    cmd_download_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # force ANOVA if the number of samples is two
    if ncol == 3: param['method'] = 'anova'

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s',
        "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x',
        "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y'
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))

    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        return empty_results("One of p_value or num_features must be defined",
                             expr, workspace_service_url, param, logger, ws)
        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    try:
        with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
            fe = ff.readlines()
        with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
            ff.write(
                fl
            )  # use original first line that has correct header information
            fe.pop(0)
            ff.writelines(fe)
    except:
        logger.error("Output was not found")
        return empty_results("Increase p_value or specify num_features", expr,
                             workspace_service_url, param, logger, ws)

    ## checking genelist
    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    if (len(gl) < 1):
        logger.error("No genes are selected")
        return empty_results("Increase p_value or specify num_features", expr,
                             workspace_service_url, param, logger, ws)
        #sys.exit(4)

    ## Upload FVE
    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    # Updates: change missing genome handling strategy by copying reference to working workspace
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'], '--working_directory',
        FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name',
        FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        obj_infos = ws.get_object_info_new(
            {"objects": [{
                'ref': expr['genome_ref']
            }]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        #tmp_ws = "{0}".format(obj_infos[7])
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7],
                                              obj_infos[1]))
        if obj_infos[7] != param['workspace_name']:
            #we need to copy it from the other workspace
            try:
                logger.info(
                    "trying to copy the referenced genome object : {0}".format(
                        expr['genome_ref']))
                ws.copy_object({
                    'from': {
                        'ref': expr['genome_ref']
                    },
                    'to': {
                        'workspace': param['workspace_name'],
                        'name': obj_infos[1]
                    }
                })
                # add genome_object_name only after successful copy
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
            except:
                # no permission or any issues... then, give up providing genome reference
                logger.info("".join(traceback.format_exc()))
                pass
        else:
            # it is local... we can simply add reference without copying genome
            cmd_upload_expr.append('--genome_object_name')
            cmd_upload_expr.append(obj_infos[1])

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    logger.info(" ".join(cmd_upload_expr))

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' not in expr:
        expr['description'] = "Filtered Expression Matrix"
    expr['description'] += " : Filtered by '{1}' method ".format(
        expr['description'], param['method'])

    if 'feature_mapping' in expr and 'feature_mapping' in eo:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet
    fs = {'elements': {}}
    fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(
        param['method'])

    fs['description'] += "from {0}/{1}".format(param['workspace_name'],
                                               param['object_name'])

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
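    # A hypothetical param_file for run_filter_genes (one of p_value or
    # num_features is required, as enforced above; values are illustrative):
    # {
    #     "workspace_name": "my_workspace",
    #     "object_name": "my_expression_matrix",
    #     "out_expr_object_name": "my_filtered_matrix",
    #     "out_fs_object_name": "my_feature_set",
    #     "method": "anova",
    #     "p_value": 0.05
    # }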
    def const_coex_net_clust(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN const_coex_net_clust
        # create working directories; they may already exist from a previous run
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except OSError:
            pass
        try:
            os.makedirs(self.CLSTR_DIR)
        except OSError:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except OSError:
            pass

        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)

        result = {}
        self.logger.info(
            "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV"
        )
        token = ctx['token']

        param = args

        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{
            'workspace': param['workspace_name'],
            'name': param['object_name']
        }])[0]['data']

        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token
        cmd_download_cvt_tsv = [
            self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
            '--workspace_name', param['workspace_name'], '--object_name',
            param['object_name'], '--working_directory', self.RAWEXPR_DIR,
            '--output_file_name', self.EXPRESS_FN
        ]

        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                        stderr=subprocess.PIPE,
                                        shell=True,
                                        env=eenv)
        stdout, stderr = tool_process.communicate()

        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)

        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
            #raise Exception(stderr)

        self.logger.info("Coexpression clustering analysis")

        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                  'r') as f:
            fl = f.readline()
        ncol = len(fl.split('\t'))

        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                  'wt') as s:
            s.write("0")
            for j in range(1, ncol - 1):
                s.write("\t{0}".format(j))
            s.write("\n")

        ## Run coex_cluster
        cmd_coex_cluster = [
            self.COEX_CLUSTER, '-t', 'y', '-i',
            "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o',
            "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN)
        ]

        for p in [
                'net_method', 'minRsq', 'maxmediank', 'maxpower',
                'clust_method', 'minModuleSize', 'detectCutHeight'
        ]:
            if p in param:
                cmd_coex_cluster.append("--{0}".format(p))
                cmd_coex_cluster.append(str(param[p]))

        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination

        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)

        tool_process = subprocess.Popen(cmd_coex_cluster,
                                        stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()

        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)

        if stderr is not None and len(stderr) > 0:
            if re.search(
                    r'^There were \d+ warnings \(use warnings\(\) to see them\)',
                    stderr):
                self.logger.info(stderr)
            else:
                self.logger.error(stderr)
                raise Exception(stderr)

        # build index for gene list
        pos_index = {
            expr['data']['row_ids'][i]: i
            for i in range(0, len(expr['data']['row_ids']))
        }

        # parse clustering results
        cid2genelist = {}
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), 'r') as glh:
            glh.readline()  # skip header
            for line in glh:
                gene, cluster = line.replace('"', '').split("\t")
                if cluster not in cid2genelist:
                    cid2genelist[cluster] = []
                cid2genelist[cluster].append(gene)

        if (len(cid2genelist) < 1):
            self.logger.error("Clustering failed")
            return empty_results("Error: No cluster output", expr,
                                 self.__WS_URL, param, self.logger, ws)
            #sys.exit(4)

        self.logger.info("Uploading the results onto WS")
        feature_clusters = []
        for cluster in cid2genelist:
            feature_clusters.append({
                "id_to_pos":
                {gene: pos_index[gene]
                 for gene in cid2genelist[cluster]}
            })

        ## Upload Clusters
        feature_clusters = {
            "original_data":
            "{0}/{1}".format(param['workspace_name'], param['object_name']),
            "feature_clusters":
            feature_clusters
        }

        ws.save_objects({
            'workspace':
            param['workspace_name'],
            'objects': [{
                'type': 'KBaseFeatureValues.FeatureClusters',
                'data': feature_clusters,
                'name': (param['out_object_name'])
            }]
        })
        result = {
            'workspace_name': param['workspace_name'],
            'out_object_name': param['out_object_name']
        }
        #END const_coex_net_clust

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method const_coex_net_clust return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
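    # Hypothetical invocation sketch for const_coex_net_clust; names illustrative:
    #   args = {'workspace_name': 'my_workspace',
    #           'object_name': 'my_expression_matrix',
    #           'out_object_name': 'my_feature_clusters',
    #           'clust_method': 'hclust'}
    #   [result] = impl.const_coex_net_clust({'token': auth_token}, args)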
def net_clust(args):
    ###
    # download ws object and convert them to csv
    wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN'))
    lseries = wsd.get_object({'id' : args.inobj_id,
                  'type' : 'KBaseExpression.ExpressionSeries', 
                  'workspace' : args.ws_id})['data']

    if lseries is None:
        raise COEXException("Object {} not found in workspace {}".format(args.inobj_id, args.ws_id))

    samples, sids, genome_id = {}, [], ""
    # assume only one genome id
    for gid in sorted(lseries['genome_expression_sample_ids_map'].keys()):
        genome_id = gid
        for samid in lseries['genome_expression_sample_ids_map'][gid]:
            sids.append({'ref': samid})
        samples = wsd.get_objects(sids)
        break

    cif = open(args.exp_fn, 'w')
    header = ",".join([s['data']['source_id'] for s in samples])
    cif.write(header + "\n")
    gids = samples[0]['data']['expression_levels'].keys()  # each sample has same gids
    for gid in sorted(gids):
        line = gid + ","
        line += ",".join([str(s['data']['expression_levels'][gid]) for s in samples])
        cif.write(line + "\n")
    cif.close()


    ###
    # generate network and cluster
    net_cmd_lst = ['coex_net', '-i', args.exp_fn]
    if (args.nmethod    is not None): 
        net_cmd_lst.append("-m")
        net_cmd_lst.append(args.nmethod)
    if (args.cut_off    is not None): 
        net_cmd_lst.append("-c")
        net_cmd_lst.append(args.cut_off)
    if (args.net_fn     is not None):
        net_cmd_lst.append("-o")
        net_cmd_lst.append(args.net_fn)
    p1 = Popen(net_cmd_lst, stdout=PIPE)
    out_str = p1.communicate()
    if out_str[0] is not None : print out_str[0]
    if out_str[1] is not None : print >> sys.stderr, out_str[1]
    net_cmd = " ".join(net_cmd_lst)
   
   
    clust_cmd_lst = ['coex_cluster2', '-i', args.exp_fn]
    if (args.cmethod    is not None):
        clust_cmd_lst.append("-c")
        clust_cmd_lst.append(args.cmethod)
    if (args.nmethod    is not None):
        clust_cmd_lst.append("-n")
        clust_cmd_lst.append(args.nmethod)
    if (args.k          is not None):
        clust_cmd_lst.append("-s")
        clust_cmd_lst.append(args.k)
    if (args.clust_fn   is not None):
        clust_cmd_lst.append("-o")
        clust_cmd_lst.append(args.clust_fn)
    p1 = Popen(clust_cmd_lst, stdout=PIPE)
    out_str = p1.communicate()
    if out_str[0] is not None : print out_str[0]
    if out_str[1] is not None : print >> sys.stderr, out_str[1]
    clust_cmd = " ".join(clust_cmd_lst)

   
    ###
    # Create network object
    #generate Networks datasets
    net_ds_id = args.inobj_id + ".net"
    clt_ds_id = args.inobj_id + ".clt"
 
    datasets = [
      {
        'network_type' : 'FUNCTIONAL_ASSOCIATION',
        'taxons' : [ genome_id ],
        'source_ref' : 'WORKSPACE',
        'name' : net_ds_id,
        'id' : net_ds_id,
        'description' : "Coexpression network object of " + args.inobj_id,
        'properties' : {
          'original_data_type' : 'workspace',
          'original_ws_id' : args.ws_id,
          'original_obj_id' : args.inobj_id,
          'coex_net_cmd' : net_cmd
        }
      },
      {
        'network_type' : 'FUNCTIONAL_ASSOCIATION',
        'taxons' : [ genome_id ],
        'source_ref' : 'WORKSPACE',
        'name' : clt_ds_id,
        'id' : clt_ds_id,
        'description' : "Coexpression cluster object of " + args.inobj_id,
        'properties' : {
          'original_data_type' : 'workspace',
          'original_ws_id' : args.ws_id,
          'original_obj_id' : args.inobj_id,
          'coex_clust_cmd' : clust_cmd
        }
      }
    ]
 
 
    # process coex network file
    nc = Node()
 
    cnf = open(args.net_fn, 'r')
    cnf.readline()  # skip header
    for line in cnf:
        line = line.strip()
        line = line.replace('"', '')
        values = line.split(',')
        # keep only meaningful (non-self) edges
        if values[0] != values[1]: nc.add_edge(float(values[2]), net_ds_id, values[0], 'GENE', values[1], 'GENE', 0.0)
 
 
    # process coex cluster file
    cnf = open(args.clust_fn, 'r')
    cnf.readline()  # skip header
    for line in cnf:
        line = line.strip()
        line = line.replace('"', '')
        values = line.split(',')
        nc.add_edge(1.0, clt_ds_id, values[0], 'GENE', "cluster." + values[1], 'CLUSTER', 0.0)
 
    # generate Networks object
    net_object = {
      'datasets' : datasets,
      'nodes' : nc.nodes,
      'edges' : nc.edges,
      'user_annotations' : {},
      'name' : 'Coexpression Network',
      'id' : args.outobj_id,
      'properties' : {
        'graphType' : 'edu.uci.ics.jung.graph.SparseMultigraph'
      }
    }
 
    # Store results object into workspace
    wsd.save_objects({'workspace' : args.ws_id,
                      'objects' : [{'type' : 'KBaseNetworks.Network',
                                    'data' : net_object,
                                    'name' : args.outobj_id,
                                    'meta' : {'org_obj_id' : args.inobj_id,
                                              'org_ws_id' : args.ws_id}}]})
 
    if(args.del_tmps is "true") :
        os.remove(args.exp_fn)
        os.remove(args.net_fn)
        os.remove(args.clust_fn)
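# Hypothetical argparse-style input for net_clust (attribute names are the ones
# the function reads above; values are illustrative):
#   args.ws_url     = 'https://kbase.us/services/ws/'
#   args.ws_id      = 'my_workspace'
#   args.inobj_id   = 'my_expression_series'
#   args.outobj_id  = 'my_coex_network'
#   args.exp_fn     = 'expression.csv'
#   args.net_fn     = 'network.csv'
#   args.clust_fn   = 'clusters.csv'
#   args.nmethod, args.cmethod, args.cut_off, args.k = 'simple', 'hclust', '0.8', '5'
#   args.del_tmps   = 'true'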
    def filter_BlastOutput(self, ctx, params):
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_BlastOutput
        user_token=ctx['token']
        ws_client=Workspace(url=self.__WS_URL, token=user_token)
        blast_outputs=ws_client.get_objects([{'name':params['in_id'], 
                                              'workspace': params['ws_id']}])

            

        fs = {'elements': {}}
        fs['description'] = "FeatureSet from BlastOutput by "
        printedEvalue = False
        printedEntries = False
        if 'evalue' in params and params['evalue'] != "":
            fs['description'] += " E-value:{0}".format(params['evalue'])
            printedEvalue = True
        if 'entries' in params and (params['entries'] != "" or params['entries'] > 0):
            if(printedEvalue): fs['description'] += ","
            fs['description'] += " # of entries :{0}".format(params['entries'])
            printedEntries = True
        if not printedEvalue and not printedEntries:
            fs['description'] += "no filtering"
        
        if len(blast_outputs) != 1:
            fs['description'] = "No such blast output object was found : {0}/{1}".format(param['workspace_name'], param['object_name'])
        else:
            fm = {}
            f2g = {}
            for boid in blast_outputs[0]['data']['BlastOutput_iterations']['Iteration']:
                for hitd in boid['Iteration_hits']['Hit']:
                    print hitd['Hit_def']
                    ali = hitd['Hit_def'].find('#')
                    if ali < 0: continue
                    fid = hitd['Hit_def'][0:ali]
                    gri = hitd['Hit_def'].find('#', ali+1)
                    if fid not in f2g: f2g[fid] = {}
                    if gri >= 0 and gri != (ali + 1):
                        grid = hitd['Hit_def'][(ali+1):gri]
                        f2g[fid][grid] = 1
                    for hspd in hitd['Hit_hsps']['Hsp']:
                        if fid in fm:
                            if float(hspd['Hsp_evalue']) < fm[fid]:
                                fm[fid] = float(hspd['Hsp_evalue'])
                        else: fm[fid] = float(hspd['Hsp_evalue'])
           
            fms = sorted(fm.items(), key=lambda x: x[1], reverse=False)
            bol = len(fms)
            if params['entries'] != "" or int(params['entries']) > 0:
                if(int(params['entries']) < bol):
                    bol = int(params['entries'])
            for i in range(bol):
                if(fms[i][1] > float(params['evalue'])): break
                if fms[i][0] in f2g:
                    fs['elements'][fms[i][0]] = f2g[fms[i][0]].keys()
                else:
                    fs['elements'][fms[i][0]] = []

        ws_client.save_objects(
            {"workspace":params['ws_id'],
            "objects": [{
                "type":"KBaseCollections.FeatureSet",
                "data":fs,
                "name":params['out_id']}
            ]})

        #pprint(fs)
        returnVal = {'obj_name' : params['out_id'], 'ws_id' : params['ws_id']}

        #END filter_BlastOutput

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_BlastOutput return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
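    # Hypothetical usage sketch; object names are illustrative:
    #   params = {
    #       'ws_id': 'my_workspace',
    #       'in_id': 'my_blast_output',   # BlastOutput object to filter
    #       'out_id': 'my_feature_set',   # FeatureSet to create
    #       'evalue': '1e-5',
    #       'entries': '100',
    #   }
    #   [ret] = impl.filter_BlastOutput(ctx, params)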
Example #41
    def diff_p_distribution(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN diff_p_distribution
        # create working directories; they may already exist from a previous run
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except OSError:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except OSError:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except OSError:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
 
 
        self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        ncol = len(expr['data']['col_ids'])
        
        # force ANOVA if the number of samples is two
        if ncol == 3: param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
           '-m', param['method'], '-n', '10', '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y', '-j', self.PVFDT_FN]
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## loading pvalue distribution FDT
        pvfdt = {'row_labels': [], 'column_labels': [], 'data': [[]]}
        pvfdt = OrderedDict(pvfdt)
        with open(self.PVFDT_FN, 'r') as myfile:
           pvfdt = json.load(myfile)
        data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
        pvfdt['id'] = data_obj_name
 
 
        fig_properties = {"xlabel" : "-log2(p-value)", "ylabel" : "Number of features", "xlog_mode" : "-log2", "ylog_mode" : "none", "title" : "Histogram of P-values", "plot_type" : "histogram"}
        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : pvfdt,
                                                                              'name' : data_obj_name}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              'name' : (param['out_figure_object_name'])}]})
        result = fig_properties
        #END diff_p_distribution

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method diff_p_distribution return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
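    # Hypothetical args for diff_p_distribution (names illustrative). The
    # workspace_name may contain a ${user_id} template placeholder, which is
    # substituted with the caller's user id above:
    #   args = {'workspace_name': '${user_id}:my_workspace',
    #           'object_name': 'my_expression_matrix',
    #           'method': 'anova',
    #           'out_figure_object_name': 'my_pvalue_histogram'}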
    def filter_genes(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN filter_genes
        # create working directories; they may already exist from a previous run
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except OSError:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except OSError:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except OSError:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        cmd_download_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        # force ANOVA if the number of samples is two
        if ncol == 3: param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y']
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
        if 'p_value' not in param and 'num_features' not in param:
          self.logger.error("One of p_value or num_features must be defined");
          return empty_results("One of p_value or num_features must be defined", expr,self.__WS_URL, param, self.logger, ws)
          #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## Header correction
        try:
            with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'r') as ff:
                fe = ff.readlines()
            with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'w') as ff:
                ff.write(fl) # use original first line that has correct header information
                fe.pop(0)
                ff.writelines(fe)
        except:
            self.logger.error("Output was not found");
            return empty_results("Increase p_value or specify num_features", expr,self.__WS_URL, param, self.logger, ws)
            
        
        ## checking genelist
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh:
          gl = glh.readlines()
        gl = [x.strip('\n') for x in gl]
 
        if len(gl) < 1:
          self.logger.error("No genes are selected")
          return empty_results("Increase p_value or specify num_features", expr,self.__WS_URL, param, self.logger, ws)
          #sys.exit(4)
 
        ## Upload FVE
        # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
        # Updates: change missing genome handling strategy by copying reference to working workspace
        cmd_upload_expr = [self.TSV_2_FVE, '--workspace_service_url', self.__WS_URL, 
                                          '--object_name', param['out_expr_object_name'],
                                          '--working_directory', self.FINAL_DIR,
                                          '--input_directory', self.FLTRD_DIR,
                                          '--output_file_name', self.FINAL_FN
                              ]
        tmp_ws = param['workspace_name']
        if 'genome_ref' in expr:
            obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0]
 
            if len(obj_infos) < 1:
                self.logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
                raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))
 
            #tmp_ws = "{0}".format(obj_infos[7])
            self.logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7], obj_infos[1]))
            if obj_infos[7] != param['workspace_name']:
                #we need to copy it from the other workspace
                try:
                  self.logger.info("trying to copy the referenced genome object : {0}".format(expr['genome_ref']))
                  ws.copy_object({'from' : {'ref' : expr['genome_ref']},'to' : {'workspace': param['workspace_name'], 'name' : obj_infos[1]}})
                  # add genome_object_name only after successful copy
                  cmd_upload_expr.append('--genome_object_name')
                  cmd_upload_expr.append(obj_infos[1])
                except:
                  # no permission or other issue; give up providing the genome reference
                  self.logger.info("".join(traceback.format_exc()))
                  pass
            else:
                # it is local... we can simply add reference without copying genome
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
 
        # workspace name to upload into
        cmd_upload_expr.append('--workspace_name')
        cmd_upload_expr.append(tmp_ws)
 
        self.logger.info(" ".join(cmd_upload_expr))
 
        tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        
        with open("{0}/{1}".format(self.FINAL_DIR,self.FINAL_FN),'r') as et:
          eo = json.load(et)
 
        if 'description' not in expr: 
            expr['description'] = "Filtered Expression Matrix"
        expr['description'] += " : Filtered by '{1}' method ".format(expr['description'], param['method'])
 
        if 'feature_mapping' in expr and 'feature_mapping' in eo:
            expr['feature_mapping'] = eo['feature_mapping']
        expr['data'] = eo['data']
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                              'data' : expr,
                                                                              'name' : (param['out_expr_object_name'])}]})
 
        ## Upload FeatureSet
        fs ={'elements': {}}
        fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
 
        fs['description'] += "from {0}/{1}".format(param['workspace_name'], param['object_name'])
 
        for g in gl:
          if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
          else:
            fs['elements'][g] = []
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                              'data' : fs,
                                                                              'name' : (param['out_fs_object_name'])}]})
        result = {'workspace_name' : param['workspace_name'], 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']}
        #END filter_genes

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method filter_genes return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
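The sample file that coex_filter consumes above is just one tab-separated row of condition indices. A minimal standalone sketch of what the loop near the top of this example writes (the value of ncol here is hypothetical; in the example it is the number of tab-separated columns detected in the expression TSV, gene-id column included):

# Sketch: the one-line sample file written for coex_filter.
ncol = 5  # hypothetical column count (gene-id column + 4 samples)
with open('sample.tsv', 'wt') as s:
    s.write('0')
    for j in range(1, ncol - 1):
        s.write('\t{0}'.format(j))
    s.write('\n')
# sample.tsv now contains the single row: 0<TAB>1<TAB>2<TAB>3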
Example #43
    def filter_genes(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN filter_genes
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)
 
        provenance = [{}]
        if 'provenance' in ctx:
                provenance = ctx['provenance']
        provenance[0]['input_ws_objects']=[workspace_name+'/'+param['object_name']]
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
 
        self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        ncol = len(expr['data']['col_ids'])
        
        # force ANOVA if the number of samples is two
        if ncol == 3: param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y']
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
        if 'p_value' not in param and 'num_features' not in param:
          self.logger.error("One of p_value or num_features must be defined");
          return error_report("One of p_value or num_features must be defined", expr,self.__WS_URL, workspace_name, provenance, ws)
          #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## checking genelist
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh:
          gl = glh.readlines()
        gl = [x.strip('\n') for x in gl]
 
        if(len(gl) < 1) :
          self.logger.error("No genes are selected")
          return error_report("Increase p_value or specify num_features", expr,self.__WS_URL, workspace_name, provenance, ws)
          #sys.exit(4)
 
        ## Upload FVE
        if 'description' not in expr: 
            expr['description'] = "Filtered Expression Matrix"
        expr['description'] += " : Filtered by '{0}' method ".format(param['method'])
 
        expr = self._subselectExp(expr, gl)
 
        ex_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                              'data' : expr,
                                                                              'name' : (param['out_expr_object_name'])}]})[0]
 
        ## Upload FeatureSet
        fs ={'elements': {}}
        fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
 
        fs['description'] += "from {0}/{1}".format(workspace_name, param['object_name'])
 
        for g in gl:
          if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
          else:
            fs['elements'][g] = []
 
        fs_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                              'data' : fs,
                                                                              'name' : (param['out_fs_object_name'])}]})[0]

        ## Create report object:
	report = "Filtering expression matrix using {0} on {1}".format(param['method'],param['object_name'])
        reportObj = {
                        'objects_created':[{
                                'ref':"{0}/{1}/{2}".format(fs_info[6], fs_info[0], fs_info[4]),
                                'description':'Filtered FeatureSet' },
                             {
                                'ref':"{0}/{1}/{2}".format(ex_info[6], ex_info[0], ex_info[4]),
                                'description':'Filtered ExpressionMatrix'
                             }],
                        'text_message':report
                    }

        # generate a name for the Method report (uuid.getnode() is machine-based,
        # so the name is stable per host rather than globally unique)
        reportName = 'FilterExpression_'+str(hex(uuid.getnode()))
        report_info = ws.save_objects({
                                        'id':ex_info[6],
                                        'objects':[
                                        {
                                        'type':'KBaseReport.Report',
                                        'data':reportObj,
                                        'name':reportName,
                                        'meta':{},
                                        'hidden':1, 
                                        'provenance':provenance
                                        }
                                        ]
                                        })[0]

        result = { "report_name" : reportName,"report_ref" : "{0}/{1}/{2}".format(report_info[6],report_info[0],report_info[4]) }



        #result = {'workspace_name' : workspace_name, 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']}
        #END filter_genes

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method filter_genes return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
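A pattern worth noting throughout these examples: an object reference string is assembled from the info tuple returned by save_objects, where position 6 is the workspace id, position 0 the object id, and position 4 the version. A small sketch with a hypothetical info tuple:

# Hypothetical info tuple, shaped like ws.save_objects(...)[0]
info = [12, 'my_report', 'KBaseReport.Report-1.0', '2015-01-01T00:00:00+0000',
        3, 'someuser', 4567, 'someworkspace', 'checksum', 1024, {}]
ref = "{0}/{1}/{2}".format(info[6], info[0], info[4])
# ref == '4567/12/3', the ws_id/object_id/version form used for report_ref above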
    def view_heatmap(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN view_heatmap
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Loading data")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        fc = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
        if 'original_data' not in fc:
            raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix")
        oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0]

        df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids'])
#        cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, 
#                                          '--workspace_name', oexpr['info'][7],
#                                          '--object_name', oexpr['info'][1],
#                                          '--working_directory', self.RAWEXPR_DIR,
#                                          '--output_file_name', self.EXPRESS_FN
#                              ]
# 
#        # need shell in this case because the java code is depending on finding the KBase token in the environment
#        #  -- copied from FVE_2_TSV
#        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
#        stdout, stderr = tool_process.communicate()
#        
#        if stdout is not None and len(stdout) > 0:
#            self.logger.info(stdout)
# 
#        if stderr is not None and len(stderr) > 0:
#            self.logger.info(stderr)
# 
#        df = pd.read_csv("{0}/{1}".format(self.RAWEXPR_DIR,self.EXPRESS_FN), sep='\t')
#        df2 = df[df.columns[1:]]
#        rn = df[df.columns[0]]
#        df2.index = rn

        # L2 normalization
        df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

        # type: 'level' (a.k.a. 'untransformed'), 'ratio', or 'log-ratio'
        # scale: probably 'raw', 'ln', 'log2', or 'log10'
        self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] ))
        if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
        elif oexpr['data']['type'] == 'ratio':
            fc_df = df2.apply(np.log2)
        elif oexpr['data']['type'] == 'log-ratio':
            fc_df = df2
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df/np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df/np.log(2)
            else:
                pass

        else: # do the same thing with simple level or untransformed
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
       
        self.logger.info("Compute cluster statistics")

        cl = {}
        afs = []
        cid = 1

        c_stat = pd.DataFrame()
        for cluster in fc['feature_clusters']:
         
          try: 
            fs  = cluster['id_to_pos'].keys()
          except:
            continue # couldn't find feature_set

          fsn = "Cluster_{0}".format(cid)
          cid +=1
          c_stat.loc[fsn,'size'] = len(fs)
          if 'meancor' in cluster:
              c_stat.loc[fsn,'mcor'] = cluster['meancor']
          else:
            pass
            # TODO: Add mean cor calculation later
            #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN

          if 'quantile' in param:
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(float(param['quantile']))
          else:
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75)
         

          c1 = df3.loc[fs,].sum(axis=0)
          if df3.loc[fs,].shape[0] < 1: # empty
            continue
          cl[fsn] = fs
          #afs.extend(fs)

          #c1 = df3.loc[fs,].sum(axis=0)
          #c1 = c1 / np.sqrt(c1.pow(2).sum())
          #if(len(cl.keys()) == 1):
          #  centroids = c1.to_frame(fsn).T
          #else:
          #  centroids.loc[fsn] = c1

        # now we have centroids and statistics
        # let's subselect clusters
        min_features = 200
        if 'min_features' in param :
          min_features = param['min_features']
        
        c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max()
        c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max()
        
        if 'use_norm_weight' in param and param['use_norm_weight'] != 0:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0                             * c_stat.loc[:,'nstdstat']
        else:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1                             * c_stat.loc[:,'stdstat']

        c_stat.sort_values('weight', inplace=True, ascending=False)

        pprint(c_stat)

        centroids = pd.DataFrame()
        for i in range(c_stat.shape[0]):
            fsn = c_stat.index[i]
            fs = cl[fsn]
            if i != 0 and len(afs) + len(fs) > min_features :
                break
           
            afs.extend(fs)

            c1 = df3.loc[fs,].sum(axis=0)
            c1 = c1 / np.sqrt(c1.pow(2).sum())
            if(centroids.shape[0] < 1):
              centroids = c1.to_frame(fsn).T
            else:
              centroids.loc[fsn] = c1
           
        pprint(centroids)
        
        if len(cl.keys()) == 0:
            raise Exception("No feature ids were mapped to dataset or no clusters were selected")
        
        # dataset centroid
        dc = df3.loc[afs,].sum(axis=0)
        dc = dc / np.sqrt(dc.pow(2).sum())
    
        
        self.logger.info("Ordering Centroids and Data")
        # the cluster centroid farthest from the dataset centroid
        fc = (centroids * dc).sum(axis=1).idxmin()
        # the cluster centroid farthest from fc
        ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin()
        
        # major direction to order on unit ball space
        md = centroids.loc[ffc,] - centroids.loc[fc,]
        
        # unnormalized component of the projection onto the major direction (the norm of md is dropped since it is the same for all)
        corder = (centroids * md).sum(axis=1).sort_values() # cluster order
        coidx = corder.index
        
        dorder =(df3.loc[afs,] * md).sum(axis=1).sort_values() # data order
        
        # get first fs table    
        fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []}
        fig_properties['ygtick_labels'] = coidx.tolist()

        if 'fold_change' in param and param['fold_change'] == 1:
            frange = 2
            if 'fold_change_range' in param:
                frange = float(param['fold_change_range'])
            final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)

            if 'fold_cutoff' in param and param['fold_cutoff'] == 1:
                final[final > frange] = frange
                final[final < - frange] = - frange
            else:
                fc_df0b = final.sub(final.min(axis=1), axis=0)
                final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
        else:
            final=df2.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = df2.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)
        
        ## build the FloatDataTable (FDT) for the heatmap
        fdt = {'row_labels': [], 'column_labels': [], 'data': [[]]}
        #fdt = OrderedDict(fdt)
        fdt['data'] = final.T.as_matrix().tolist() # make sure Transpose
        fdt['row_labels'] = final.columns.tolist()
        fdt['column_labels'] = final.index.tolist()
        # TODO: Add group label later
        fdt['id'] = "{0}.fdt".format(param['out_figure_object_name'])
 
        self.logger.info("Saving the results")
        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : fdt,
                                                                              'name' : "{0}.fdt".format(param['out_figure_object_name'])}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              'name' : (param['out_figure_object_name'])}]})
        result = fig_properties
        #END view_heatmap

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method view_heatmap return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
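The view_heatmap code first L2-normalizes each row of the expression matrix before computing cluster centroids. A self-contained pandas sketch of the same operation on a toy matrix:

import pandas as pd

df2 = pd.DataFrame([[3.0, 4.0], [1.0, 0.0]],
                   index=['g1', 'g2'], columns=['c1', 'c2'])
# divide each row by its Euclidean norm, exactly as in view_heatmap
df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)
# row 'g1' becomes [0.6, 0.8]; every row of df3 now has unit L2 norm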
Example #45
    def const_coex_net_clust(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN const_coex_net_clust
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.CLSTR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)

        provenance = [{}]
        if 'provenance' in ctx:
                provenance = ctx['provenance']
        provenance[0]['input_ws_objects']=[workspace_name+'/'+param['object_name']]
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
 
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token
        self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        ncol = len(expr['data']['col_ids'])
        
        # grouping information 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_cluster
        cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                           '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 
                           '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN) ]
 
        for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight']:
           if p in param:
             cmd_coex_cluster.append("--{0}".format(p))
             cmd_coex_cluster.append(str(param[p]))
  
 
        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
              self.logger.info(stderr)
            else:
              self.logger.error(stderr)
              raise Exception(stderr)
 
        
        # build index for gene list
        pos_index ={expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids']))}
 
 
        # parse clustering results
        cid2genelist = {}
        cid2stat = {}
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                cluster, mcor, msec = line.rstrip().replace('"','').split("\t")
                cid2stat[cluster]= [mcor, msec]
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                gene, cluster = line.rstrip().replace('"','').split("\t")
                if cluster not in cid2genelist:
                    cid2genelist[cluster] = []
                cid2genelist[cluster].append(gene)
 
        if(len(cid2genelist) < 1) :
          self.logger.error("Clustering failed")
          return error_report("Error: No cluster output", expr,self.__WS_URL, workspace_name, provenance, ws)
          #sys.exit(4)
 
        self.logger.info("Uploading the results onto WS")
        feature_clusters = []
        for cluster in cid2genelist:
            feature_clusters.append( {"meancor": float(cid2stat[cluster][0]), "msec": float(cid2stat[cluster][1]), "id_to_pos" : { gene : pos_index[gene] for gene in cid2genelist[cluster]}})

        ## Upload Clusters
        feature_clusters ={"original_data": "{0}/{1}".format(workspace_name,param['object_name']),
                           "feature_clusters": feature_clusters}
 
        cl_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseFeatureValues.FeatureClusters',
                                                                          'data' : feature_clusters,
                                                                          'name' : (param['out_object_name'])}]})[0]
        ## Create report object:
	report = "Clustering expression matrix using WGCNA on {0}".format(param['object_name'])
        reportObj = {
                        'objects_created':[{
                                'ref':"{0}/{1}/{2}".format(cl_info[6], cl_info[0], cl_info[4]),
                                'description':'WGCNA FeatureClusters' 
                             }],
                        'text_message':report
                    }

        # generate a name for the Method report (uuid.getnode() is machine-based,
        # so the name is stable per host rather than globally unique)
        reportName = 'WGCNA_Clusters_'+str(hex(uuid.getnode()))
        report_info = ws.save_objects({
                                        'id':cl_info[6],
                                        'objects':[
                                        {
                                        'type':'KBaseReport.Report',
                                        'data':reportObj,
                                        'name':reportName,
                                        'meta':{},
                                        'hidden':1, 
                                        'provenance':provenance
                                        }
                                        ]
                                        })[0]

        result = { "report_name" : reportName,"report_ref" : "{0}/{1}/{2}".format(report_info[6],report_info[0],report_info[4]) }
        #result = {'workspace_name' : workspace_name, 'out_object_name' : param['out_object_name']}
        #result = {'workspace' : workspace_name, 'output' : param['out_object_name']}
        #END const_coex_net_clust

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method const_coex_net_clust return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
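coex_cluster writes two tab-separated files with R-style quoted fields: a per-cluster statistics table (cluster id, mean correlation, ...) and a gene-to-cluster membership table; const_coex_net_clust parses both by stripping quotes and splitting on tabs. A standalone sketch of the same parsing over hypothetical file contents:

# Hypothetical contents of the cluster membership file (CLSTR_FN),
# header first, fields quoted the way R's write.table emits them.
lines = ['"gene"\t"cluster"', '"g1"\t"1"', '"g2"\t"1"', '"g3"\t"2"']

cid2genelist = {}
for line in lines[1:]:  # skip header
    gene, cluster = line.rstrip().replace('"', '').split('\t')
    cid2genelist.setdefault(cluster, []).append(gene)
# cid2genelist == {'1': ['g1', 'g2'], '2': ['g3']}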
def filter_expression(args):
    ###
    # download ws object and convert it to CSV
    wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN'))
    lseries = wsd.get_object({'id' : args.inobj_id,
                  'type' : 'KBaseExpression.ExpressionSeries', 
                  'workspace' : args.ws_id})['data']

    if lseries is None:
        raise COEXException("Object {} not found in workspace {}".format(args.inobj_id, args.ws_id))

    samples, sids, genome_id = {}, [], ""
    # assume only one genome id
    for gid in sorted(lseries['genome_expression_sample_ids_map'].keys()):
        genome_id = gid
        for samid in lseries['genome_expression_sample_ids_map'][gid]:
            sids.append({'ref': samid})
        samples = wsd.get_objects(sids)
        break

    cif = open(args.exp_fn, 'w')
    header = ",".join([s['data']['source_id'] for s in samples])
    cif.write(header + "\n")

    # find common gene list
    gids = set(samples[0]['data']['expression_levels'].keys())  # each sample has same gids
    for s in samples:
        gids = gids.intersection(set(s['data']['expression_levels'].keys()))
    for gid in sorted(gids):
        line = gid + ","
        line += ",".join([str(s['data']['expression_levels'][gid]) for s in samples])
        cif.write(line + "\n")
    cif.close()

    sif = open(args.rp_smp_fn, 'w')
    sample = ",".join(map(str, range(len(samples))))
    sif.write(sample + "\n")
    sif.close()

    ###
    # execute filtering
    flt_cmd_lst = ['coex_filter', "-i", args.exp_fn]
    if (args.method     is not None): 
        flt_cmd_lst.append('-m')
        flt_cmd_lst.append(args.method)
    if (args.p_value    is not None): 
        flt_cmd_lst.append('-p')
        flt_cmd_lst.append(args.p_value)
    if (args.num_genes  is not None): 
        flt_cmd_lst.append('-n')
        flt_cmd_lst.append(args.num_genes)
    if (args.flt_out_fn is not None): 
        flt_cmd_lst.append('-o')
        flt_cmd_lst.append(args.flt_out_fn)
    if (args.rp_smp_fn  is not None): 
        flt_cmd_lst.append('-s')
        flt_cmd_lst.append(args.rp_smp_fn)

    p1 = Popen(flt_cmd_lst, stdout=PIPE)
    out_str = p1.communicate()
    # print output message for error tracking
    if out_str[0] is not None : print out_str[0]
    if out_str[1] is not None : print >> sys.stderr, out_str[1]
    flt_cmd = " ".join(flt_cmd_lst)
   
    ###
    # put it back to workspace
    elm = {}
    fif = open(args.flt_out_fn, 'r')
    fif.readline()  # skip header

    nsamples = len(samples)
    for i in range(nsamples): elm[i] = {}

    for line in fif:
        line = line.strip()
        values = line.split(',')
        gene_id = values[0].replace("\"", "")
        for i in range(nsamples): elm[i][gene_id] = float(values[i + 1])
 
    data_list = []
    sid_list = []
    for i in range(nsamples):
        samples[i]['data']['expression_levels'] = elm[i]
        if samples[i]['data']['title'] is None: samples[i]['data']['title'] = " Filtered by coex-filter-genes"
        else: samples[i]['data']['title'] += " filtered by coex-filter-genes"
        if samples[i]['data']['description'] is None: samples[i]['data']['description'] = "Generated by " + flt_cmd
        else: samples[i]['data']['description'] += " Generated by " + flt_cmd
        samples[i]['data']['id'] += ".filtered"
        samples[i]['data']['source_id'] += ".filtered"
        data_list.append({'type' : 'KBaseExpression.ExpressionSample', 'data' : samples[i]['data'], 'name' : samples[i]['data']['id']})
    sv_rst = wsd.save_objects({'workspace' : args.ws_id, 'objects' : data_list})
    for i in range(nsamples): sid_list.append(str(sv_rst[i][6]) + "/" + str(sv_rst[i][0]) + "/" + str(sv_rst[i][4]))
 
    data_list = []
    # assume only one genome id
    lseries['genome_expression_sample_ids_map'][genome_id] = sid_list
    lseries['title'] += " filtered by coex_filter for " + genome_id
    lseries['source_id'] += ".filtered"
    lseries['id'] = args.outobj_id
    data_list.append({'type' : 'KBaseExpression.ExpressionSeries', 'data' : lseries, 'name' : lseries['id'], 'meta' : {'org.series' : args.inobj_id}})
    wsd.save_objects({'workspace' : args.ws_id, 'objects' : data_list})

    if args.del_tmps == "true":
        os.remove(args.exp_fn)
        os.remove(args.rp_smp_fn)
        os.remove(args.flt_out_fn)
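Several of the heatmap methods above convert log10- or ln-scaled matrices to log2 by dividing by log10(2) or log(2). That is just the change-of-base identity log2(x) = log10(x)/log10(2) = ln(x)/ln(2), which a quick numpy check confirms:

import numpy as np

x = 8.0
assert np.isclose(np.log10(x) / np.log10(2), np.log2(x))  # both 3.0
assert np.isclose(np.log(x) / np.log(2), np.log2(x))      # both 3.0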
Example #47
    def view_heatmap(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN view_heatmap
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Loading data")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        fc = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
        if 'original_data' not in fc:
            raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix")
        oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0]

        df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids'])

        # L2 normalization
        df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

        # type: 'level' (a.k.a. 'untransformed'), 'ratio', or 'log-ratio'
        # scale: probably 'raw', 'ln', 'log2', or 'log10'
        self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] ))
        # do default behavior
        factor = 0.125
        fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
        if param['control_condition'] in fc_df.columns:
            fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0)).apply(np.log2)
        else:
            fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
        # fc_df may be recomputed below, depending on the matrix type and scale
        if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
        elif oexpr['data']['type'] == 'ratio':
            fc_df = df2.apply(np.log2)
        elif oexpr['data']['type'] == 'log-ratio':
            fc_df = df2
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df/np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df/np.log(2)
            else:
                pass

        else: # do the same thing with simple level or untransformed
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition'] in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
       
        self.logger.info("Compute cluster statistics")

        cl = {}
        afs = []
        cid = 1

        c_stat = pd.DataFrame()
        for cluster in fc['feature_clusters']:
         
          try: 
            fs  = cluster['id_to_pos'].keys()
          except:
            continue # couldn't find feature_set

          fsn = "Cluster_{0}".format(cid)
          cid +=1
          c_stat.loc[fsn,'size'] = len(fs)
          if 'meancor' in cluster:
              c_stat.loc[fsn,'mcor'] = cluster['meancor']
          else:
            pass
            # TODO: Add mean cor calculation later
            #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN

          if 'quantile' in param:
              # clamp quantile to the [0, 1] range
              qt = float(param['quantile'])
              if qt > 1.0: qt = 1.0
              if qt < 0.0: qt = 0.0
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(qt)
          else:
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75)
         

          c1 = df3.loc[fs,].sum(axis=0)
          if df3.loc[fs,].shape[0] < 1: # empty
            continue
          cl[fsn] = fs
          #afs.extend(fs)

          #c1 = df3.loc[fs,].sum(axis=0)
          #c1 = c1 / np.sqrt(c1.pow(2).sum())
          #if(len(cl.keys()) == 1):
          #  centroids = c1.to_frame(fsn).T
          #else:
          #  centroids.loc[fsn] = c1

        # now we have centroids and statistics
        # let's subselect clusters
        min_features = 200
        if 'min_features' in param :
          min_features = param['min_features']
        
        c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max()
        c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max()
        
        if 'use_norm_weight' in param and param['use_norm_weight'] != 0:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0                             * c_stat.loc[:,'nstdstat']
        else:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1                             * c_stat.loc[:,'stdstat']

        c_stat.sort_values('weight', inplace=True, ascending=False)

        pprint(c_stat)

        centroids = pd.DataFrame()
        for i in range(c_stat.shape[0]):
            fsn = c_stat.index[i]
            fs = cl[fsn]
            if i != 0 and len(afs) + len(fs) > min_features :
                break
           
            afs.extend(fs)

            c1 = df3.loc[fs,].sum(axis=0)
            c1 = c1 / np.sqrt(c1.pow(2).sum())
            if(centroids.shape[0] < 1):
              centroids = c1.to_frame(fsn).T
            else:
              centroids.loc[fsn] = c1
           
        pprint(centroids)
        
        if len(cl.keys()) == 0:
            raise Exception("No feature ids were mapped to dataset or no clusters were selected")
        
        # dataset centroid
        dc = df3.loc[afs,].sum(axis=0)
        dc = dc / np.sqrt(dc.pow(2).sum())
    
        
        self.logger.info("Ordering Centroids and Data")
        # the cluster centroid farthest from the dataset centroid
        fc = (centroids * dc).sum(axis=1).idxmin()
        # the cluster centroid farthest from fc
        ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin()
        
        # major direction to order on unit ball space
        md = centroids.loc[ffc,] - centroids.loc[fc,]
        
        # unnormalized component of the projection onto the major direction (the norm of md is dropped since it is the same for all)
        corder = (centroids * md).sum(axis=1).sort_values() # cluster order
        coidx = corder.index
        
        dorder =(df3.loc[afs,] * md).sum(axis=1).sort_values() # data order
        
        # get first fs table    
        fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []}
        fig_properties['ygtick_labels'] = coidx.tolist()

        if 'fold_change' in param and param['fold_change'] == 1:
            frange = 2
            if 'fold_change_range' in param:
                frange = float(param['fold_change_range'])
            final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)

            if 'fold_cutoff' in param and param['fold_cutoff'] == 1:
                final[final > frange] = frange
                final[final < - frange] = - frange
            else:
                fc_df0b = final.sub(final.min(axis=1), axis=0)
                final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
        else:
            final=df2.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = df2.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)
        
        ## build the FloatDataTable (FDT) for the heatmap
        fdt = {'row_labels': [], 'column_labels': [], 'data': [[]]}
        #fdt = OrderedDict(fdt)
        # NaN to None
        final = final.where(pd.notnull(final),None)
        fdt['data'] = final.T.as_matrix().tolist() # make sure Transpose
        fdt['row_labels'] = final.columns.tolist()
        fdt['column_labels'] = final.index.tolist()
        # TODO: Add group label later
        fdt['id'] = "{0}.fdt".format(param['out_figure_object_name'])
 
        self.logger.info("Saving the results")
        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : fdt,
                                                                              'hidden':1, 
                                                                              'name' : "{0}.fdt".format(param['out_figure_object_name'])}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              #'hidden':1, 
                                                                              'name' : "{0}".format(param['out_figure_object_name'])}]})
                                                                              #'name' : "{0}.fp".format(param['out_figure_object_name'])}]})

        #mchp = {}
        #mchp['figure_obj'] = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        #sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.MulticlusterHeatmapPlot',
        #                                                                      'data' : mchp,
        #                                                                      'name' : (param['out_figure_object_name'])}]})

        result = fig_properties
        #END view_heatmap

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method view_heatmap return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
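In the fold-change branch above, values are either hard-clipped to [-frange, frange] when fold_cutoff is set, or min-max rescaled per row into that range otherwise. A toy pandas sketch of the two branches (frange fixed at the default of 2):

import pandas as pd

frange = 2.0
final = pd.DataFrame([[-5.0, 1.0, 4.0]], index=['g1'], columns=['a', 'b', 'c'])

# branch 1: hard clipping, as with fold_cutoff == 1
clipped = final.copy()
clipped[clipped > frange] = frange
clipped[clipped < -frange] = -frange      # row becomes [-2.0, 1.0, 2.0]

# branch 2: per-row min-max rescale into [-frange, frange]
fc_df0b = final.sub(final.min(axis=1), axis=0)
rescaled = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
# row becomes [-2.0, ~0.67, 2.0]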
Example #48
    def calculate(self, ctx, input):
        # ctx is the context object
        # return variables are: output
        #BEGIN calculate
        ''' Compute reaction probabilities from a probabilistic annotation.

            The input dictionary must contain the following keys:
            probanno: Name of ProbAnno object to input
            probanno_workspace: Workspace from which to grab the ProbAnno object
            rxnprobs: Name of RxnProbs object
            rxnprobs_workspace: Workspace to which to save the RxnProbs object

            The following keys are optional:
            verbose: Print lots of messages on the progress of the algorithm
            template_model: Name of TemplateModel object
            template_workspace: Workspace from which to grab TemplateModel object

            @param ctx Current context object
            @param input Dictionary with input parameters for function
            @return Object info for RxnProbs object
            @raise WrongVersionError when ProbAnno object version number is invalid
            @raise ValueError when template_workspace input argument is not specified
        '''
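        # A hypothetical input dictionary for this method, using only the
        # keys documented in the docstring above, might look like:
        #   { 'probanno': 'my.probanno', 'probanno_workspace': 'my_ws',
        #     'rxnprobs': 'my.rxnprobs', 'rxnprobs_workspace': 'my_ws',
        #     'verbose': False }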

        # Sanity check on input arguments
        input = self._checkInputArguments(ctx, input, 
                                          ["probanno", "probanno_workspace", "rxnprobs", "rxnprobs_workspace"], 
                                          { "verbose" : False ,
                                            "template_model" : None,
                                            "template_workspace" : None
                                          }
                                         )

        # Make sure the static database files are ready.
        self._checkDatabaseFiles(ctx)

        # Set log level to DEBUG when the verbose parameter is enabled.
        if input['verbose']:
            ctx.set_log_level(log.DEBUG)
        
        # Create a workspace client.
        wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])
        
        # Get the ProbAnno object from the specified workspace.
        probannoObjectId = make_object_identity(input["probanno_workspace"], input["probanno"])
        objectList = wsClient.get_objects( [ probannoObjectId ] )
        probannoObject = objectList[0]
        if probannoObject['info'][2] != ProbAnnoType:
            message = "ProbAnno object type %s is not %s for object %s" %(probannoObject['info'][2], ProbAnnoType, probannoObject['info'][1])
            ctx.log_err(message)
            raise WrongVersionError(message)
        genome = probannoObject["data"]["genome"]
        
        # Create a temporary directory for storing intermediate files when debug is turned on.
        if ctx.get_log_level() >= log.DEBUG2:
            workFolder = tempfile.mkdtemp("", "calculate-%s-" %(genome), self.config["work_folder_path"])
            ctx.log_debug('Intermediate files saved in '+workFolder)
        else:
            workFolder = None

        # When a template model is specified, use it to build dictionaries for roles,
        # complexes, and reactions instead of retrieving from static database files.
        complexesToRoles = None
        reactionsToComplexes = None
        if input["template_model"] is not None or input["template_workspace"] is not None:
            if not(input["template_model"] is not None and input["template_workspace"] is not None) :
                message = "Template model workspace is required if template model ID is provided"
                ctx.log_err(message)
                raise ValueError(message)

            # Create a dictionary to map a complex to a list of roles and a dictionary
            # to map a reaction to a list of complexes.  The dictionaries are specific to
            # the specified template model instead of covering everything in the central
            # data model.
            complexesToRoles = dict()
            reactionsToComplexes = dict()

            # Get the list of RoleComplexReactions for the template model from the
            # fba modeling service.  The RoleComplexReactions structure has a list
            # of ComplexReactions structures for the given role.  And each ComplexReactions
            # structure has a list of reactions for the given complex.
            fbaClient = fbaModelServices(self.config['fbamodeling_url'], token=ctx['token'])
            roleComplexReactionsList = fbaClient.role_to_reactions( { 'templateModel': input['template_model'], 'workspace': input['template_workspace'] } )

            # Build the two dictionaries from the returned list.
            for rcr in roleComplexReactionsList:
                for complex in rcr['complexes']:
                    complexId = re.sub(r'cpx0*(\d+)', r'kb|cpx.\1', complex['name']) # Convert ModelSEED format to KBase format
                    if complexId in complexesToRoles:
                        complexesToRoles[complexId].append(rcr['name'])
                    else:
                        complexesToRoles[complexId] = [ rcr['name'] ]
                    for reaction in complex['reactions']:
                        reactionId = reaction['reaction']
                        if reactionId in reactionsToComplexes:
                            reactionsToComplexes[reactionId].append(complexId)
                        else:
                            reactionsToComplexes[reactionId] = [ complexId ]

        # Calculate per-gene role probabilities.
        roleProbs = self._rolesetProbabilitiesToRoleProbabilities(ctx, input, genome, probannoObject["data"]["roleset_probabilities"], workFolder)

        # Calculate whole cell role probabilities.
        # Note - eventually workFolder will be replaced with a rolesToReactions call
        totalRoleProbs = self._totalRoleProbabilities(ctx, input, genome, roleProbs, workFolder)

        # Calculate complex probabilities.
        complexProbs = self._complexProbabilities(ctx, input, genome, totalRoleProbs, workFolder, complexesToRequiredRoles = complexesToRoles)

        # Calculate reaction probabilities.
        reactionProbs = self._reactionProbabilities(ctx, input, genome, complexProbs, workFolder, rxnsToComplexes = reactionsToComplexes)

        # If the reaction probabilities were not calculated using the data from the fba modeling service
        # via the template model, we need to convert from the KBase ID format to the ModelSEED format.
        if input["template_model"] is None:
            reactionList = list()
            for index in range(len(reactionProbs)):
                reactionList.append(reactionProbs[index][0])
            EntityAPI = CDMI_EntityAPI(self.config["cdmi_url"])
            numAttempts = 4
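            # Retry the CDMI lookup a few times: a complete response sets
            # numAttempts to 0 to leave the loop, while HTTPErrors are
            # swallowed and the call is retried until attempts run out.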
            while numAttempts > 0:
                try:
                    numAttempts -= 1
                    reactionData = EntityAPI.get_entity_Reaction( reactionList, [ "source_id" ] )
                    if len(reactionList) == len(reactionData):
                        numAttempts = 0
                except HTTPError as e:
                    pass
            for index in range(len(reactionProbs)):
                rxnId = reactionProbs[index][0]
                reactionProbs[index][0] = reactionData[rxnId]['source_id']
 
        # Create a reaction probability object
        objectData = dict()
        objectData["genome"] = probannoObject["data"]["genome"]
        objectData['genome_workspace'] = probannoObject['data']['genome_workspace']
        if input["template_model"] is None:
            objectData['template_model'] = 'None'
        else:
            objectData["template_model"] = input["template_model"]
        if input["template_workspace"] is None:
            objectData['template_workspace'] = 'None'
        else:
            objectData["template_workspace"] = input["template_workspace"]
        objectData["probanno"] = input['probanno']
        objectData['probanno_workspace'] = input['probanno_workspace']
        objectData["id"] = input["rxnprobs"]
        objectData["reaction_probabilities"] = reactionProbs

        objectMetaData = { "num_reaction_probs": len(objectData["reaction_probabilities"]) }
        objectProvData = dict()
        objectProvData['time'] = timestamp(0)
        objectProvData['service'] = os.environ['KB_SERVICE_NAME']
        objectProvData['service_ver'] = ServiceVersion
        objectProvData['method'] = 'calculate'
        objectProvData['method_params'] = input.items()
        objectProvData['input_ws_objects'] = [ '%s/%s/%d' %(probannoObject['info'][7], probannoObject['info'][1], probannoObject['info'][4]) ]
        objectSaveData = dict()
        objectSaveData['type'] = RxnProbsType
        objectSaveData['name'] = input["rxnprobs"]
        objectSaveData['data'] = objectData
        objectSaveData['meta'] = objectMetaData
        objectSaveData['provenance'] = [ objectProvData ]
        objectInfo = wsClient.save_objects( { 'workspace': input["rxnprobs_workspace"], 'objects': [ objectSaveData ] } )
        output = objectInfo[0]
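        # objectInfo[0] is the object_info tuple for the saved RxnProbs object;
        # as used elsewhere in this service, index 0 is the object id, 1 the
        # name, 4 the version, 6 the workspace id, and 7 the workspace name.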
        
        #END calculate

        # At some point might do deeper type checking...
        if not isinstance(output, list):
            raise ValueError('Method calculate return value ' +
                             'output is not type list as required.')
        # return the results
        return [output]
    def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams):
        """
        :param heatmapParams: instance of type "heatmapParams" -> structure:
           parameter "sample1" of String, parameter "sample2" of String,
           parameter "q_value_cutoff" of Double, parameter
           "log2_fold_change_cutoff" of Double, parameter "num_genes" of
           Long, parameter "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_expression_matrix_id1" of type "ws_expression_matrix_id" (@id
           ws KBaseFeatureValues.ExpressionMatrix), parameter
           "ws_expression_matrix_id2" of type "ws_expression_matrix_id" (@id
           ws KBaseFeatureValues.ExpressionMatrix), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        :returns: instance of type "ResultsToReport" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        # BEGIN create_interactive_heatmap_de_genes_old
        fparams = heatmapParams
        # Set up workspace client
        user_token = ctx["token"]
        workspace = fparams["workspace_name"]
        ws_client = Workspace(url=self.__WS_URL, token=user_token)
        system_params = {}
        system_params["token"] = user_token
        system_params["ws_url"] = self.__WS_URL
        system_params["logger"] = self.__LOGGER
        system_params["shock_url"] = self.__SHOCK_URL
        system_params["hs_url"] = self.__HS_URL
        system_params["scratch"] = self.__SCRATCH
        system_params["rscripts"] = self.__RSCRIPTS
        system_params["workspace"] = workspace

        # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{"name": fparams["ws_cuffdiff_id"], "workspace": fparams["workspace_name"]}])

        # Check that the workspace returned data before using it
        if len(s_res) == 0:
            self.__LOGGER.info("Workspace did not return any objects")
            raise ValueError("Workspace did not return any objects for " + fparams["ws_cuffdiff_id"])

        cuffdiff_dir = script_util2.extract_cuffdiff_data(
            self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token
        )
        if cuffdiff_dir is False:
            raise ValueError("Failed to extract cuffdiff data from the workspace object")
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        fparams["cuffdiff_dir"] = cuffdiff_dir
        fparams["infile"] = join(cuffdiff_dir, "gene_exp.diff")
        fparams["outfile"] = join(system_params["scratch"], "gene_exp.diff.filter")

        fparams["pairs"] = 1
        fparams["logModetmp"] = 2

        rparams = {}

        rparams["cuffdiff_dir"] = fparams["cuffdiff_dir"]
        rparams["outpng"] = join(system_params["scratch"], "heatmap.png")
        rparams["imageheight"] = 1600
        rparams["imagewidth"] = 800
        rparams["plotscript"] = join(system_params["rscripts"], "heatmapplotinteractive.R")
        rparams["include_replicates"] = 1
        rparams["pairs"] = fparams["pairs"]
        rparams["logMode"] = fparams["logModetmp"]
        rparams["removezeroes"] = 1
        rparams["outmatrix"] = join(system_params["scratch"], "outmatrix")
        reportObj = {}

        provenance = [{}]
        if "provenance" in ctx:
            provenance = ctx["provenance"]
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]["input_ws_objects"] = [workspace + "/" + fparams["ws_cuffdiff_id"]]

        report = ""
        if fparams["pairs"] != 0:

            try:
                filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
                self.__LOGGER.info("matrix is " + filtered_matrix)
                fparams["infile"] = join(system_params["scratch"], "gene_exp.diff.filter")
                fparams["outfile"] = join(system_params["scratch"], "gene_exp.diff.filter.genelist")
                genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)
                rparams["genelist"] = filtered_matrix
            except Exception:
                report += "There was an error in creating the expression matrix. "
                report += "No differentially expressed genes were found. "
                report += "Please change or double-check your filtering criteria."

                reportObj = {"objects_created": [], "text_message": report}

                reportName = "create_interactive_heatmap_de_genes_old_" + str(hex(uuid.getnode()))
                report_info = ws_client.save_objects(
                    {
                        "workspace": fparams["workspace_name"],
                        "objects": [
                            {
                                "type": "KBaseReport.Report",
                                "data": reportObj,
                                "name": reportName,
                                "meta": {},
                                "hidden": 1,  # important!  make sure the report is hidden
                                "provenance": provenance,
                            }
                        ],
                    }
                )[0]
                print ("saved Report: " + pformat(report_info))

                returnVal = {
                    "report_name": reportName,
                    "report_ref": str(report_info[6]) + "/" + str(report_info[0]) + "/" + str(report_info[4]),
                }

                return [returnVal]

        try:
            # Prepare output object.
            outjson = False

            roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams)

            # Run R script to run cummerbund json and update the cummerbund output json file
            # Prepare output object.
            outputobject = dict()

            # Prepare output plot list
            cummerbundplotset = []

            # List of plots to generate
            plotlist = [
                {
                    "roptstr": roptstr_basic_heatmap_rep,
                    "title": "Heatmap",
                    "description": "Heatmap",
                    "exp": fparams["ws_expression_matrix_id"],
                }
            ]
            fparams["cummerbundplotset"] = cummerbundplotset
            # Iterate through the plotlist and generate the images and json files.
            for plot in plotlist:
                fparams["title"] = plot["title"]
                fparams["description"] = plot["description"]

                status = script_util2.rplotanduploadinteractive(system_params, fparams, rparams, plot["roptstr"])
                if status is False:
                    self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"])
                else:
                    # On success, status holds the name of the generated json file.
                    self.__LOGGER.info(status)
                    outjson = status
                    with open("{0}/{1}".format(self.__SCRATCH, outjson), "r") as et2:
                        eo2 = json.load(et2)
                        genome_ref = s_res[0]["data"]["genome_id"]
                        eo2["type"] = "log2_level"
                        eo2["genome_ref"] = genome_ref
                        self.__LOGGER.info("Saving expression matrix " + plot["exp"] + " from " + outjson)
                        res = ws_client.save_objects(
                            {
                                "workspace": workspace,
                                "objects": [
                                    {"type": "KBaseFeatureValues.ExpressionMatrix", "data": eo2, "name": plot["exp"]}
                                ],
                            }
                        )

                        info = res[0]
                        self.__LOGGER("done uploading exp")
                        report = "Successfully created expression matrix"
                        reportObj = {
                            "objects_created": [
                                {
                                    "ref": str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]),
                                    "description": "Expression matrix",
                                }
                            ],
                            "text_message": report,
                        }

        except Exception:
            report += "There was an error in generating the expression matrix."
            reportObj = {"objects_created": [], "text_message": report}

        reportName = "create_interactive_heatmap_de_genes_" + str(hex(uuid.getnode()))
        report_info = ws_client.save_objects(
            {
                "workspace": fparams["workspace_name"],
                "objects": [
                    {
                        "type": "KBaseReport.Report",
                        "data": reportObj,
                        "name": reportName,
                        "meta": {},
                        "hidden": 1,  # important!  make sure the report is hidden
                        "provenance": provenance,
                    }
                ],
            }
        )[0]
        print ("saved Report: " + pformat(report_info))

        returnVal = {
            "report_name": reportName,
            "report_ref": str(report_info[6]) + "/" + str(report_info[0]) + "/" + str(report_info[4]),
        }

        # END create_interactive_heatmap_de_genes_old

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                "Method create_interactive_heatmap_de_genes_old return value "
                + "returnVal is not type dict as required."
            )
        # return the results
        return [returnVal]
Beispiel #50
0
from biokbase.workspace.client import Workspace
ws_client = Workspace()
ws_next_client = Workspace(url='https://next.kbase.us/services/ws')
a, b = ws_next_client.get_objects([{'objid' : '4', 'wsid' : '68'}, {'objid' : '5', 'wsid' : '68'}])[0:2]
# Each object to save needs a name (or objid) along with its type and data;
# reuse the original names from the object info tuples.
a_params = {'type': a['info'][2], 'data': a['data'], 'name': a['info'][1]}
b_params = {'type': b['info'][2], 'data': b['data'], 'name': b['info'][1]}
ws_client.save_objects({'id': 9145, 'objects': [a_params, b_params]})
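# Note: save_objects returns one object_info tuple per saved object; as in the
# examples above, a reference can be rebuilt from indices 6 (workspace id),
# 0 (object id), and 4 (version), e.g. '%s/%s/%s' % (info[6], info[0], info[4]).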

Beispiel #51
0
    def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
        """
        :param cummerbundstatParams: instance of type "cummerbundstatParams"
           -> structure: parameter "workspace" of String, parameter
           "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
           KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
           "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
           type "ws_diffstat_output" (Differential stat workspace id)
        :returns: instance of type "ws_cummerbund_output" (@id ws
           KBaseRNASeq.cummerbund_output)
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN generate_cummerbund_plot2
        params = cummerbundstatParams
        returnVal = params['ws_cummerbund_output']

        # Set up workspace client
        user_token = ctx['token']
        ws_client = Workspace(url=self.__WS_URL, token=user_token)

        # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
        s_res = ws_client.get_objects([{
            'name': params['ws_cuffdiff_id'],
            'workspace': params['workspace']
        }])
        print "Getting genome info"

        genome_ref = s_res[0]['data']['genome_id']
        # genome_ref = '2702/6/2'
        # genome_ref = '2702/26/1'
        # genome_ref = '2229/21/10'
        print genome_ref
        gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
        genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                      "included_fields": ["scientific_name"],
                                      "included_feature_fields": ["id", "function", "type"
                                                                  ]})["genomes"][0]["data"]
        genome_dict = {}
        features = genome['features']
        for feature in features:
            feature_id = feature['id']
            # Fall back to 'Unknown' when the function is missing or empty.
            function = feature.get('function') or 'Unknown'
            genome_dict[feature_id] = function
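        # genome_dict now maps feature ids to their annotated functions, e.g.
        # (hypothetical ids) {'kb|g.0.peg.123': 'Ornithine carbamoyltransferase',
        #  'kb|g.0.peg.124': 'Unknown'}.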


        cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER, self.__SHOCK_URL,
                                                          self.__SCRATCH, s_res, user_token)
        self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

        if (cuffdiff_dir is False):
            return returnVal

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []
        # List of plots to generate
        plotlist = [
            {'file': "dispersionplot.R",
             'title': "Dispersion plot",
             'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},

            {'file': "fpkmscvplot.R",
             'title': "Genes CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},

            {'file': "isoformscvplot.R",
             'title': "Isoform CV plot",
             'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},

            {'file': "densityplot.R",
             'title': "Density plot",
             'description': "The density plot shows the distribution of FPKM scores across samples"},

            {'file': "csdensityrepplot.R",
             'title': "Replicates density plot",
             'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},

            {'file': "boxplot.R",
             'title': "Box plots",
             'description': "The box plots show the FPKM distribution across samples."},

            {'file': "boxrepplot.R",
             'title': "Box plots of replicates",
             'description': "The box plots of replicates show the FPKM distribution across sample replicates."},

            {'file': "pairwisescatterplots.R",
             'title': "Pairwise scatter plots",
             'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},

            {'file': "volcanomatrixplot.R",
             'title': "Volcano matrix plots",
             'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},

            {'file': "pcaplot.R",
             'title': "PCA plot",
             'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."},

            {'file': "pcarepplot.R",
             'title': "PCA plot including replicates",
             'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."},

            {'file': "mdsplot.R",
             'title': "Multi-dimensional scaling plot",
             'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "},

            {'file': "mdsrepplot.R",
             'title': "Multi-dimensional scaling plot including replicates",
             'description': "Multi-dimensional scaling plot including replicates are  similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
        ]
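        # Each entry in plotlist names an R script under self.__RSCRIPTS;
        # rplotandupload below runs it against cuffdiff_dir and appends the
        # resulting plot (image plus json) to cummerbundplotset.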

        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                                                 plot['file'], self.__SHOCK_URL, self.__HS_URL,
                                                 user_token,
                                                 cummerbundplotset, plot['title'],
                                                 plot['description'], cuffdiff_dir)
            if status is False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

        # Populate the output object
        outputobject['cummerbundplotSet'] = cummerbundplotset

        # TODO: Need to figure out how to get rnaseq experiment id
        outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
        outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

        res = ws_client.save_objects({
            "workspace": params['workspace'],
            "objects": [{
                "type": "KBaseRNASeq.cummerbund_output",
                "data": outputobject,
                "name": params["ws_cummerbund_output"]}]
        })

        infile = join(cuffdiff_dir, "gene_exp.diff")
        outfile = join(cuffdiff_dir, "gene_exp_diff.out")
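        # v is assumed to be a volcano-plot helper module imported elsewhere in
        # this file; it parses gene_exp.diff into the stats json written to outfile.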
        x = v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
        with open(outfile) as f:
            statdata = json.load(f)
        res = ws_client.save_objects({
            "workspace": params['workspace'],
            "objects": [{
                "type": "KBaseRNASeq.DifferentialExpressionStat",
                "data": statdata,
                "name": params["ws_diffstat_output"]}]
        })

        #END generate_cummerbund_plot2

        # At some point might do deeper type checking...
        if not isinstance(returnVal, basestring):
            raise ValueError('Method generate_cummerbund_plot2 return value ' +
                             'returnVal is not type basestring as required.')
        # return the results
        return [returnVal]