def download_workspace_data(ws_url, source_ws, source_obj, working_dir, logger):
    """Download the assembly file referenced by a KBaseFile.AssemblyFile
    workspace object into working_dir.

    Args:
        ws_url: workspace service endpoint URL.
        source_ws: name of the workspace containing the object.
        source_obj: object name; also used as the local output file name.
        working_dir: directory that receives the downloaded file.
        logger: logger used to report unparseable Shock error bodies.

    Returns:
        Tuple (shock_url, shock_id, ref, source) where ref is the
        'wsid/objid/version' string and source is the object's optional
        'source' field (may be None).

    Raises:
        ValueError: if the object is not of type KBaseFile.AssemblyFile.
        Exception / requests.HTTPError: when the Shock download fails.
    """
    ws = Workspace(ws_url, token=TOKEN)
    objdata = ws.get_objects([{'ref': source_ws + '/' + source_obj}])[0]
    info = objdata['info']
    # info[2] is the full type string, e.g. 'KBaseFile.AssemblyFile-1.0'.
    if info[2].split('-')[0] != 'KBaseFile.AssemblyFile':
        raise ValueError(
            'This method only works on the KBaseFile.AssemblyFile type')
    shock_url = objdata['data']['assembly_file']['file']['url']
    shock_id = objdata['data']['assembly_file']['file']['id']
    ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
    source = objdata['data'].get('source')
    outfile = os.path.join(working_dir, source_obj)
    shock_node = shock_url + '/node/' + shock_id + '/?download'
    headers = {'Authorization': 'OAuth ' + TOKEN}
    response = requests.get(shock_node, stream=True, headers=headers)
    # Check the response BEFORE creating the output file so a failed
    # download does not leave an empty/partial file behind.
    if not response.ok:
        # Surface the Shock error message when the body is parseable JSON;
        # narrow except: only JSON-decode / missing-key failures fall back.
        try:
            err = json.loads(response.content)['error'][0]
        except (ValueError, KeyError, IndexError):
            logger.error("Couldn't parse response error content: " +
                         response.content)
            response.raise_for_status()
        raise Exception(str(err))
    # Binary mode: the assembly file is arbitrary bytes, not text.
    with open(outfile, 'wb') as f:
        for block in response.iter_content(1024):
            if not block:
                break
            f.write(block)
    return shock_url, shock_id, ref, source
def get_object_uid(name):
    """Return the UID ('wsid/objid/version') of the workspace object called
    *name* in the workspace named by the KB_WORKSPACE_ID environment
    variable."""
    from biokbase.workspace.client import Workspace
    WS_URL = 'https://ci.kbase.us/services/ws/'
    client = Workspace(WS_URL)
    identity = dict(workspace=os.environ['KB_WORKSPACE_ID'], name=name)
    obj_info = client.get_objects([identity])[0]['info']
    return '%s/%s/%s' % (obj_info[6], obj_info[0], obj_info[4])
def TophatCall(self, ctx, params):
    # ctx is the context object
    # return variables are: returnVal
    # BEGIN TophatCall
    # NOTE(review): this block appears truncated -- `user_token` and `reads`
    # are referenced but never assigned in the visible code, and the method
    # never returns; confirm against the full source file.
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    hs = HandleService(url=self.__HS_URL, token=user_token)
    try:
        ### Make a function to download the workspace object and prepare dict of genome ,lib_type
        self.__LOGGER.info("Downloading RNASeq Sample file")
        # Fetch the sample, reference genome, bowtie index and GTF annotation
        # objects from the caller's workspace in a single request.
        try:
            ret = ws_client.get_objects(
                [
                    {"name": params["sample_id"], "workspace": params["ws_id"]},
                    {"name": params["reference"], "workspace": params["ws_id"]},
                    {"name": params["bowtie_index"], "workspace": params["ws_id"]},
                    {"name": params["annotation_gtf"], "workspace": params["ws_id"]},
                ]
            )
        except Exception, e:
            raise KBaseRNASeqException("Error Downloading objects from the workspace ")
        # Download reads from the JSON object
        genome = params["reference"]
        if "data" in reads:
            # if 'metadata' in reads['data']:
            #     genome = reads['data']['metadata']['ref_genome']
            # Library layout (single- vs paired-end) selects which tophat
            # command line gets built later.
            if "singleend_sample" in reads["data"]:
                lib_type = "SingleEnd"
                # cmdstring =
            elif "pairedend_sample" in reads["data"]:
                lib_type = "PairedEnd"
def get_object_uid(name):
    """Look up *name* in the workspace given by KB_WORKSPACE_ID and return
    its reference string formatted as 'wsid/objid/version'."""
    from biokbase.workspace.client import Workspace
    service_url = "https://ci.kbase.us/services/ws/"
    ws = Workspace(service_url)
    lookup = dict(workspace=os.environ["KB_WORKSPACE_ID"], name=name)
    info = ws.get_objects([lookup])[0]["info"]
    return "%s/%s/%s" % (info[6], info[0], info[4])
def get_object_from_ref(ref):
    """Fetch the 'data' section of a workspace object given a reference of
    the form 'wsid/objid[/ver]'; the workspace searched comes from the
    KB_WORKSPACE_ID environment variable."""
    from biokbase.workspace.client import Workspace
    object_id = int(ref.split("/")[1])
    service_url = "https://ci.kbase.us/services/ws/"
    client = Workspace(service_url)
    identity = dict(workspace=os.environ["KB_WORKSPACE_ID"], objid=object_id)
    return client.get_objects([identity])[0]["data"]
def get_probanno(self, ctx, input):
    # ctx is the context object
    # return variables are: output
    #BEGIN get_probanno
    '''
    Convert a probabilistic annotation object into a human-readable table.

    @param ctx Current context object
    @param input Dictionary with input parameters for function
    @return Dictionary keyed by gene to a list of tuples with roleset and likelihood
    @raise WrongVersionError when ProbAnno object version number is invalid
    '''
    # Validate required arguments and fill in the default for the optional
    # version (None selects the latest saved version of the object).
    input = self._checkInputArguments(ctx, input,
                                      ['probanno', 'probanno_workspace'],
                                      { 'probanno_version': None }
                                      )
    wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])
    probAnnoObjectId = make_object_identity(input["probanno_workspace"], input["probanno"], input['probanno_version'])
    objectList = wsClient.get_objects( [ probAnnoObjectId ] )
    probAnnoObject = objectList[0]
    # Reject objects saved under a different typed-object version, whose
    # data layout may not match what we read below.
    if probAnnoObject['info'][2] != ProbAnnoType:
        message = 'ProbAnno object type %s is not %s for object %s' %(probAnnoObject['info'][2], ProbAnnoType, probAnnoObject['info'][1])
        ctx.log_err(message)
        raise WrongVersionError(message)
    output = probAnnoObject["data"]["roleset_probabilities"]
    #END get_probanno

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method get_probanno return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
def associateReads(self, ctx, params):
    # ctx is the context object
    # return variables are: returnVal
    # BEGIN associateReads
    user_token = ctx["token"]
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    out = dict()
    # Copy every scalar parameter into the metadata block, skipping the
    # structural keys handled separately below and any unset (None) values.
    out["metadata"] = {
        k: v
        for k, v in params.iteritems()
        if not k in ("ws_id", "analysis_id", "genome_id", "singleend_sample", "pairedend_sample") and v is not None
    }
    self.__LOGGER.info("Uploading RNASeqSample {0}".format(out["metadata"]["sample_id"]))
    # Resolve the optional genome / analysis names to workspace object refs.
    if "genome_id" in params and params["genome_id"] is not None:
        out["metadata"]["genome_id"] = script_util.get_obj_info(
            self.__LOGGER, self.__WS_URL, [params["genome_id"]], params["ws_id"], user_token
        )[0]
    if "analysis_id" in params and params["analysis_id"] is not None:
        g_ref = script_util.get_obj_info(
            self.__LOGGER, self.__WS_URL, [params["analysis_id"]], params["ws_id"], user_token
        )[0]
        out["analysis_id"] = g_ref
    # Inline the single-end library object's data, if one was named.
    if "singleend_sample" in params and params["singleend_sample"] is not None:
        try:
            s_res = ws_client.get_objects([{"name": params["singleend_sample"], "workspace": params["ws_id"]}])
            out["singleend_sample"] = s_res[0]["data"]
            print out["singleend_sample"]
            # NOTE(review): the block appears truncated here -- paired-end
            # handling, the save call and the return are not visible in
            # this chunk; confirm against the full source.
        except Exception, e:
            raise KBaseRNASeqException(
                "Error Downloading SingleEndlibrary object from the workspace {0},{1}".format(
                    params["singleend_sample"], e
                )
            )
def get_object_from_ref(ref):
    """Return the data portion of the workspace object addressed by *ref*
    ('wsid/objid[/ver]'), looked up inside the KB_WORKSPACE_ID workspace."""
    from biokbase.workspace.client import Workspace
    WS_URL = 'https://ci.kbase.us/services/ws/'
    obj_num = int(ref.split('/')[1])
    ws = Workspace(WS_URL)
    spec = dict(workspace=os.environ['KB_WORKSPACE_ID'], objid=obj_num)
    result = ws.get_objects([spec])
    return result[0]['data']
def test_annotate(self): ''' Run pa-annotate on a valid Genome object and verify that the job runs and returns a valid ProbAnno object in the expected time.''' # Run the annotate() function to generate a ProbAnno object. paClient = ProbabilisticAnnotation(self._config["probanno_url"], token=self._token) jobid = paClient.annotate({ "genome": self._config["genomeid"], "genome_workspace": self._config["test_ws"], "probanno": self._config["probannoid"], "probanno_workspace": self._config["test_ws"] }) # Allow time for the command to run. time.sleep(float(self._config["runtime"])) # Make sure the job has completed. ujsClient = UserAndJobState(self._config['ujs_url'], token=self._token) jobList = ujsClient.list_jobs([self._config['test_user']], 'CE') jobCompleted = False for job in jobList: if jobid == job[0]: jobCompleted = True jobInfo = job self.assertTrue( jobCompleted, 'Job did not complete before timeout of %s seconds' % (self._config['runtime'])) # See if the job ended in error. details = '' if jobInfo[11] == 1: details = ujsClient.get_detailed_error(jobInfo[0]) self.assertEqual(jobInfo[11], 0, 'Job ended in error: %s' % (details)) # Look for the ProbAnno object in the test workspace. wsClient = Workspace(self._config["workspace_url"], token=self._token) try: probannoObjectId = { 'workspace': self._config['test_ws'], 'name': self._config['probannoid'] } objectList = wsClient.get_objects([probannoObjectId]) probannoObject = objectList[0] self.assertEqual( probannoObject['info'][1], self._config['probannoid'], 'ProbAnno object id %s is not %s' % (probannoObject['info'][1], self._config['probannoid'])) except WorkspaceServerError as e: traceback.print_exc(file=sys.stderr) self.fail( msg= "The expected object %s did not get created in the workspace %s!\n" % (self._config["probannoid"], self._config["test_ws"]))
def fetch_narrative(nar_id, auth_token, url=ci_ws, file_name=None):
    """
    Fetches a Narrative object with the given reference id (of the form ##/##).
    If a file_name is given, then it is printed to that file.
    If the narrative is found, the jsonized string of it is returned.
    If nothing is found, an empty Dict is returned.
    """
    ws_client = Workspace(url=url, token=auth_token)
    nar_data = ws_client.get_objects([{'ref': nar_id}])
    if len(nar_data) > 0:
        nar_json = json.dumps(nar_data[0])
        if file_name is not None:
            # Context manager guarantees the handle is closed even if the
            # write fails (the original leaked the handle on error).
            with open(file_name, 'w') as f:
                f.write(nar_json)
        return nar_json
    return {}
def fetch_narrative(nar_id, auth_token, url=ci_ws, file_name=None):
    """
    Fetches a Narrative object with the given reference id (of the form ##/##).
    If a file_name is given, then it is printed to that file.
    If the narrative is found, the jsonized string of it is returned.
    If nothing is found, an empty Dict is returned.
    """
    client = Workspace(url=url, token=auth_token)
    results = client.get_objects([{"ref": nar_id}])
    # Guard clause: nothing found -> empty dict, matching the docstring.
    if len(results) == 0:
        return {}
    serialized = json.dumps(results[0])
    if file_name is not None:
        out = open(file_name, "w")
        out.write(serialized)
        out.close()
    return serialized
def test_annotate(self): ''' Run pa-annotate on a valid Genome object and verify that the job runs and returns a valid ProbAnno object in the expected time.''' # Run the annotate() function to generate a ProbAnno object. paClient = ProbabilisticAnnotation(self._config["probanno_url"], token=self._token) jobid = paClient.annotate( { "genome": self._config["genomeid"], "genome_workspace": self._config["test_ws"], "probanno": self._config["probannoid"], "probanno_workspace": self._config["test_ws"] } ) # Allow time for the command to run. time.sleep(float(self._config["runtime"])) # Make sure the job has completed. ujsClient = UserAndJobState(self._config['ujs_url'], token=self._token) jobList = ujsClient.list_jobs([ self._config['test_user'] ], 'CE') jobCompleted = False for job in jobList: if jobid == job[0]: jobCompleted = True jobInfo = job self.assertTrue(jobCompleted, 'Job did not complete before timeout of %s seconds' %(self._config['runtime'])) # See if the job ended in error. details = '' if jobInfo[11] == 1: details = ujsClient.get_detailed_error(jobInfo[0]) self.assertEqual(jobInfo[11], 0, 'Job ended in error: %s' %(details)) # Look for the ProbAnno object in the test workspace. wsClient = Workspace(self._config["workspace_url"], token=self._token) try: probannoObjectId = { 'workspace': self._config['test_ws'], 'name': self._config['probannoid'] } objectList = wsClient.get_objects( [ probannoObjectId ] ) probannoObject = objectList[0] self.assertEqual(probannoObject['info'][1], self._config['probannoid'], 'ProbAnno object id %s is not %s' %(probannoObject['info'][1], self._config['probannoid'])) except WorkspaceServerError as e: traceback.print_exc(file=sys.stderr) self.fail(msg = "The expected object %s did not get created in the workspace %s!\n" %(self._config["probannoid"], self._config["test_ws"]))
def test_calculate(self): ''' Run pa-calculate on a valid ProbAnno object and verify that the job runs and returns a valid RxnProbs object.''' # Run the calculate() function to generate a RxnProbs object. paClient = ProbabilisticAnnotation(self._config["probanno_url"], token=self._token) rxnprobsMetadata = paClient.calculate( { "probanno": self._config["probannoid"], "probanno_workspace": self._config["test_ws"], "rxnprobs": self._config["rxnprobsid"], "rxnprobs_workspace": self._config["test_ws"] } ) # Look for the RxnProbs object in the test workspace. wsClient = Workspace(self._config["workspace_url"], token=self._token) try: rxnprobsObjectId = { 'workspace': self._config['test_ws'], 'name': self._config['rxnprobsid'] } objectList = wsClient.get_objects( [ rxnprobsObjectId ] ) rxnprobsObject = objectList[0] self.assertEqual(rxnprobsObject['info'][1], self._config['rxnprobsid'], 'RxnProbs object id %s is not %s' %(rxnprobsObject['info'][1], self._config['rxnprobsid'])) except WorkspaceServerError as e: traceback.print_exc(file=sys.stderr) self.fail(msg = "The expected object %s did not get created in the workspace %s!\n" %(self._config["rxnprobsid"], self._config["test_ws"]))
def get_rxnprobs(self, ctx, input):
    # ctx is the context object
    # return variables are: output
    #BEGIN get_rxnprobs
    '''
    Convert a reaction probability object into a human-readable table.

    @param ctx Current context object
    @param input Dictionary with input parameters for function
    @return List of reaction_probability tuples
    @raise WrongVersionError when RxnProbs object version number is invalid
    '''
    # Sanity check on input arguments
    input = self._checkInputArguments(ctx, input,
                                      [ "rxnprobs", "rxnprobs_workspace" ],
                                      { 'rxnprobs_version': None, 'sort_field': 'rxnid' }
                                      )

    wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])
    rxnProbsObjectId = make_object_identity(input["rxnprobs_workspace"], input["rxnprobs"], input['rxnprobs_version'])
    objectList = wsClient.get_objects( [ rxnProbsObjectId ] )
    rxnProbsObject = objectList[0]
    # Reject objects saved under a different typed-object version whose
    # data layout may not match what we read below.
    if rxnProbsObject['info'][2] != RxnProbsType:
        message = 'RxnProbs object type %s is not %s for object %s' %(rxnProbsObject['info'][2], RxnProbsType, rxnProbsObject['info'][1])
        ctx.log_err(message)
        raise WrongVersionError(message)
    output = rxnProbsObject["data"]["reaction_probabilities"]
    # Sort in place by reaction id (tuple[0]) or by descending
    # probability (tuple[1]), per the caller's sort_field choice.
    if input['sort_field'] == 'rxnid':
        output.sort(key=lambda tup: tup[0])
    elif input['sort_field'] == 'probability':
        output.sort(key=lambda tup: tup[1], reverse=True)
    #END get_rxnprobs

    # At some point might do deeper type checking...
    if not isinstance(output, list):
        raise ValueError('Method get_rxnprobs return value ' +
                         'output is not type list as required.')
    # return the results
    return [output]
def test_calculate(self): ''' Run pa-calculate on a valid ProbAnno object and verify that the job runs and returns a valid RxnProbs object.''' # Run the calculate() function to generate a RxnProbs object. paClient = ProbabilisticAnnotation(self._config["probanno_url"], token=self._token) rxnprobsMetadata = paClient.calculate({ "probanno": self._config["probannoid"], "probanno_workspace": self._config["test_ws"], "rxnprobs": self._config["rxnprobsid"], "rxnprobs_workspace": self._config["test_ws"] }) # Look for the RxnProbs object in the test workspace. wsClient = Workspace(self._config["workspace_url"], token=self._token) try: rxnprobsObjectId = { 'workspace': self._config['test_ws'], 'name': self._config['rxnprobsid'] } objectList = wsClient.get_objects([rxnprobsObjectId]) rxnprobsObject = objectList[0] self.assertEqual( rxnprobsObject['info'][1], self._config['rxnprobsid'], 'RxnProbs object id %s is not %s' % (rxnprobsObject['info'][1], self._config['rxnprobsid'])) except WorkspaceServerError as e: traceback.print_exc(file=sys.stderr) self.fail( msg= "The expected object %s did not get created in the workspace %s!\n" % (self._config["rxnprobsid"], self._config["test_ws"]))
def diff_p_distribution(self, ctx, args):
    # ctx is the context object
    # return variables are: result
    #BEGIN diff_p_distribution
    # Create working directories; deliberately best-effort ("already
    # exists" is fine, any other failure surfaces later at first use).
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(self.FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']

    # Export the ExpressionMatrix object to a TSV file on disk.
    cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
                           '--workspace_name', param['workspace_name'],
                           '--object_name', param['object_name'],
                           '--working_directory', self.RAWEXPR_DIR,
                           '--output_file_name', self.EXPRESS_FN
                           ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    self.logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # force to use ANOVA if the number of sample is two
    if(ncol == 3): param['method'] = 'anova'

    # Synthetic sample file: a single row "0<TAB>1<TAB>...<TAB>ncol-2".
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    # '-j' makes coex_filter emit the p-value distribution as a
    # FloatDataTable JSON, which is what this method uploads.
    cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o',
                       "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), '-m', param['method'], '-n', '10',
                       '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                       '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y',
                       '-j', self.PVFDT_FN]
    # Optional overrides; note '-n 10' above is a default that a caller's
    # num_features appends a second '-n' after -- presumably the tool takes
    # the last occurrence; TODO confirm.
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))

    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    ## loading pvalue distribution FDT
    pvfdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]};
    pvfdt = OrderedDict(pvfdt)
    with open(self.PVFDT_FN, 'r') as myfile:
        pvfdt = json.load(myfile)
    data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
    pvfdt['id'] = data_obj_name

    # Figure metadata consumed by the narrative plotting widget.
    fig_properties = {"xlabel" : "-log2(p-value)", "ylabel" : "Number of features", "xlog_mode" : "-log2", "ylog_mode" : "none", "title" : "Histogram of P-values", "plot_type" : "histogram"}
    # Save the data table first so the figure object can reference it.
    sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                                   'data' : pvfdt,
                                                                                   'name' : data_obj_name}]})
    data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
    fig_properties['data_ref'] = data_ref
    sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                                   'data' : fig_properties,
                                                                                   'name' : (param['out_figure_object_name'])}]})
    result = fig_properties
    #END diff_p_distribution

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method diff_p_distribution return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def const_coex_net_clust(self, ctx, args):
    # ctx is the context object
    # return variables are: result
    #BEGIN const_coex_net_clust
    # Create working directories; deliberately best-effort ("already
    # exists" is fine, any other failure surfaces later at first use).
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(self.CLSTR_DIR)
    except:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']

    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token
    # Export the ExpressionMatrix object to a TSV file for the cluster tool.
    cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
                           '--workspace_name', param['workspace_name'],
                           '--object_name', param['object_name'],
                           '--working_directory', self.RAWEXPR_DIR,
                           '--output_file_name', self.EXPRESS_FN
                           ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)
        #raise Exception(stderr)

    self.logger.info("Coexpression clustering analysis")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # Synthetic sample file: a single row "0<TAB>1<TAB>...<TAB>ncol-2".
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                        '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                        '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), '-m',
                        "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN) ]

    # Optional tuning knobs forwarded verbatim as --<name> <value>.
    for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight']:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    #sys.exit(2)
    #TODO: No error handling in narrative so we do graceful termination
    #if 'p_value' in param and 'num_features' in param:
    #    self.logger.error("Both of p_value and num_features cannot be defined together");
    #    sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        # R prints "There were N warnings ..." on stderr even on success;
        # only treat other stderr output as fatal.
        if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
            self.logger.info(stderr)
        else:
            self.logger.error(stderr)
            raise Exception(stderr)

    # build index for gene list: gene id -> row position in the matrix
    pos_index ={expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids']))}

    # parse clustering results
    cid2genelist = {}
    cid2stat = {}
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN),'r') as glh:
        glh.readline() # skip header
        for line in glh:
            cluster, mcor, msec = line.rstrip().replace('"','').split("\t")
            cid2stat[cluster]= [mcor, msec]
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),'r') as glh:
        glh.readline() # skip header
        for line in glh:
            gene, cluster = line.rstrip().replace('"','').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if(len(cid2genelist) < 1) :
        self.logger.error("Clustering failed")
        return empty_results("Error: No cluster output", expr,self.__WS_URL, param, self.logger, ws)
        #sys.exit(4)

    self.logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
        # NOTE(review): "msec" is populated from cid2stat[cluster][0]
        # (the mean correlation) rather than [1]; this looks like an
        # off-by-one -- confirm intended behavior before changing.
        feature_clusters.append( {"meancor": float(cid2stat[cluster][0]), "msec": float(cid2stat[cluster][0]), "id_to_pos" : { gene : pos_index[gene] for gene in cid2genelist[cluster]}})

    ## Upload Clusters
    feature_clusters ={"original_data": "{0}/{1}".format(param['workspace_name'],param['object_name']),
                       "feature_clusters": feature_clusters}

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.FeatureClusters',
                                                                          'data' : feature_clusters,
                                                                          'name' : (param['out_object_name'])}]})
    result = {'workspace_name' : param['workspace_name'], 'out_object_name' : param['out_object_name']}
    #END const_coex_net_clust

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method const_coex_net_clust return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def run_filter_genes(workspace_service_url=None, param_file = None, level=logging.INFO, logger = None):
    """
    Narrative Job Wrapper script to execute coex_filter

    Args:
        workspace_service_url:  A url for the KBase Workspace service
        param_file: parameter file
        object_name: Name of the object in the workspace
        level: Logging level, defaults to logging.INFO.

    Returns:
        Output is written back in WS

    Authors:
        Shinjae Yoo
    """
    # Create working directories; deliberately best-effort ("already
    # exists" is fine, any other failure surfaces later at first use).
    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    # Export the ExpressionMatrix object to a TSV file on disk.
    cmd_dowload_cvt_tsv = [FVE_2_TSV, '--workspace_service_url', workspace_service_url,
                           '--workspace_name', param['workspace_name'],
                           '--object_name', param['object_name'],
                           '--working_directory', RAWEXPR_DIR,
                           '--output_file_name', EXPRESS_FN
                           ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # Synthetic sample file: a single row "0<TAB>1<TAB>...<TAB>ncol-2".
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
                       "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'],
                       '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN),
                       '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y']
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        # NOTE(review): appended without str(); Popen requires string
        # argv entries, so a numeric num_features would fail here --
        # confirm the upstream parameter type.
        cmd_coex_filter.append(param['num_features'])

    if 'num_features' not in param and 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(param['p_value'])

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined");
        sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #    logger.error("Both of p_value and num_features cannot be defined together");
    #    sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    # Rewrite the filtered file's first line with `fl`, the header read
    # from the unfiltered export above.
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(fl) # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']

    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [TSV_2_FVE, '--workspace_service_url', workspace_service_url,
                       '--object_name', param['out_expr_object_name'],
                       '--working_directory', FINAL_DIR,
                       '--input_directory', FLTRD_DIR,
                       '--output_file_name', FINAL_FN
                       ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))

        # obj_infos[1] is the genome object name, obj_infos[7] its workspace.
        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws, obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR,FINAL_FN),'r') as et:
        eo = json.load(et)

    # Splice the filtered data back into the original object so provenance
    # fields are preserved, then annotate the description with the command.
    if 'description' in expr:
        expr['description'] = "{0}, coex_filter by {1}".format(expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                          'data' : expr,
                                                                          'name' : (param['out_expr_object_name'])}]})

    ## Upload FeatureSet
    fs ={'description':'Differentially expressed genes generated by {0}'.format(" ".join(cmd_coex_filter)),
         'elements': {}}

    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN),'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    # Each selected gene maps to the genome ref when one is known.
    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                          'data' : fs,
                                                                          'name' : (param['out_fs_object_name'])}]})
def filter_BlastOutput(self, ctx, params):
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN filter_BlastOutput
    user_token=ctx['token']
    ws_client=Workspace(url=self.__WS_URL, token=user_token)
    blast_outputs=ws_client.get_objects([{'name':params['in_id'],
                                          'workspace': params['ws_id']}])

    # Build the FeatureSet description from whichever filters are active.
    fs ={'elements': {}}
    fs['description'] = "FeatureSet from BlastOutput by "
    printedEvalue = False
    printedEntries = False
    if 'evalue' in params and params['evalue'] != "":
        fs['description'] += " E-value:{0}".format(params['evalue'])
        printedEvalue = True
    if 'entries' in params and (params['entries'] != "" or params['entries'] > 0):
        if(printedEvalue): fs['description'] += ","
        fs['description'] += " # of entries :{0}".format(params['entries'])
        printedEntries = True
    if not printedEvalue and not printedEntries:
        fs['description'] += "no filtering"

    if len(blast_outputs) != 1:
        # NOTE(review): `param` is undefined in this branch (presumably
        # `params`, and the key names differ from this method's inputs),
        # so hitting it would raise NameError -- confirm and fix upstream.
        fs['description'] = "No such blast output object was found : {0}/{1}".format(param['workspace_name'], param['object_name'])
    else:
        fm = {}
        f2g = {}
        # Walk every BLAST iteration/hit; Hit_def is parsed as
        # "<feature_id>#<genome_id>#..." with '#' delimiters.
        for boid in blast_outputs[0]['data']['BlastOutput_iterations']['Iteration']:
            for hitd in boid['Iteration_hits']['Hit']:
                print hitd['Hit_def']
                ali = hitd['Hit_def'].find('#')
                # NOTE(review): bare `next` is a no-op expression in
                # Python 2, not `continue`; hits without '#' fall through
                # with ali == -1 -- confirm intended behavior.
                if(ali < 0): next
                fid = hitd['Hit_def'][0:ali]
                gri = hitd['Hit_def'].find('#', ali+1)
                if fid not in f2g: f2g[fid] = {}
                if (gri >= 0 and not gri == (ali+1)):
                    grid = hitd['Hit_def'][(ali+1):gri]
                    f2g[fid][grid] = 1
                # Keep the best (lowest) e-value seen for each feature id.
                for hspd in hitd['Hit_hsps']['Hsp']:
                    if fid in fm:
                        if float(hspd['Hsp_evalue']) < fm[fid]:
                            fm[fid] = float(hspd['Hsp_evalue'])
                    else:
                        fm[fid] = float(hspd['Hsp_evalue'])

        # Sort features by ascending e-value; cap the output at `entries`
        # and stop early once the e-value cutoff is exceeded.
        fms = sorted(fm.items(), key=lambda x: x[1], reverse=False)
        bol = len(fms)
        if params['entries'] != "" or int(params['entries']) > 0:
            if(int(params['entries']) < bol):
                bol = int(params['entries'])
        for i in range(bol):
            if(fms[i][1] > float(params['evalue'])): break
            if fms[i][0] in f2g:
                fs['elements'][fms[i][0]] = f2g[fms[i][0]].keys()
            else:
                fs['elements'][fms[i][0]] = []

    ws_client.save_objects(
        {"workspace":params['ws_id'],
         "objects": [{
             "type":"KBaseCollections.FeatureSet",
             "data":fs,
             "name":params['out_id']}
         ]})
    #pprint(fs)
    returnVal = {'obj_name' : params['out_id'], 'ws_id' : params['ws_id']}
    #END filter_BlastOutput

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method filter_BlastOutput return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def generate_cummerbund_plots(self, ctx, cummerbundParams):
    """Download a cuffdiff archive from Shock, run a fixed set of cummeRbund
    R plot scripts over it, and save the plot set as a
    KBaseRNASeq.cummerbund_output object in the workspace.

    :param ctx: KBase call context; ctx['token'] supplies the auth token.
    :param cummerbundParams: dict with keys 'workspace_name',
        'ws_cuffdiff_id' (input cuffdiff object name) and
        'ws_cummerbund_output' (name for the output object).
    :returns: [ws_cummerbund_output]
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plots
    params = cummerbundParams
    # The output object *name* is also the return value, so early exits
    # below can return it unchanged.
    returnVal = params['ws_cummerbund_output']

    #Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name' : params['ws_cuffdiff_id'],
        'workspace' : params['workspace_name']
    }])

    # Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    # Get input data Shock Id and Filename.
    cuffdiff_shock_id = s_res[0]['data']['file']['id']
    cuffdiff_file_name = s_res[0]['data']['file']['file_name']

    #cuffdiff_file_name =None
    # None means "size unknown" to the Shock download helper.
    filesize = None

    # Download tar file
    dx = script_util.download_file_from_shock(self.__LOGGER,
        self.__SHOCK_URL, cuffdiff_shock_id, cuffdiff_file_name,
        self.__SCRATCH, filesize, user_token)

    #Decompress tar file and keep it in a directory
    # NOTE(review): local `tarfile` shadows the stdlib tarfile module for
    # the rest of this method — presumably the helpers do the extraction,
    # but confirm nothing below needs the module.
    tarfile = join(self.__SCRATCH, cuffdiff_file_name)
    dstnExtractFolder = join(self.__SCRATCH, "cuffdiffData")
    if not os.path.exists(dstnExtractFolder):
        os.makedirs(dstnExtractFolder)

    untarStatus = script_util2.untar_files(self.__LOGGER, tarfile, dstnExtractFolder)
    if untarStatus == False:
        self.__LOGGER.info("Problem extracting the archive")
        return returnVal

    foldersinExtractFolder = os.listdir(dstnExtractFolder)

    if len(foldersinExtractFolder) == 0:
        self.__LOGGER.info("Problem extracting the archive")
        return returnVal

    # Run R script to run cummerbund json and update the cummerbund output json file
    # The archive is expected to contain a single top-level folder; the
    # first listing entry is taken as the cuffdiff data directory.
    cuffdiff_dir = join(dstnExtractFolder, foldersinExtractFolder[0])
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    # Prepare output object.
    outputobject=dict()

    # Prepare output plot list
    cummerbundplotset=[]

    # List of plots to generate
    plotlist = [
            { 'file': "dispersionplot.R",
              'title': "Dispersion plot",
              'description': "Dispersion plot" },
            { 'file': "pcaplot.R",
              'title': "PCA plot",
              'description': "PCA plot" },
            { 'file': "fpkmscvplot.R",
              'title': "FPKM SCV plot",
              'description': "FPKM SCV plot" }
    ]

    # Iterate through the plotlist and generate the images and json files.
    # Each successful run appends an entry to cummerbundplotset; failures
    # are logged and skipped rather than aborting the whole set.
    for plot in plotlist:
        status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
            plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token,
            cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir)
        if status == False:
            self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset
    #TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']
    res = ws_client.save_objects({
        "workspace":params['workspace_name'],
        "objects": [{
            "type":"KBaseRNASeq.cummerbund_output",
            "data":outputobject,
            "name":params["ws_cummerbund_output"]}]
        })
    #END generate_cummerbund_plots

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plots return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def create_interactive_heatmap_de_genes(self, ctx, interactiveHeatmapParams):
    """Filter cuffdiff differential-expression output, render an interactive
    heatmap via an R script, and save the filtered genes as a
    KBaseFeatureValues.ExpressionMatrix object.

    :param ctx: KBase call context; ctx['token'] supplies the auth token.
    :param interactiveHeatmapParams: dict with keys 'workspace_name',
        'ws_cuffdiff_id' (input cuffdiff object name) and
        'ws_expression_matrix_id' (name for the output matrix object).
    :returns: [ws_expression_matrix_id]
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_interactive_heatmap_de_genes
    fparams = interactiveHeatmapParams
    # FIX: returnVal was only assigned at the very end of the method, so
    # the early "workspace returned no objects" return below raised
    # UnboundLocalError; initialize it up front instead.
    returnVal = fparams['ws_expression_matrix_id']

    #Set up workspace client
    user_token = ctx['token']
    workspace = fparams['workspace_name']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Service endpoints and scratch locations bundled for the helpers.
    system_params = {}
    system_params['token'] = user_token
    system_params['ws_url'] = self.__WS_URL
    system_params['logger'] = self.__LOGGER
    system_params['shock_url'] = self.__SHOCK_URL
    system_params['hs_url'] = self.__HS_URL
    system_params['scratch'] = self.__SCRATCH
    system_params['rscripts'] = self.__RSCRIPTS
    system_params['workspace'] = workspace

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': fparams['ws_cuffdiff_id'],
        'workspace': fparams['workspace_name']
    }])

    #Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    #cuffdiff_dir = "/kb/module/work/nnc/cuffdiff"
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    #if (cuffdiff_dir is False):
    #    return returnVal

    # Filter the differential-expression matrix, then derive the gene list.
    fparams['cuffdiff_dir'] = cuffdiff_dir
    fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff")
    fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter")
    filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
    self.__LOGGER.info("matrix is " + filtered_matrix)

    fparams['infile'] = join(system_params['scratch'], "gene_exp.diff.filter")
    fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist")
    genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)

    # Prepare output object.
    outjson = False

    # Arguments for the heatmap R script.
    rparams = {}
    rparams['genelist'] = filtered_matrix
    rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
    rparams['outpng'] = join(system_params['scratch'], "heatmap.png")
    rparams['imageheight'] = 1600
    rparams['imagewidth'] = 800
    rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R")
    rparams['include_replicates'] = 1
    rparams['outmatrix'] = join(system_params['scratch'], "outmatrix")

    roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams)

    # Run R script to run cummerbund json and update the cummerbund output json file
    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [{
        'roptstr': roptstr_basic_heatmap_rep,
        'title': "Heatmap",
        'description': "Heatmap",
        'exp': fparams['ws_expression_matrix_id']
    }]
    fparams['cummerbundplotset'] = cummerbundplotset

    # Iterate through the plotlist and generate the images and json files.
    for plot in plotlist:
        fparams['title'] = plot['title']
        fparams['description'] = plot['description']
        status = script_util2.rplotanduploadinteractive(
            system_params, fparams, rparams, plot['roptstr'])
        if status == False:
            self.__LOGGER.info(
                "Problem generating image and json file - " + plot["roptstr"])
        else:
            # On success `status` is the name of the generated JSON file
            # in scratch; load it and save it as an ExpressionMatrix.
            self.__LOGGER.info(status)
            outjson = status
            with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2:
                eo2 = json.load(et2)
                genome_ref = s_res[0]['data']['genome_id']
                eo2['type'] = 'untransformed'
                #eo2['genome_ref'] = genome_ref
                self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp'])
                ws_client.save_objects({
                    'workspace': workspace,
                    'objects': [{
                        'type': 'KBaseFeatureValues.ExpressionMatrix',
                        'data': eo2,
                        'name': plot['exp']
                    }]
                })
    returnVal = fparams['ws_expression_matrix_id']
    #END create_interactive_heatmap_de_genes

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError(
            'Method create_interactive_heatmap_de_genes return value ' +
            'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
    """
    :param cummerbundstatParams: instance of type "cummerbundstatParams"
       -> structure: parameter "workspace" of String, parameter
       "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
       type "ws_diffstat_output" (Differential stat workspace id)
    :returns: instance of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plot2
    params = cummerbundstatParams
    returnVal = params['ws_cummerbund_output']

    #Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace']
    }])

    # FIX: the empty-result check originally happened AFTER
    # s_res[0]['data'] was dereferenced, so an empty workspace result
    # raised IndexError instead of returning cleanly; check first.
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    print("Getting genome info")
    genome_ref = s_res[0]['data']['genome_id']
    #genome_ref = '2702/6/2'
    #genome_ref = '2702/26/1'
    #genome_ref = '2229/21/10'
    print(genome_ref)

    # Pull feature ids and functions for annotating the volcano plot data.
    gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
    genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                  "included_fields": ["scientific_name"],
                                  "included_feature_fields": ["id", "function", "type"]
                                  })["genomes"][0]["data"]

    # Map feature id -> function; missing or empty functions become
    # 'Unknown'. (Replaces a bare except around the dict lookup and avoids
    # shadowing the builtin `id`.)
    genome_dict = {}
    for feature in genome['features']:
        function = feature.get('function') or 'Unknown'
        genome_dict[feature['id']] = function

    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    if (cuffdiff_dir is False):
        return returnVal

    # Run R script to run cummerbund json and update the cummerbund output json file
    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [
            { 'file': "dispersionplot.R",
              'title': "Dispersion plot",
              'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM." },

            { 'file': "fpkmscvplot.R",
              'title': "Genes CV plot",
              'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data." },

            { 'file': "isoformscvplot.R",
              'title': "Isoform CV plot",
              'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates." },

            { 'file': "densityplot.R",
              'title': "Density plot",
              'description': "The density plot shows the distribution of FPKM scores across samples" },

            { 'file': "csdensityrepplot.R",
              'title': "Replicates density plot",
              'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates" },

            { 'file': "boxplot.R",
              'title': "Box plots",
              'description': "The box plots show the FPKM distribution across samples." },

            { 'file': "boxrepplot.R",
              'title': "Box plots of replicates",
              'description': "The box plots of replicates show the FPKM distribution across sample replicates." },

            { 'file': "pairwisescatterplots.R",
              'title': "Pairwise scatter plots",
              'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line." },

            { 'file': "volcanomatrixplot.R",
              'title': "Volcano matrix plots",
              'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off." },

            { 'file': "pcaplot.R",
              'title': "PCA plot",
              'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions." },

            { 'file': "pcarepplot.R",
              'title': "PCA plot including replicates",
              'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates." },

            { 'file': "mdsplot.R",
              'title': "Multi-dimensional scaling plot",
              'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. " },

            { 'file': "mdsrepplot.R",
              'title': "Multi-dimensional scaling plot including replicates",
              'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions." }
    ]

    # Iterate through the plotlist and generate the images and json files.
    # Failures are logged and skipped so one bad plot does not abort the set.
    for plot in plotlist:
        status = script_util2.rplotandupload(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'],
            self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset,
            plot['title'], plot['description'], cuffdiff_dir)
        if status == False:
            self.__LOGGER.info(
                "Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset
    #TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']
    res = ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.cummerbund_output",
            "data": outputobject,
            "name": params["ws_cummerbund_output"]}]
        })

    # Parse gene_exp.diff into volcano-plot stat data and save it as a
    # DifferentialExpressionStat object.
    infile = join(cuffdiff_dir, "gene_exp.diff")
    outfile = join(cuffdiff_dir, "gene_exp_diff.out")
    x = v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
    with open(outfile) as f:
        statdata = json.load(f)
    res = ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.DifferentialExpressionStat",
            "data": statdata,
            "name": params["ws_diffstat_output"]}]
        })
    #END generate_cummerbund_plot2

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plot2 return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams):
    """
    :param heatmapParams: instance of type "heatmapParams" -> structure:
       parameter "workspace" of String, parameter "sample1" of String,
       parameter "sample2" of String, parameter "q_value_cutoff" of Double,
       parameter "log2_fold_change_cutoff" of Double, parameter "num_genes"
       of Long, parameter "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_expression_matrix_id" of type "ws_expression_matrix_id" (@id ws
       KBaseFeatureValues.ExpressionMatrix), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    :returns: instance of type "ResultsToReport" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_interactive_heatmap_de_genes_old
    fparams = heatmapParams
    #returnVal = "ttt"
    #Set up workspace client
    user_token = ctx['token']
    workspace = fparams['workspace']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Service endpoints and scratch locations bundled for the helpers.
    system_params = {}
    system_params['token'] = user_token
    system_params['ws_url'] = self.__WS_URL
    system_params['logger'] = self.__LOGGER
    system_params['shock_url'] = self.__SHOCK_URL
    system_params['hs_url'] = self.__HS_URL
    system_params['scratch'] = self.__SCRATCH
    system_params['rscripts'] = self.__RSCRIPTS
    system_params['workspace'] = workspace

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name' : fparams['ws_cuffdiff_id'],
        'workspace' : fparams['workspace']
        }])

    #Check if workspace has data
    # NOTE(review): returnVal is not yet assigned on this path, so this
    # early return would raise UnboundLocalError if it ever fires — same
    # latent bug as the sibling heatmap methods; needs an init at the top.
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = join (self.__SCRATCH , "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    #cuffdiff_dir = "/kb/module/work/cuffdiffData/cuffdiff"
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    #if (cuffdiff_dir is False):
    #    return returnVal

    # Filtering inputs for the expression-matrix step.
    fparams['cuffdiff_dir'] = cuffdiff_dir
    fparams['infile'] = join (cuffdiff_dir, "gene_exp.diff")
    fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter")
    fparams['pairs']=1
    fparams['logModetmp'] = 2

    # Arguments for the heatmap R script.
    rparams = {}
    rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
    rparams['outpng'] = join (system_params['scratch'], "heatmap.png")
    rparams['imageheight'] = 1600
    rparams['imagewidth'] = 800
    rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R")
    rparams['include_replicates'] = 1
    rparams['pairs'] = fparams ['pairs']
    rparams['logMode'] = fparams['logModetmp']
    rparams['removezeroes'] = 1
    rparams['outmatrix'] = join (system_params['scratch'], "outmatrix")

    reportObj = {}
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    provenance[0]['input_ws_objects']=[workspace+'/'+fparams['ws_cuffdiff_id']]

    report = ""
    if (fparams['pairs'] != 0):
        try:
            filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
            self.__LOGGER.info("matrix is " + filtered_matrix)
            fparams['infile'] = join (system_params['scratch'], "gene_exp.diff.filter")
            fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist")
            genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)
            rparams['genelist'] = filtered_matrix
        # NOTE(review): bare except — any failure in filtering (including
        # programming errors) is reported as "no genes found".
        except:
            report += "There was an error in creating expression matrix"
            report += "No differentially expressed genes were found"
            report += "Please change / double check your filtering criteria"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            reportName = 'create_interactive_heatmap_de_genes_old_'+str(hex(uuid.getnode()))
            report_info = ws_client.save_objects({
                'workspace':fparams['workspace'],
                'objects':[
                    {
                        'type':'KBaseReport.Report',
                        'data':reportObj,
                        'name':reportName,
                        'meta':{},
                        'hidden':1, # important! make sure the report is hidden
                        'provenance':provenance
                    }
                ]
                })[0]
            print('saved Report: '+pformat(report_info))
            # Report reference is "wsid/objid/version" from the object info.
            returnVal = { "report_name" : reportName,"report_ref" : str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]) }
            return [returnVal]
    try:
        # Prepare output object.
        outjson = False;

        roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic (rparams)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject=dict()

        # Prepare output plot list
        cummerbundplotset=[]

        # List of plots to generate
        plotlist = [
                { 'roptstr': roptstr_basic_heatmap_rep,
                  'title': "Heatmap",
                  'description': "Heatmap",
                  'exp' : fparams['ws_expression_matrix_id']
                 }
            ]
        fparams['cummerbundplotset'] = cummerbundplotset
        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            fparams['title'] = plot['title']
            fparams['description'] = plot['description']
            status = script_util2.rplotanduploadinteractive(system_params,fparams, rparams, plot['roptstr'])
            if status == False:
                # R script failed: save an error report and bail out.
                self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"])
                report = "Error: Please select a different cutoff criteria. None of the genes passed fold change and q-value-cutoff. "
                report += "Failed to create expression matrix with differentially expressed genes(" + fparams['ws_expression_matrix_id'] + "). No genes to show on heatmap."
                reportObj = {
                    'objects_created':[],
                    'text_message':report
                    }
                reportName = 'create_interactive_heatmap_de_genes_old_'+str(hex(uuid.getnode()))
                report_info = ws_client.save_objects({
                    'workspace':fparams['workspace'],
                    'objects':[
                        {
                            'type':'KBaseReport.Report',
                            'data':reportObj,
                            'name':reportName,
                            'meta':{},
                            'hidden':1, # important! make sure the report is hidden
                            'provenance':provenance
                        }
                    ]
                    })[0]
                print('saved Report: '+pformat(report_info))
                returnVal = { "report_name" : reportName,"report_ref" : str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]) }
                return [returnVal]
            else:
                # On success `status` is the generated JSON file name.
                self.__LOGGER.info(status)
                outjson = status
                self.__LOGGER.info('5')
                with open("{0}/{1}".format(self.__SCRATCH , outjson),'r') as et2:
                    eo2 = json.load(et2)
                    genome_ref = s_res[0]['data']['genome_id']
                    eo2['type']='log2_level'
                    eo2['genome_ref'] = genome_ref
                    self.__LOGGER.info('3')
                self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp'])
                try:
                    res = ws_client.save_objects({'workspace' : workspace,
                        'objects' : [{ 'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                       'data' : eo2,
                                       'name' : plot['exp']
                                    }]})
                except:
                    # NOTE(review): this calls the logger object itself
                    # (missing `.info`), which would raise TypeError if this
                    # handler is ever reached; looks like a leftover debug
                    # line — confirm and fix.
                    self.__LOGGER ("xxxx6")
    except:
        # NOTE(review): bare except; execution deliberately falls through
        # to the success report below even when this fires — confirm that
        # reporting success after a swallowed failure is intended.
        self.__LOGGER.info('6')

    report = "Successfully created expression matrix"
    reportObj = {
        'objects_created':[],
        'text_message':report
        }
    self.__LOGGER.info('7')

    reportName = 'create_interactive_heatmap_de_genes_old_'+str(hex(uuid.getnode()))
    report_info = ws_client.save_objects({
        'workspace':fparams['workspace'],
        'objects':[
            {
                'type':'KBaseReport.Report',
                'data':reportObj,
                'name':reportName,
                'meta':{},
                'hidden':1, # important! make sure the report is hidden
                'provenance':provenance
            }
        ]
        })[0]
    print('saved Report: '+pformat(report_info))
    returnVal = { "report_name" : reportName,"report_ref" : str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]) }
    #END create_interactive_heatmap_de_genes_old

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method create_interactive_heatmap_de_genes_old return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def create_interactive_heatmap_de_genes(self, ctx, interactiveHeatmapParams):
    """Filter cuffdiff differential-expression output, render an interactive
    heatmap via an R script, and save the filtered genes as a
    KBaseFeatureValues.ExpressionMatrix object.

    NOTE(review): an earlier method of the same name appears above in this
    file; if both live in the same class, this later definition wins —
    confirm the duplicate is intentional.

    :param ctx: KBase call context; ctx['token'] supplies the auth token.
    :param interactiveHeatmapParams: dict with keys 'workspace_name',
        'ws_cuffdiff_id' (input cuffdiff object name) and
        'ws_expression_matrix_id' (name for the output matrix object).
    :returns: [ws_expression_matrix_id]
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_interactive_heatmap_de_genes
    fparams = interactiveHeatmapParams
    # FIX: returnVal was only assigned at the very end of the method, so
    # the early "workspace returned no objects" return below raised
    # UnboundLocalError; initialize it up front instead.
    returnVal = fparams['ws_expression_matrix_id']

    #Set up workspace client
    user_token = ctx['token']
    workspace = fparams['workspace_name']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Service endpoints and scratch locations bundled for the helpers.
    system_params = {}
    system_params['token'] = user_token
    system_params['ws_url'] = self.__WS_URL
    system_params['logger'] = self.__LOGGER
    system_params['shock_url'] = self.__SHOCK_URL
    system_params['hs_url'] = self.__HS_URL
    system_params['scratch'] = self.__SCRATCH
    system_params['rscripts'] = self.__RSCRIPTS
    system_params['workspace'] = workspace

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': fparams['ws_cuffdiff_id'],
        'workspace': fparams['workspace_name']
    }])

    #Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    #cuffdiff_dir = "/kb/module/work/nnc/cuffdiff"
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    #if (cuffdiff_dir is False):
    #    return returnVal

    # Filter the differential-expression matrix, then derive the gene list.
    fparams['cuffdiff_dir'] = cuffdiff_dir
    fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff")
    fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter")
    filtered_matrix = script_util2.filter_expression_matrix(
        fparams, system_params)
    self.__LOGGER.info("matrix is " + filtered_matrix)

    fparams['infile'] = join(system_params['scratch'],
                             "gene_exp.diff.filter")
    fparams['outfile'] = join(system_params['scratch'],
                              "gene_exp.diff.filter.genelist")
    genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(
        fparams)

    # Prepare output object.
    outjson = False

    # Arguments for the heatmap R script.
    rparams = {}
    rparams['genelist'] = filtered_matrix
    rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
    rparams['outpng'] = join(system_params['scratch'], "heatmap.png")
    rparams['imageheight'] = 1600
    rparams['imagewidth'] = 800
    rparams['plotscript'] = join(system_params['rscripts'],
                                 "heatmapplotinteractive.R")
    rparams['include_replicates'] = 1
    rparams['outmatrix'] = join(system_params['scratch'], "outmatrix")

    roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(
        rparams)

    # Run R script to run cummerbund json and update the cummerbund output json file
    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [{
        'roptstr': roptstr_basic_heatmap_rep,
        'title': "Heatmap",
        'description': "Heatmap",
        'exp': fparams['ws_expression_matrix_id']
    }]
    fparams['cummerbundplotset'] = cummerbundplotset

    # Iterate through the plotlist and generate the images and json files.
    for plot in plotlist:
        fparams['title'] = plot['title']
        fparams['description'] = plot['description']
        status = script_util2.rplotanduploadinteractive(
            system_params, fparams, rparams, plot['roptstr'])
        if status == False:
            self.__LOGGER.info(
                "Problem generating image and json file - " + plot["roptstr"])
        else:
            # On success `status` is the name of the generated JSON file
            # in scratch; load it and save it as an ExpressionMatrix.
            self.__LOGGER.info(status)
            outjson = status
            with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2:
                eo2 = json.load(et2)
                genome_ref = s_res[0]['data']['genome_id']
                eo2['type'] = 'untransformed'
                #eo2['genome_ref'] = genome_ref
                self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp'])
                ws_client.save_objects({
                    'workspace': workspace,
                    'objects': [{
                        'type': 'KBaseFeatureValues.ExpressionMatrix',
                        'data': eo2,
                        'name': plot['exp']
                    }]
                })
    returnVal = fparams['ws_expression_matrix_id']
    #END create_interactive_heatmap_de_genes

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError(
            'Method create_interactive_heatmap_de_genes return value ' +
            'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def generate_cummerbund_plots(self, ctx, cummerbundParams):
    """Run the full set of cummeRbund QC/exploration R plots over a cuffdiff
    result and save them as a KBaseRNASeq.cummerbund_output object.

    NOTE(review): an earlier method of the same name appears above in this
    file (Shock-download variant); if both live in the same class, this
    later definition wins — confirm the duplicate is intentional.

    :param ctx: KBase call context; ctx['token'] supplies the auth token.
    :param cummerbundParams: dict with keys 'workspace_name',
        'ws_cuffdiff_id' (input cuffdiff object name) and
        'ws_cummerbund_output' (name for the output object).
    :returns: [ws_cummerbund_output]
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plots
    params = cummerbundParams
    # The output object *name* is also the return value, so early exits
    # below can return it unchanged.
    returnVal = params['ws_cummerbund_output']

    #Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace_name']
    }])

    # Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    # extract_cuffdiff_data returns the extracted folder path, or False on
    # failure (checked below after the log line).
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    if (cuffdiff_dir is False):
        return returnVal

    # Run R script to run cummerbund json and update the cummerbund output json file
    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [{
        'file': "dispersionplot.R",
        'title': "Dispersion plot",
        'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."
    }, {
        'file': "fpkmscvplot.R",
        'title': "Genes CV plot",
        'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."
    }, {
        'file': "isoformscvplot.R",
        'title': "Isoform CV plot",
        'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."
    }, {
        'file': "densityplot.R",
        'title': "Density plot",
        'description': "The density plot shows the distribution of FPKM scores across samples"
    }, {
        'file': "csdensityrepplot.R",
        'title': "Replicates density plot",
        'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"
    }, {
        'file': "boxplot.R",
        'title': "Box plots",
        'description': "The box plots show the FPKM distribution across samples."
    }, {
        'file': "boxrepplot.R",
        'title': "Box plots of replicates",
        'description': "The box plots of replicates show the FPKM distribution across sample replicates."
    }, {
        'file': "pairwisescatterplots.R",
        'title': "Pairwise scatter plots",
        'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."
    }, {
        'file': "volcanomatrixplot.R",
        'title': "Volcano matrix plots",
        'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."
    }, {
        'file': "pcaplot.R",
        'title': "PCA plot",
        'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."
    }, {
        'file': "pcarepplot.R",
        'title': "PCA plot including replicates",
        'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."
    }, {
        'file': "mdsplot.R",
        'title': "Multi-dimensional scaling plot",
        'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "
    }, {
        'file': "mdsrepplot.R",
        'title': "Multi-dimensional scaling plot including replicates",
        'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."
    }]

    #TODO.. Giving Rplot.pdf
    #        { 'file': "dendrogramplot.R",
    #          'title': "Dendrogram",
    #          'description': "Dendrogram based on the JS (Jensen-Shannon divergence) distance" },
    #
    #        { 'file': "dendrogramrepplot.R",
    #          'title': "Dendrogram including replicates",
    #          'description': "Dendrogram including replicates based on the JS (Jensen-Shannon divergence) distance" },

    # Iterate through the plotlist and generate the images and json files.
    # Failures are logged and skipped so one bad plot does not abort the set.
    for plot in plotlist:
        status = script_util2.rplotandupload(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'],
            self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset,
            plot['title'], plot['description'], cuffdiff_dir)
        if status == False:
            self.__LOGGER.info(
                "Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset
    #TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']
    res = ws_client.save_objects({
        "workspace": params['workspace_name'],
        "objects": [{
            "type": "KBaseRNASeq.cummerbund_output",
            "data": outputobject,
            "name": params["ws_cummerbund_output"]
        }]
    })
    #END generate_cummerbund_plots

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plots return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def create_expression_matrix(self, ctx, expressionMatrixParams):
    """Create a KBaseFeatureValues.ExpressionMatrix from a cuffdiff run.

    Downloads the cuffdiff workspace object named by
    ``expressionMatrixParams['ws_cuffdiff_id']``, extracts its data into
    scratch, runs an R script to build an FPKM gene matrix, and saves the
    result back to the workspace under
    ``expressionMatrixParams['ws_expression_matrix_id']``.

    :param ctx: context object; ``ctx['token']`` is the caller's auth token.
    :param expressionMatrixParams: dict with keys ``workspace_name``,
        ``ws_cuffdiff_id``, ``ws_expression_matrix_id`` and
        ``include_replicates`` (0 = per-sample matrix, otherwise replicates).
    :returns: ``[ws_expression_matrix_id]`` (the output object name), even on
        the early-exit failure paths below.

    NOTE(review): this file contains near-identical copies of this method;
    consider consolidating them.
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_expression_matrix
    params = expressionMatrixParams
    # returnVal is fixed up-front; failure paths still return this name.
    returnVal = params['ws_expression_matrix_id']
    #Set up workspace client authenticated as the calling user
    user_token = ctx['token']
    workspace = params['workspace_name']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace_name']
    }])
    # Check if workspace has data; bail out (silently, to the caller) if not.
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal
    # extract_cuffdiff_data unpacks the cuffdiff tarball into scratch and
    # returns the folder path, or False on failure.
    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    if (cuffdiff_dir is False):
        return returnVal

    # Run R script to get fpkmgenematrix.R
    # Prepare output object.
    outjson = False
    #outjson = "repfpkmgenematrix.R.matrix.txt.json";
    # Both branches invoke the same uploader; only the R script differs
    # (per-sample vs. per-replicate matrix).
    if params['include_replicates'] == 0:
        scriptfile = "fpkmgenematrix.R"
        outjson = script_util2.generate_and_upload_expression_matrix(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
            self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
            self.__WS_URL, workspace)
    else:
        scriptfile = "repfpkmgenematrix.R"
        outjson = script_util2.generate_and_upload_expression_matrix(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
            self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
            self.__WS_URL, workspace)
    if outjson is False:
        self.__LOGGER.info("Creation of expression matrix failed")
        return returnVal
    # The R step wrote a JSON expression-matrix file into scratch; load it.
    with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et:
        eo = json.load(et)
    eo['type'] = 'untransformed'
    # NOTE(review): genome_ref is read but never attached to the output
    # (assignment below is commented out); a sibling copy of this method
    # does set eo['genome_ref'] — confirm which behavior is intended.
    genome_ref = s_res[0]['data']['genome_id']
    #eo['genome_ref'] = genome_ref
    self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                       params['ws_expression_matrix_id'])
    # Persist the finished matrix to the workspace.
    ws_client.save_objects({
        'workspace': workspace,
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': eo,
            'name': params['ws_expression_matrix_id']
        }]
    })
    #END create_expression_matrix

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method create_expression_matrix return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def load_new_genome_data(self, ctx, params):
    """Load full genome data (all features plus proteins) for one genome.

    :param params: dict with key ``genome_ref`` — workspace reference of the
        genome to load.
    :returns: ``[genome_data]`` where genome_data is the raw Workspace genome
        object, minus its ``publications`` and ``feature_lookup`` fields,
        augmented with a ``features`` list; every feature that has a protein
        carries it under its ``protein`` key.
    :raises ValueError: if the assembled result is not a dict.
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN load_new_genome_data
    genome_ref = params['genome_ref']
    ga = GenomeAnnotationAPI(self.services, ctx['token'], genome_ref)

    # Gather every feature id across all feature types.
    feature_types = ga.get_feature_types()
    ids_by_type = ga.get_feature_ids({"type_list": feature_types})['by_type']
    feature_ids = []
    for ids in ids_by_type.values():
        feature_ids.extend(ids)

    feature_map = ga.get_features(feature_ids)
    protein_map = ga.get_proteins()

    # Attach each protein record to its parent feature where one exists.
    features = []
    proteins = []
    for fid in feature_map:
        feature = feature_map[fid]
        if fid in protein_map:
            prot = protein_map[fid]
            feature['protein'] = prot
            proteins.append(prot)
        features.append(feature)

    #genome_data = ga.get_summary()  # It returns None — summary may not be
    # prepared at the end of upload from Genbank. Workaround: load the genome
    # object directly from the Workspace (some summary-only fields are absent).
    ws = Workspace(url=self.workspaceURL)
    genome_data = ws.get_objects([{"ref": genome_ref}])[0]["data"]
    genome_data.pop('publications', None)
    genome_data.pop('feature_lookup', None)
    # Older genome objects store the organism name under display_sc_name only.
    if 'scientific_name' not in genome_data and 'display_sc_name' in genome_data:
        genome_data['scientific_name'] = genome_data['display_sc_name']
    genome_data['features'] = features
    returnVal = genome_data
    #END load_new_genome_data

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method load_new_genome_data return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def get_object(name):
    """Fetch the data payload of a named object from the user's workspace.

    Looks up *name* in the workspace named by the ``KB_WORKSPACE_ID``
    environment variable on the KBase CI workspace service and returns the
    object's ``data`` section.
    """
    WS_URL = 'https://ci.kbase.us/services/ws/'
    from biokbase.workspace.client import Workspace
    client = Workspace(WS_URL)
    identity = {'workspace': os.environ['KB_WORKSPACE_ID'], 'name': name}
    fetched = client.get_objects([identity])
    return fetched[0]['data']
def runAnnotate(self, job):
    ''' Run an annotate job to create a ProbAnno typed object.

        A ProbAnno typed object is created in four steps: (1) extract amino acid
        sequences from a Genome typed object to a fasta file, (2) run a BLAST search
        using the amino acid sequences against the subsystem BLAST database,
        (3) calculate annotation likelihood scores for each roleset implied by the
        functions of proteins in subsystems, and (4) save the likelihood scores
        to a ProbAnno typed object.

        The Job dictionary contains three main sections: (1) input parameters to
        the annotate() function, (2) context of server instance running the
        annotate() function, and (3) config variables of server.

        @param job Job dictionary created by server's annotate() function
        @return Nothing (although job is marked as complete)
    '''

    # The input parameters and user context for annotate() were stored in the job data for the job.
    # NOTE(review): 'input' shadows the builtin of the same name (harmless here
    # but worth renaming if this block is ever reworked).
    input = job["input"]
    if input['verbose']:
        self.logger.set_log_level(log.DEBUG)
    self.ctx = job["context"]
    self.config = job['config']

    # Create a DataParser object for working with the static database files.
    self.dataParser = DataParser(self.config)

    status = None

    try:
        # Make sure the database files are available.
        self.dataParser.checkIfDatabaseFilesExist()

        # Make sure the job directory exists.
        workFolder = make_job_directory(self.config['work_folder_path'], job['id'])

        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.ctx['token'])

        # Get the Genome object from the specified workspace.
        # The bare excepts around each update_job_progress() call are
        # deliberate best-effort: a failed progress update must not fail the job.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'getting genome object', 1, timestamp(3600))
        except:
            pass
        wsClient = Workspace(self.config["workspace_url"], token=self.ctx['token'])
        genomeObjectId = make_object_identity(input["genome_workspace"], input["genome"])
        objectList = wsClient.get_objects( [ genomeObjectId ] )
        genomeObject = objectList[0]

        # Convert Genome object to fasta file.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'converting Genome object to fasta file', 1, timestamp(3600))
        except:
            pass
        fastaFile = self._genomeToFasta(input, genomeObject, workFolder)

        # Run blast using the fasta file.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'running blast', 1, timestamp(3600))
        except:
            pass
        blastResultFile = self._runBlast(input, fastaFile, workFolder)

        # Calculate roleset probabilities.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'calculating roleset probabilities', 1, timestamp(300))
        except:
            pass
        rolestringTuples = self._rolesetProbabilitiesMarble(input, blastResultFile, workFolder)

        # Build ProbAnno object and store in the specified workspace.
        try:
            ujsClient.update_job_progress(job['id'], self.ctx['token'], 'building ProbAnno object', 1, timestamp(120))
        except:
            pass
        output = self._buildProbAnnoObject(input, genomeObject, blastResultFile, rolestringTuples, workFolder, wsClient)

        # Mark the job as done.
        status = "done"
        tb = None
        self._log(log.INFO, 'Job '+job['id']+' finished for genome '+input['genome']+' to probanno '+input['probanno'])

    except:
        # Any failure in the pipeline marks the job failed and records the traceback.
        tb = traceback.format_exc()
        sys.stderr.write('\n'+tb)
        status = "failed"
        self._log(log.ERR, 'Job '+job['id']+' failed for genome '+input['genome']+' to probanno '+input['probanno'])

    # Mark the job as complete with the given status.
    # NOTE(review): if the failure occurred before ujsClient was created, this
    # line raises NameError — confirm UserAndJobState construction cannot fail.
    ujsClient.complete_job(job['id'], self.ctx['token'], status, tb, { })

    # Remove the temporary work directory.
    if self.logger.get_log_level() < log.DEBUG2 and status == 'done':
        try:
            shutil.rmtree(workFolder)
        except OSError:
            # For some reason deleting the directory was failing in production. Rather than have all jobs look like they failed
            # I catch and log the exception here (since the user still gets the same result if the directory remains intact)
            msg = 'Unable to delete temporary directory %s\n' %(workFolder)
            sys.stderr.write('WARNING: '+msg)
            self._log(log.WARNING, msg)
    return
def run_filter_genes(workspace_service_url=None, param_file=None,
                     level=logging.INFO, logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter

    Args:
        workspace_service_url:  A url for the KBase Workspace service
        param_file: parameter file
        object_name: Name of the object in the workspace
        level: Logging level, defaults to logging.INFO.

    Returns:
        Output is written back in WS

    Authors:
        Shinjae Yoo
    """
    # Best-effort creation of the working directories; existing dirs are fine.
    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    # Step 1: export the ExpressionMatrix object to a TSV file in RAWEXPR_DIR.
    cmd_dowload_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    # NOTE(review): stderr from the converter is logged at info level only;
    # a failed conversion is not detected here — confirm this is intended.
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns from the exported TSV header line
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # Write a sample-index file: tab-separated column indices 0..ncol-2.
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s',
        "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x',
        "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y'
    ]
    # Exactly one of num_features / p_value must be supplied; num_features wins
    # when both are present (see the commented-out guard below).
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(param['num_features'])

    if 'num_features' not in param and 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(param['p_value'])

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #    logger.error("Both of p_value and num_features cannot be defined together");
    #    sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction: coex_filter mangles the header row, so rewrite the
    ## filtered file with the original header line (fl) captured above.
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(
            fl)  # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'],
        '--working_directory', FINAL_DIR, '--input_directory', FLTRD_DIR,
        '--output_file_name', FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new(
            {"objects": [{
                'ref': expr['genome_ref']
            }]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        # obj_infos is a workspace object-info tuple: [1] = object name,
        # [7] = workspace name.
        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws,
                                              obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    # Merge the uploader's output back into the original expression object and
    # save it under the new name.
    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' in expr:
        expr['description'] = "{0}, coex_filter by {1}".format(
            expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet of the differentially expressed genes.
    fs = {
        'description':
        'Differentially expressed genes generated by {0}'.format(
            " ".join(cmd_coex_filter)),
        'elements': {}
    }

    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    # Each gene maps to the genome it came from (empty list if unknown).
    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
def create_expression_matrix(self, ctx, expressionMatrixParams):
    """Build a KBaseFeatureValues.ExpressionMatrix from a cuffdiff object.

    expressionMatrixParams keys:
        workspace_name          -- workspace holding the cuffdiff object
        ws_cuffdiff_id          -- KBaseRNASeq.RNASeqCuffdiffdifferentialExpression name
        ws_expression_matrix_id -- name for the saved ExpressionMatrix
        include_replicates      -- false <= 0, true >= 1

    :returns: ``[ws_expression_matrix_id]``; the same name is returned on the
        early-exit failure paths as well.
    """
    # ctx is the context object
    # return variables are: returnVal
    # BEGIN create_expression_matrix
    params = expressionMatrixParams
    returnVal = params["ws_expression_matrix_id"]

    # Workspace client authenticated as the calling user.
    user_token = ctx["token"]
    workspace = params["workspace_name"]
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Fetch the cuffdiff object and unpack its tar archive into scratch.
    s_res = ws_client.get_objects([{"name": params["ws_cuffdiff_id"],
                                    "workspace": params["workspace_name"]}])
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    if cuffdiff_dir is False:
        return returnVal

    # Both paths run the same generate-and-upload step; only the R script
    # differs (per-sample vs. per-replicate FPKM matrix).
    if params["include_replicates"] == 0:
        scriptfile = "fpkmgenematrix.R"
    else:
        scriptfile = "repfpkmgenematrix.R"
    outjson = script_util2.generate_and_upload_expression_matrix(
        self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
        self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
        self.__WS_URL, workspace)
    if outjson is False:
        self.__LOGGER.info("Creation of expression matrix failed")
        return returnVal

    # Load the JSON matrix written into scratch, tag it, and link the genome.
    with open("{0}/{1}".format(self.__SCRATCH, outjson), "r") as handle:
        matrix_obj = json.load(handle)
    matrix_obj["type"] = "untransformed"
    genome_ref = s_res[0]["data"]["genome_id"]
    matrix_obj["genome_ref"] = genome_ref
    self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                       params["ws_expression_matrix_id"])

    # Persist the finished matrix to the workspace.
    ws_client.save_objects({
        "workspace": workspace,
        "objects": [{
            "type": "KBaseFeatureValues.ExpressionMatrix",
            "data": matrix_obj,
            "name": params["ws_expression_matrix_id"],
        }],
    })
    # END create_expression_matrix

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError(
            "Method create_expression_matrix return value " +
            "returnVal is not type basestring as required."
        )
    # return the results
    return [returnVal]
def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams):
    """
    :param heatmapParams: instance of type "heatmapParams" -> structure:
       parameter "workspace" of String, parameter "sample1" of String,
       parameter "sample2" of String, parameter "q_value_cutoff" of
       Double, parameter "log2_fold_change_cutoff" of Double, parameter
       "num_genes" of Long, parameter "ws_cuffdiff_id" of type
       "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_expression_matrix_id" of type "ws_expression_matrix_id" (@id
       ws KBaseFeatureValues.ExpressionMatrix), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    :returns: instance of type "ResultsToReport" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_interactive_heatmap_de_genes_old
    fparams = heatmapParams
    # NOTE(review): returnVal starts as the string "ttt" although the declared
    # return type is a dict; every actual return path reassigns it, but the
    # final isinstance(dict) check would fail if none did — confirm intended.
    returnVal = "ttt"
    # Set up workspace client
    user_token = ctx['token']
    workspace = fparams['workspace']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    # Bundle service endpoints / scratch paths for the script_util2 helpers.
    system_params = {}
    system_params['token'] = user_token
    system_params['ws_url'] = self.__WS_URL
    system_params['logger'] = self.__LOGGER
    system_params['shock_url'] = self.__SHOCK_URL
    system_params['hs_url'] = self.__HS_URL
    system_params['scratch'] = self.__SCRATCH
    system_params['rscripts'] = self.__RSCRIPTS
    system_params['workspace'] = workspace

    # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': fparams['ws_cuffdiff_id'],
        'workspace': fparams['workspace']
    }])

    # Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER,
                                                      self.__SHOCK_URL,
                                                      self.__SCRATCH, s_res,
                                                      user_token)
    # cuffdiff_dir = "/kb/module/work/cuffdiffData/cuffdiff"
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    # NOTE(review): unlike the sibling create_expression_matrix methods, the
    # cuffdiff_dir-is-False guard is commented out here — extraction failures
    # fall through to the filtering step.
    # if (cuffdiff_dir is False):
    #     return returnVal

    # Parameters for the gene_exp.diff filtering step.
    fparams['cuffdiff_dir'] = cuffdiff_dir
    fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff")
    fparams['outfile'] = join(system_params['scratch'],
                              "gene_exp.diff.filter")
    fparams['pairs'] = 1
    fparams['logModetmp'] = 2

    # Parameters handed to the heatmap R script.
    rparams = {}
    rparams['cuffdiff_dir'] = fparams['cuffdiff_dir']
    rparams['outpng'] = join(system_params['scratch'], "heatmap.png")
    rparams['imageheight'] = 1600
    rparams['imagewidth'] = 800
    rparams['plotscript'] = join(system_params['rscripts'],
                                 "heatmapplotinteractive.R")
    rparams['include_replicates'] = 1
    rparams['pairs'] = fparams['pairs']
    rparams['logMode'] = fparams['logModetmp']
    rparams['removezeroes'] = 1
    rparams['outmatrix'] = join(system_params['scratch'], "outmatrix")

    reportObj = {}
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data object reference
    provenance[0]['input_ws_objects'] = [workspace + '/' +
                                         fparams['ws_cuffdiff_id']]
    report = ""

    # Filter the expression matrix; fparams['pairs'] is hard-coded to 1 above,
    # so this branch always runs.
    if (fparams['pairs'] != 0):
        try:
            filtered_matrix = script_util2.filter_expression_matrix(fparams,
                                                                    system_params)
            self.__LOGGER.info("matrix is " + filtered_matrix)
            fparams['infile'] = join(system_params['scratch'],
                                     "gene_exp.diff.filter")
            fparams['outfile'] = join(system_params['scratch'],
                                      "gene_exp.diff.filter.genelist")
            genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)
            rparams['genelist'] = filtered_matrix
        except:
            # Any failure in filtering ends the method with an error report
            # object instead of raising to the caller.
            report += "There was an error in creating expression matrix"
            report += "No differentially expressed genes were found"
            report += "Please change / double check your filtering criteria"
            reportObj = {
                'objects_created': [],
                'text_message': report
            }
            reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
            report_info = ws_client.save_objects({
                'workspace': fparams['workspace'],
                'objects': [
                    {
                        'type': 'KBaseReport.Report',
                        'data': reportObj,
                        'name': reportName,
                        'meta': {},
                        'hidden': 1,  # important! make sure the report is hidden
                        'provenance': provenance
                    }
                ]})[0]
            print('saved Report: ' + pformat(report_info))
            returnVal = {"report_name": reportName,
                         "report_ref": str(report_info[6]) + '/' + str(
                             report_info[0]) + '/' + str(report_info[4])}
            return [returnVal]

    try:
        # Prepare output object.
        outjson = False;
        roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []

        # List of plots to generate
        plotlist = [
            {'roptstr': roptstr_basic_heatmap_rep,
             'title': "Heatmap",
             'description': "Heatmap",
             'exp': fparams['ws_expression_matrix_id']
             }
        ]
        fparams['cummerbundplotset'] = cummerbundplotset
        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            fparams['title'] = plot['title']
            fparams['description'] = plot['description']

            status = script_util2.rplotanduploadinteractive(system_params,
                                                            fparams, rparams,
                                                            plot['roptstr'])
            if status == False:
                # Plot generation failed: save an error report and return it
                # directly from inside the loop.
                self.__LOGGER.info(
                    "Problem generating image and json file - " +
                    plot["roptstr"])
                report = "Error: Please select a different cutoff criteria. None of the genes passed fold change and q-value-cutoff. "
                report += "Failed to create expression matrix with differentially expressed genes(" + \
                          fparams['ws_expression_matrix_id'] + "). No genes to show on heatmap."
                reportObj = {
                    'objects_created': [],
                    'text_message': report
                }
                reportName = 'create_interactive_heatmap_de_genes_old_' + str(
                    hex(uuid.getnode()))
                report_info = ws_client.save_objects({
                    'workspace': fparams['workspace'],
                    'objects': [
                        {
                            'type': 'KBaseReport.Report',
                            'data': reportObj,
                            'name': reportName,
                            'meta': {},
                            'hidden': 1,  # important! make sure the report is hidden
                            'provenance': provenance
                        }
                    ]})[0]
                print('saved Report: ' + pformat(report_info))
                returnVal = {"report_name": reportName,
                             "report_ref": str(report_info[6]) + '/' + str(
                                 report_info[0]) + '/' + str(report_info[4])}
                return [returnVal]
            else:
                # On success, status is the JSON matrix filename in scratch.
                self.__LOGGER.info(status)
                outjson = status
                self.__LOGGER.info('5')
                with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2:
                    eo2 = json.load(et2)
                genome_ref = s_res[0]['data']['genome_id']
                eo2['type'] = 'log2_level'
                eo2['genome_ref'] = genome_ref
                self.__LOGGER.info('3')
                self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                                   plot['exp'])
                try:
                    res = ws_client.save_objects({'workspace': workspace,
                                                  'objects': [{
                                                      'type': 'KBaseFeatureValues.ExpressionMatrix',
                                                      'data': eo2,
                                                      'name': plot['exp']
                                                  }]})
                except:
                    # NOTE(review): this calls the logger object itself, not
                    # .info() — it will raise TypeError, which is then
                    # swallowed by the outer bare except below. Likely meant
                    # self.__LOGGER.info("xxxx6").
                    self.__LOGGER("xxxx6")
    except:
        # NOTE(review): bare except hides all failures in the plotting block;
        # the method still reports success below.
        self.__LOGGER.info('6')

    # Save the success report and return its workspace reference.
    report = "Successfully created expression matrix"
    reportObj = {
        'objects_created': [],
        'text_message': report
    }

    self.__LOGGER.info('7')
    reportName = 'create_interactive_heatmap_de_genes_old_' + str(hex(uuid.getnode()))
    report_info = ws_client.save_objects({
        'workspace': fparams['workspace'],
        'objects': [
            {
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,  # important! make sure the report is hidden
                'provenance': provenance
            }
        ]})[0]
    print('saved Report: ' + pformat(report_info))
    returnVal = {"report_name": reportName,
                 "report_ref": str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(
                     report_info[4])}
    #END create_interactive_heatmap_de_genes_old

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method create_interactive_heatmap_de_genes_old return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def create_expression_matrix(self, ctx, expressionMatrixParams):
    """Generate a KBaseFeatureValues.ExpressionMatrix from a cuffdiff object.

    Pulls the cuffdiff object named by ``ws_cuffdiff_id`` out of
    ``workspace_name``, extracts its archive into scratch, runs the
    appropriate R matrix script (with or without replicates), and saves the
    loaded result under ``ws_expression_matrix_id``.

    :returns: ``[ws_expression_matrix_id]`` on every path, including early
        exits on failure.
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_expression_matrix
    params = expressionMatrixParams
    returnVal = params['ws_expression_matrix_id']

    # Authenticated workspace client for the calling user.
    user_token = ctx['token']
    workspace = params['workspace_name']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Pull the cuffdiff object; give up quietly if the workspace is empty.
    s_res = ws_client.get_objects([{'name': params['ws_cuffdiff_id'],
                                    'workspace': params['workspace_name']}])
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    # Unpack the cuffdiff archive into scratch (False signals failure).
    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    if cuffdiff_dir is False:
        return returnVal

    # Pick the R script by replicate mode; the uploader call is otherwise
    # identical for both modes.
    use_replicates = params['include_replicates'] != 0
    scriptfile = "repfpkmgenematrix.R" if use_replicates else "fpkmgenematrix.R"
    outjson = script_util2.generate_and_upload_expression_matrix(
        self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
        self.__SHOCK_URL, self.__HS_URL, user_token, cuffdiff_dir,
        self.__WS_URL, workspace)
    if outjson is False:
        self.__LOGGER.info("Creation of expression matrix failed")
        return returnVal

    # Load the generated JSON matrix and mark it as untransformed FPKM data.
    with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as matrix_file:
        eo = json.load(matrix_file)
    eo['type'] = 'untransformed'
    # genome_ref is looked up (and validates that genome_id exists) but is
    # deliberately not attached to the matrix in this variant.
    genome_ref = s_res[0]['data']['genome_id']
    self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                       params['ws_expression_matrix_id'])

    # Save the finished matrix back to the workspace.
    ws_client.save_objects({
        'workspace': workspace,
        'objects': [{'type': 'KBaseFeatureValues.ExpressionMatrix',
                     'data': eo,
                     'name': params['ws_expression_matrix_id']}]})
    #END create_expression_matrix

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method create_expression_matrix return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def run_coex_cluster(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """Narrative job wrapper that runs coexpression clustering (coex_cluster).

    Reads an ExpressionMatrix from the workspace, converts it to TSV,
    runs the external coex_cluster tool and saves the resulting
    KBaseFeatureValues.FeatureClusters object back to the workspace.

    Args:
        workspace_service_url: URL of the KBase Workspace service.
        param_file: path to a JSON parameter file; must contain
            'workspace_name', 'object_name' and 'out_object_name', plus
            any optional clustering parameters (see loop below).
        level: logging level, defaults to logging.INFO.
        logger: optional logger; a stderr logger is created if omitted.

    Returns:
        None on success; on clustering failure the value of
        empty_results(...) is returned. Output is written back to the WS.

    Authors:
        Shinjae Yoo
    """
    # The scratch directories may survive from an earlier invocation, so
    # tolerate "already exists" (and other creation) errors.
    for d in (RAWEXPR_DIR, CLSTR_DIR, FINAL_DIR):
        try:
            os.makedirs(d)
        except OSError:
            pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    # Fail fast (KeyError) if the auth token is missing; reused below.
    token = os.environ["KB_AUTH_TOKEN"]

    with open(param_file) as paramh:
        param = json.load(paramh)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=token)
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    cmd_download_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'],
        '--object_name', param['object_name'],
        '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding
    # the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)
        #raise Exception(stderr)

    logger.info("Coexpression clustering analysis")

    ## Prepare sample file
    # detect num of columns from the TSV header
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # One label per data column (first TSV column holds the gene ids).
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [
        COEX_CLUSTER, '-t', 'y',
        '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN),
        '-o', "{0}/{1}".format(CLSTR_DIR, CLSTR_FN)
    ]

    # Forward any recognized tuning parameters verbatim to the tool.
    for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower',
              'clust_method', 'minModuleSize', 'detectCutHeight']:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    #TODO: No error handling in narrative so we do graceful termination
    tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        # R emits a harmless warning summary on stderr; anything else is
        # treated as fatal.
        if re.search(
                r'^There were \d+ warnings \(use warnings\(\) to see them\)',
                stderr):
            logger.info(stderr)
        else:
            logger.error(stderr)
            raise Exception(stderr)

    # build index for gene list: row id -> position in the matrix
    pos_index = {
        expr['data']['row_ids'][i]: i
        for i in range(0, len(expr['data']['row_ids']))
    }

    # parse clustering results: cluster id -> list of member genes
    cid2genelist = {}
    with open("{0}/{1}".format(CLSTR_DIR, CLSTR_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            gene, cluster = line.replace('"', '').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if (len(cid2genelist) < 1):
        logger.error("Clustering failed")
        return empty_results("Error: No cluster output", expr,
                             workspace_service_url, param, logger, ws)

    logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
        feature_clusters.append({
            "id_to_pos":
            {gene: pos_index[gene] for gene in cid2genelist[cluster]}
        })

    ## Upload Clusters
    feature_clusters = {
        "original_data":
        "{0}/{1}".format(param['workspace_name'], param['object_name']),
        "feature_clusters": feature_clusters
    }

    ws.save_objects({
        'workspace': param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.FeatureClusters',
            'data': feature_clusters,
            'name': (param['out_object_name'])
        }]
    })
def generate_cummerbund_plots(self, ctx, cummerbundParams):
    """Generate the standard cummeRbund plot set for a cuffdiff result.

    Extracts the cuffdiff tarball referenced by the input object, runs a
    fixed list of R plotting scripts over it, uploads each plot, and
    saves the collection as a KBaseRNASeq.cummerbund_output object.

    :param ctx: call context; must carry the caller's auth token.
    :param cummerbundParams: dict with 'workspace_name',
        'ws_cuffdiff_id' (input) and 'ws_cummerbund_output' (output name).
    :returns: [name of the saved cummerbund output object]
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plots
    params = cummerbundParams
    returnVal = params['ws_cummerbund_output']

    #Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    #Read the input cuffdiff workspace object json file and get filehandle
    #for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace_name']
    }])

    # Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    # Extract the cuffdiff data into scratch; False signals failure.
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    if (cuffdiff_dir is False):
        return returnVal

    # Run R script to run cummerbund json and update the cummerbund
    # output json file.
    # Prepare output object and the list of generated plot descriptors.
    outputobject = dict()
    cummerbundplotset = []

    # List of plots to generate: R script file, display title, description.
    plotlist = [
        {'file': "dispersionplot.R",
         'title': "Dispersion plot",
         'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},

        {'file': "fpkmscvplot.R",
         'title': "Genes CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},

        {'file': "isoformscvplot.R",
         'title': "Isoform CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data. Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},

        {'file': "densityplot.R",
         'title': "Density plot",
         'description': "The density plot shows the distribution of FPKM scores across samples"},

        {'file': "csdensityrepplot.R",
         'title': "Replicates density plot",
         'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},

        {'file': "boxplot.R",
         'title': "Box plots",
         'description': "The box plots show the FPKM distribution across samples."},

        {'file': "boxrepplot.R",
         'title': "Box plots of replicates",
         'description': "The box plots of replicates show the FPKM distribution across sample replicates."},

        {'file': "pairwisescatterplots.R",
         'title': "Pairwise scatter plots",
         'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},

        {'file': "volcanomatrixplot.R",
         'title': "Volcano matrix plots",
         'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},

        {'file': "pcaplot.R",
         'title': "PCA plot",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring the relationship between sample conditions."},

        {'file': "pcarepplot.R",
         'title': "PCA plot including replicates",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring the relationship between sample conditions including replicates."},

        {'file': "mdsplot.R",
         'title': "Multi-dimensional scaling plot",
         'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset."},

        {'file': "mdsrepplot.R",
         'title': "Multi-dimensional scaling plot including replicates",
         'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
    ]

    #TODO.. Giving Rplot.pdf
    # { 'file': "dendrogramplot.R",
    #   'title': "Dendrogram",
    #   'description': "Dendrogram based on the JS (Jensen-Shannon divergence) distance" },
    #
    # { 'file': "dendrogramrepplot.R",
    #   'title': "Dendrogram including replicates",
    #   'description': "Dendrogram including replicates based on the JS (Jensen-Shannon divergence) distance" },

    # Iterate through the plotlist and generate the images and json files.
    # Failures are logged and skipped so one bad plot does not abort the run.
    for plot in plotlist:
        status = script_util2.rplotandupload(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'],
            self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset,
            plot['title'], plot['description'], cuffdiff_dir)
        if status is False:
            self.__LOGGER.info(
                "Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset
    #TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

    res = ws_client.save_objects({
        "workspace": params['workspace_name'],
        "objects": [{
            "type": "KBaseRNASeq.cummerbund_output",
            "data": outputobject,
            "name": params["ws_cummerbund_output"]
        }]
    })
    #END generate_cummerbund_plots

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plots return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def run_filter_genes(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """Narrative job wrapper that runs differential-expression filtering
    (coex_filter) on an ExpressionMatrix.

    Converts the matrix to TSV, runs the external coex_filter tool, then
    saves a filtered KBaseFeatureValues.ExpressionMatrix and a
    KBaseCollections.FeatureSet of the selected genes back to the WS.

    Args:
        workspace_service_url: URL of the KBase Workspace service.
        param_file: path to a JSON parameter file; must contain
            'workspace_name', 'object_name', 'method',
            'out_expr_object_name' and 'out_fs_object_name', plus
            'p_value' and/or 'num_features'.
        level: logging level, defaults to logging.INFO.
        logger: optional logger; a stderr logger is created if omitted.

    Returns:
        None on success; on a recoverable failure the value of
        empty_results(...) is returned. Output is written back to the WS.

    Authors:
        Shinjae Yoo
    """
    # The scratch directories may survive from an earlier invocation, so
    # tolerate "already exists" (and other creation) errors.
    for d in (RAWEXPR_DIR, FLTRD_DIR, FINAL_DIR):
        try:
            os.makedirs(d)
        except OSError:
            pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    # Fail fast (KeyError) if the auth token is missing; reused below.
    token = os.environ["KB_AUTH_TOKEN"]

    with open(param_file) as paramh:
        param = json.load(paramh)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=token)
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    cmd_download_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'],
        '--object_name', param['object_name'],
        '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding
    # the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns from the TSV header; fl is reused later to
    # restore the header of the filtered output.
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # force to use ANOVA if the number of sample is two
    if (ncol == 3):
        param['method'] = 'anova'

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER,
        '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN),
        '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN),
        '-m', param['method'],
        '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN),
        '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN),
        '-t', 'y'
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))
    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))
    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        return empty_results("One of p_value or num_features must be defined",
                             expr, workspace_service_url, param, logger, ws)
    #TODO: No error handling in narrative so we do graceful termination
    #if 'p_value' in param and 'num_features' in param:
    #    logger.error("Both of p_value and num_features cannot be defined together");
    #    sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction: coex_filter mangles the header line, so rewrite
    ## the output with the original header from the raw TSV.
    try:
        with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
            fe = ff.readlines()
        with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
            ff.write(fl)  # original first line with correct header info
            fe.pop(0)
            ff.writelines(fe)
    except IOError:
        # coex_filter produced no output file at all
        logger.error("Output was not found")
        return empty_results("Increase p_value or specify num_features",
                             expr, workspace_service_url, param, logger, ws)

    ## checking genelist
    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    if (len(gl) < 1):
        logger.error("No genes are selected")
        return empty_results("Increase p_value or specify num_features",
                             expr, workspace_service_url, param, logger, ws)

    ## Upload FVE
    # change workspace to be the referenced object's workspace_name because
    # it may not be in the same working ws due to referencing
    # Updates: change missing genome handling strategy by copying reference
    # to working workspace
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'],
        '--working_directory', FINAL_DIR,
        '--input_directory', FLTRD_DIR,
        '--output_file_name', FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        obj_infos = ws.get_object_info_new(
            {"objects": [{'ref': expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        #tmp_ws = "{0}".format(obj_infos[7])
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'],
                                              obj_infos[7], obj_infos[1]))
        if obj_infos[7] != param['workspace_name']:
            # the genome lives in another workspace; copy it into ours so
            # the uploader can reference it by name
            try:
                logger.info(
                    "trying to copy the referenced genome object : {0}".format(
                        expr['genome_ref']))
                ws.copy_object({
                    'from': {'ref': expr['genome_ref']},
                    'to': {'workspace': param['workspace_name'],
                           'name': obj_infos[1]}
                })
                # add genome_object_name only after successful copy
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
            except Exception:
                # no permission or any issues... then, give up providing
                # genome reference
                logger.info("".join(traceback.format_exc()))
        else:
            # it is local... we can simply add reference without copying
            # genome
            cmd_upload_expr.append('--genome_object_name')
            cmd_upload_expr.append(obj_infos[1])

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    logger.info(" ".join(cmd_upload_expr))

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' not in expr:
        expr['description'] = "Filtered Expression Matrix"
    expr['description'] += " : Filtered by '{0}' method ".format(
        param['method'])

    if 'feature_mapping' in expr and 'feature_mapping' in eo:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace': param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet
    fs = {'elements': {}}
    fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(
        param['method'])
    fs['description'] += "from {0}/{1}".format(param['workspace_name'],
                                               param['object_name'])

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace': param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
def const_coex_net_clust(self, ctx, args):
    """Construct coexpression network clusters from an ExpressionMatrix.

    Converts the matrix to TSV, runs the external coex_cluster tool and
    saves the resulting KBaseFeatureValues.FeatureClusters object back
    to the workspace.

    :param ctx: call context; must carry the caller's auth token.
    :param args: dict with 'workspace_name', 'object_name',
        'out_object_name' and optional clustering parameters.
    :returns: [dict with 'workspace_name' and 'out_object_name']
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN const_coex_net_clust
    # The scratch directories may survive from an earlier invocation, so
    # tolerate "already exists" (and other creation) errors.
    for d in (self.RAWEXPR_DIR, self.CLSTR_DIR, self.FINAL_DIR):
        try:
            os.makedirs(d)
        except OSError:
            pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    # Propagate the caller's token to the converter via the environment.
    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token
    cmd_download_cvt_tsv = [
        self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL,
        '--workspace_name', param['workspace_name'],
        '--object_name', param['object_name'],
        '--working_directory', self.RAWEXPR_DIR,
        '--output_file_name', self.EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding
    # the KBase token in the environment
    # -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True,
                                    env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)
        #raise Exception(stderr)

    self.logger.info("Coexpression clustering analysis")

    ## Prepare sample file
    # detect num of columns from the TSV header
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
              'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # One label per data column (first TSV column holds the gene ids).
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
              'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [
        self.COEX_CLUSTER, '-t', 'y',
        '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
        '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN)
    ]

    # Forward any recognized tuning parameters verbatim to the tool.
    for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower',
              'clust_method', 'minModuleSize', 'detectCutHeight']:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    #TODO: No error handling in narrative so we do graceful termination
    tool_process = subprocess.Popen(cmd_coex_cluster,
                                    stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        # R emits a harmless warning summary on stderr; anything else is
        # treated as fatal.
        if re.search(
                r'^There were \d+ warnings \(use warnings\(\) to see them\)',
                stderr):
            self.logger.info(stderr)
        else:
            self.logger.error(stderr)
            raise Exception(stderr)

    # build index for gene list: row id -> position in the matrix
    pos_index = {
        expr['data']['row_ids'][i]: i
        for i in range(0, len(expr['data']['row_ids']))
    }

    # parse clustering results: cluster id -> list of member genes
    cid2genelist = {}
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            gene, cluster = line.replace('"', '').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if (len(cid2genelist) < 1):
        self.logger.error("Clustering failed")
        return empty_results("Error: No cluster output", expr,
                             self.__WS_URL, param, self.logger, ws)

    self.logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
        feature_clusters.append({
            "id_to_pos":
            {gene: pos_index[gene] for gene in cid2genelist[cluster]}
        })

    ## Upload Clusters
    feature_clusters = {
        "original_data":
        "{0}/{1}".format(param['workspace_name'], param['object_name']),
        "feature_clusters": feature_clusters
    }

    ws.save_objects({
        'workspace': param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.FeatureClusters',
            'data': feature_clusters,
            'name': (param['out_object_name'])
        }]
    })
    result = {
        'workspace_name': param['workspace_name'],
        'out_object_name': param['out_object_name']
    }
    #END const_coex_net_clust

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method const_coex_net_clust return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def net_clust (args) : ### # download ws object and convert them to csv wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN')) lseries = wsd.get_object({'id' : args.inobj_id, 'type' : 'KBaseExpression.ExpressionSeries', 'workspace' : args.ws_id})['data'] if lseries is None: raise COEXException("Object {} not found in workspace {}".format(args.inobj_id, args.ws_id)) samples, sids, genome_id = {}, [], "" # assume only one genome id for gid in sorted(lseries['genome_expression_sample_ids_map'].keys()): genome_id = gid for samid in lseries['genome_expression_sample_ids_map'][gid]: sids.append({'ref': samid}) samples = wsd.get_objects(sids) break cif = open(args.exp_fn, 'w') header = ",".join([s['data']['source_id'] for s in samples]) cif.write(header + "\n") gids = samples[0]['data']['expression_levels'].keys() # each sample has same gids for gid in sorted(gids): line = gid + "," line += ",".join([str(s['data']['expression_levels'][gid]) for s in samples]) cif.write(line + "\n") cif.close() ### # generate network and cluster net_cmd_lst = ['coex_net', '-i', args.exp_fn] if (args.nmethod is not None): net_cmd_lst.append("-m") net_cmd_lst.append(args.nmethod) if (args.cut_off is not None): net_cmd_lst.append("-c") net_cmd_lst.append(args.cut_off) if (args.net_fn is not None): net_cmd_lst.append("-o") net_cmd_lst.append(args.net_fn) p1 = Popen(net_cmd_lst, stdout=PIPE) out_str = p1.communicate() if out_str[0] is not None : print out_str[0] if out_str[1] is not None : print >> sys.stderr, out_str[1] net_cmd = " ".join(net_cmd_lst) clust_cmd_lst = ['coex_cluster2', '-i', args.exp_fn] if (args.cmethod is not None): clust_cmd_lst.append("-c") clust_cmd_lst.append(args.cmethod) if (args.nmethod is not None): clust_cmd_lst.append("-n") clust_cmd_lst.append(args.nmethod) if (args.k is not None): clust_cmd_lst.append("-s") clust_cmd_lst.append(args.k) if (args.clust_fn is not None): clust_cmd_lst.append("-o") clust_cmd_lst.append(args.clust_fn) p1 = 
Popen(clust_cmd_lst, stdout=PIPE) out_str = p1.communicate() if out_str[0] is not None : print out_str[0] if out_str[1] is not None : print >> sys.stderr, out_str[1] clust_cmd = " ".join(clust_cmd_lst) ### # Create network object #generate Networks datasets net_ds_id = args.inobj_id + ".net" clt_ds_id = args.inobj_id + ".clt" datasets = [ { 'network_type' : 'FUNCTIONAL_ASSOCIATION', 'taxons' : [ genome_id ], 'source_ref' : 'WORKSPACE', 'name' : net_ds_id, 'id' : clt_ds_id, 'description' : "Coexpression network object of " + args.inobj_id, 'properties' : { 'original_data_type' : 'workspace', 'original_ws_id' : args.ws_id, 'original_obj_id' : args.inobj_id, 'coex_net_cmd' : net_cmd } }, { 'network_type' : 'FUNCTIONAL_ASSOCIATION', 'taxons' : [ genome_id ], 'source_ref' : 'WORKSPACE', 'name' : clt_ds_id, 'id' : clt_ds_id, 'description' : "Coexpression cluster object of " + args.inobj_id, 'properties' : { 'original_data_type' : 'workspace', 'original_ws_id' : args.ws_id, 'original_obj_id' : args.inobj_id, 'coex_clust_cmd' : clust_cmd } } ] # process coex network file nc = Node() cnf = open(args.net_fn,'r'); cnf.readline(); # skip header for line in cnf : line.strip(); line = line.replace('"','') values = line.split(',') if values[0] != values[1] : nc.add_edge(float(values[2]), net_ds_id, values[0], 'GENE', values[1], 'GENE', 0.0) #we add edges meaningful # process coex cluster file cnf = open(args.clust_fn,'r') cnf.readline(); # skip header for line in cnf : line = line.strip(); line = line.replace('"','') values = line.split(',') nc.add_edge(1.0, clt_ds_id, values[0], 'GENE', "cluster." 
+ values[1], 'CLUSTER', 0.0) # generate Networks object net_object = { 'datasets' : datasets, 'nodes' : nc.nodes, 'edges' : nc.edges, 'user_annotations' : {}, 'name' : 'Coexpression Network', 'id' : args.outobj_id, 'properties' : { 'graphType' : 'edu.uci.ics.jung.graph.SparseMultigraph' } } # Store results object into workspace wsd.save_objects({'workspace' : args.ws_id, 'objects' : [{'type' : 'KBaseNetworks.Network', 'data' : net_object, 'name' : args.outobj_id, 'meta' : {'org_obj_id' : args.inobj_id, 'org_ws_id' : args.ws_id}}]}) if(args.del_tmps is "true") : os.remove(args.exp_fn) os.remove(args.net_fn) os.remove(args.clust_fn)
def diff_p_distribution(self, ctx, args):
    """Compute the p-value distribution of differential expression.

    Runs coex_filter over an ExpressionMatrix, loads the p-value
    histogram it emits, saves it as a MAK.FloatDataTable plus a
    CoExpression.FigureProperties object describing the figure.

    :param ctx: call context; must carry the caller's auth token.
    :param args: dict with 'workspace_name' (may be a Template using
        $user_id), 'object_name', 'method' and 'out_figure_object_name';
        'num_features' / 'p_value' are optional.
    :returns: [figure-properties dict]
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN diff_p_distribution
    # Scratch sub-directories may already exist; creation failures are
    # deliberately ignored (matching the other methods of this service).
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(self.FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    # Child processes find the token through the environment.
    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token

    param = args

    # Resolve the caller's user id so templated workspace names such as
    # "$user_id:home" can be expanded.
    auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
    user_id = auth_client.get_user(token)
    workspace_name_t = Template(param['workspace_name'])
    workspace_name = workspace_name_t.substitute(user_id=user_id)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{
        'workspace': workspace_name,
        'name': param['object_name']
    }])[0]['data']

    self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)

    self.logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    ncol = len(expr['data']['col_ids'])

    # force to use ANOVA if the number of sample is two
    if (ncol == 3):
        param['method'] = 'anova'

    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
              'wt') as sf:
        sf.write("0")
        for col in range(1, ncol):
            sf.write("\t{0}".format(col))
        sf.write("\n")

    ## Run coex_filter (with '-j' so it also writes the p-value
    ## distribution as a FloatDataTable JSON file).
    cmd_coex_filter = [
        self.COEX_FILTER,
        '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
        '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
        '-m', param['method'], '-n', '10',
        '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
        '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),
        '-t', 'y',
        '-j', self.PVFDT_FN
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))
    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    filter_proc = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = filter_proc.communicate()
    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    ## loading pvalue distribution FDT
    pvfdt = {'row_labels': [], 'column_labels': [], "data": [[]]}
    pvfdt = OrderedDict(pvfdt)
    with open(self.PVFDT_FN, 'r') as myfile:
        pvfdt = json.load(myfile)
    data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
    pvfdt['id'] = data_obj_name

    fig_properties = {
        "xlabel": "-log2(p-value)",
        "ylabel": "Number of features",
        "xlog_mode": "-log2",
        "ylog_mode": "none",
        "title": "Histogram of P-values",
        "plot_type": "histogram"
    }

    # Save the histogram data table first, then reference it from the
    # figure-properties object via its numeric ws/obj/version ref.
    sstatus = ws.save_objects({
        'workspace': workspace_name,
        'objects': [{
            'type': 'MAK.FloatDataTable',
            'data': pvfdt,
            'name': data_obj_name
        }]
    })
    data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0],
                                    sstatus[0][4])
    fig_properties['data_ref'] = data_ref

    sstatus = ws.save_objects({
        'workspace': workspace_name,
        'objects': [{
            'type': 'CoExpression.FigureProperties',
            'data': fig_properties,
            'name': (param['out_figure_object_name'])
        }]
    })
    result = fig_properties
    #END diff_p_distribution

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method diff_p_distribution return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def filter_genes(self, ctx, args):
    """Filter an expression matrix down to differentially expressed genes.

    Converts the named KBaseFeatureValues.ExpressionMatrix to TSV with the
    external FVE-to-TSV tool, runs coex_filter, then uploads the filtered
    matrix (as a new ExpressionMatrix) and the selected genes (as a
    KBaseCollections.FeatureSet) back to the workspace.

    :param ctx: service context dict; ctx['token'] is the auth token
    :param args: requires 'workspace_name', 'object_name', 'method',
        'out_expr_object_name', 'out_fs_object_name'; one of 'p_value' or
        'num_features' must be supplied
    :returns: [result] where result names the created objects
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN filter_genes
    # Scratch directories may already exist; narrowed from a bare except.
    for scratch_dir in (self.RAWEXPR_DIR, self.FLTRD_DIR, self.FINAL_DIR):
        try:
            os.makedirs(scratch_dir)
        except OSError:
            pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': param['workspace_name'],
                            'name': param['object_name']}])[0]['data']

    cmd_dowload_cvt_tsv = [self.FVE_2_TSV,
                           '--workspace_service_url', self.__WS_URL,
                           '--workspace_name', param['workspace_name'],
                           '--object_name', param['object_name'],
                           '--working_directory', self.RAWEXPR_DIR,
                           '--output_file_name', self.EXPRESS_FN]

    # need shell in this case because the java code is depending on finding
    # the KBase token in the environment -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                    stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    self.logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns from the TSV header (first column is gene id)
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # force to use ANOVA if the number of sample is two
    if ncol == 3:
        param['method'] = 'anova'

    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [self.COEX_FILTER,
                       '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                       '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                       '-m', param['method'],
                       '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                       '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),
                       '-t', 'y']
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))
    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        self.logger.error("One of p_value or num_features must be defined")
        # graceful termination -- no error handling in narrative
        return empty_results("One of p_value or num_features must be defined",
                             expr, self.__WS_URL, param, self.logger, ws)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    ## Header correction: coex_filter mangles the header row, so restore
    ## the original TSV header captured in `fl`.
    try:
        with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'r') as ff:
            fe = ff.readlines()
        with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'w') as ff:
            ff.write(fl)  # use original first line that has correct header information
            fe.pop(0)
            ff.writelines(fe)
    except Exception:  # output file missing means the filter produced nothing
        self.logger.error("Output was not found")
        return empty_results("Increase p_value or specify num_features",
                             expr, self.__WS_URL, param, self.logger, ws)

    ## checking genelist
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    if len(gl) < 1:
        self.logger.error("No genes are selected")
        return empty_results("Increase p_value or specify num_features",
                             expr, self.__WS_URL, param, self.logger, ws)

    ## Upload FVE
    # change workspace to be the referenced object's workspace_name because
    # it may not be in the same working ws due to referencing
    # Updates: change missing genome handling strategy by copying reference
    # to working workspace
    cmd_upload_expr = [self.TSV_2_FVE,
                       '--workspace_service_url', self.__WS_URL,
                       '--object_name', param['out_expr_object_name'],
                       '--working_directory', self.FINAL_DIR,
                       '--input_directory', self.FLTRD_DIR,
                       '--output_file_name', self.FINAL_FN]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        obj_infos = ws.get_object_info_new({"objects": [{'ref': expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            self.logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))

        self.logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7], obj_infos[1]))
        if obj_infos[7] != param['workspace_name']:
            # the genome lives in another workspace: try to copy it over
            try:
                self.logger.info("trying to copy the referenced genome object : {0}".format(expr['genome_ref']))
                ws.copy_object({'from': {'ref': expr['genome_ref']},
                                'to': {'workspace': param['workspace_name'],
                                       'name': obj_infos[1]}})
                # add genome_object_name only after successful copy
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
            except Exception:
                # no permission or any issues... then, give up providing
                # genome reference
                self.logger.info("".join(traceback.format_exc()))
        else:
            # it is local... we can simply add reference without copying genome
            cmd_upload_expr.append('--genome_object_name')
            cmd_upload_expr.append(obj_infos[1])

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    self.logger.info(" ".join(cmd_upload_expr))

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    with open("{0}/{1}".format(self.FINAL_DIR, self.FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' not in expr:
        expr['description'] = "Filtered Expression Matrix"
    # Fix: the original formatted "{1}" with an unused first argument;
    # the appended text is unchanged.
    expr['description'] += " : Filtered by '{0}' method ".format(param['method'])

    if 'feature_mapping' in expr and 'feature_mapping' in eo:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({'workspace': param['workspace_name'],
                     'objects': [{'type': 'KBaseFeatureValues.ExpressionMatrix',
                                  'data': expr,
                                  'name': (param['out_expr_object_name'])}]})

    ## Upload FeatureSet
    fs = {'elements': {}}
    fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
    fs['description'] += "from {0}/{1}".format(param['workspace_name'], param['object_name'])

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({'workspace': param['workspace_name'],
                     'objects': [{'type': 'KBaseCollections.FeatureSet',
                                  'data': fs,
                                  'name': (param['out_fs_object_name'])}]})

    result = {'workspace_name': param['workspace_name'],
              'out_expr_object_name': param['out_expr_object_name'],
              'out_fs_object_name': param['out_fs_object_name']}
    #END filter_genes

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method filter_genes return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def filter_genes(self, ctx, args):
    """Filter an expression matrix and report the created objects.

    Newer variant of filter_genes: dumps the ExpressionMatrix directly
    (no external TSV converter), runs coex_filter, sub-selects the matrix
    in memory, saves the filtered ExpressionMatrix and a FeatureSet, and
    records everything in a KBaseReport.Report.

    :param ctx: service context dict; uses ctx['token'] and, when present,
        ctx['provenance']
    :param args: requires 'workspace_name' (may contain a $user_id
        template), 'object_name', 'method', 'out_expr_object_name',
        'out_fs_object_name'; one of 'p_value' or 'num_features' required
    :returns: [result] with 'report_name' and 'report_ref'
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN filter_genes
    # Scratch directories may already exist; narrowed from a bare except.
    for scratch_dir in (self.RAWEXPR_DIR, self.FLTRD_DIR, self.FINAL_DIR):
        try:
            os.makedirs(scratch_dir)
        except OSError:
            pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    param = args

    # Resolve the (possibly $user_id-templated) workspace name.
    auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
    user_id = auth_client.get_user(token)
    workspace_name_t = Template(param['workspace_name'])
    workspace_name = workspace_name_t.substitute(user_id=user_id)

    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    # add additional info to provenance here, in this case the input data
    # object reference
    provenance[0]['input_ws_objects'] = [workspace_name + '/' + param['object_name']]

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': workspace_name,
                            'name': param['object_name']}])[0]['data']

    self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)

    self.logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    ncol = len(expr['data']['col_ids'])

    # force to use ANOVA if the number of sample is two
    if ncol == 3:
        param['method'] = 'anova'

    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [self.COEX_FILTER,
                       '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                       '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                       '-m', param['method'],
                       '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                       '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),
                       '-t', 'y']
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))
    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        self.logger.error("One of p_value or num_features must be defined")
        # graceful termination -- no error handling in narrative
        return error_report("One of p_value or num_features must be defined",
                            expr, self.__WS_URL, workspace_name, provenance, ws)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    ## checking genelist
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    if len(gl) < 1:
        self.logger.error("No genes are selected")
        return error_report("Increase p_value or specify num_features",
                            expr, self.__WS_URL, workspace_name, provenance, ws)

    ## Upload FVE
    if 'description' not in expr:
        expr['description'] = "Filtered Expression Matrix"
    # Fix: the original formatted "{1}" with an unused first argument;
    # the appended text is unchanged.
    expr['description'] += " : Filtered by '{0}' method ".format(param['method'])

    expr = self._subselectExp(expr, gl)

    ex_info = ws.save_objects({'workspace': workspace_name,
                               'objects': [{'type': 'KBaseFeatureValues.ExpressionMatrix',
                                            'data': expr,
                                            'name': (param['out_expr_object_name'])}]})[0]

    ## Upload FeatureSet
    fs = {'elements': {}}
    fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
    fs['description'] += "from {0}/{1}".format(workspace_name, param['object_name'])

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    fs_info = ws.save_objects({'workspace': workspace_name,
                               'objects': [{'type': 'KBaseCollections.FeatureSet',
                                            'data': fs,
                                            'name': (param['out_fs_object_name'])}]})[0]

    ## Create report object:
    report = "Filtering expression matrix using {0} on {1}".format(param['method'],
                                                                   param['object_name'])
    reportObj = {
        'objects_created': [{
            'ref': "{0}/{1}/{2}".format(fs_info[6], fs_info[0], fs_info[4]),
            'description': 'Filtered FeatureSet'
        }, {
            # Fix: was 'Filetered ExpressionMatrix' (typo)
            'ref': "{0}/{1}/{2}".format(ex_info[6], ex_info[0], ex_info[4]),
            'description': 'Filtered ExpressionMatrix'
        }],
        'text_message': report
    }

    # generate a unique name for the Method report
    reportName = 'FilterExpression_' + str(hex(uuid.getnode()))
    report_info = ws.save_objects({
        'id': ex_info[6],
        'objects': [
            {
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,  # hide report objects from the data panel
                'provenance': provenance
            }
        ]})[0]

    result = {"report_name": reportName,
              "report_ref": "{0}/{1}/{2}".format(report_info[6],
                                                 report_info[0],
                                                 report_info[4])}
    #END filter_genes

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method filter_genes return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def view_heatmap(self, ctx, args):
    """Build a log-fold-change heatmap for a FeatureClusters object.

    Loads the FeatureClusters object and its original ExpressionMatrix,
    converts the matrix to log2 fold changes relative to a control
    condition, selects the highest-weighted clusters (up to ~min_features
    features), orders clusters and features on the unit sphere, and saves
    a MAK.FloatDataTable plus a CoExpression.FigureProperties object
    describing the heatmap.

    :param ctx: service context dict; ctx['token'] is the auth token
    :param args: requires 'workspace_name', 'object_name',
        'control_condition', 'out_figure_object_name'; optional 'quantile',
        'min_features', 'use_norm_weight', 'quantile_weight',
        'fold_change', 'fold_change_range', 'fold_cutoff'
    :returns: [fig_properties]
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN view_heatmap
    # Scratch directories may already exist; narrowed from a bare except.
    for scratch_dir in (self.RAWEXPR_DIR, self.FLTRD_DIR, self.FINAL_DIR):
        try:
            os.makedirs(scratch_dir)
        except OSError:
            pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Loading data")
    token = ctx['token']

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    fc = ws.get_objects([{'workspace': param['workspace_name'],
                          'name': param['object_name']}])[0]['data']
    if 'original_data' not in fc:
        raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix")
    oexpr = ws.get_objects([{'ref': fc['original_data']}])[0]

    df2 = pd.DataFrame(oexpr['data']['data']['values'],
                       index=oexpr['data']['data']['row_ids'],
                       columns=oexpr['data']['data']['col_ids'])

    # L2 normalization (per-row unit vectors, used for centroid geometry)
    df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

    # type - ? level, ratio, log-ratio <---> "untransformed"
    # scale - ? probably: raw, ln, log2, log10
    self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],
                                                                      oexpr['data']['scale']))
    # Convert the matrix to log2 fold changes against the control column.
    if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed':
        # need to compute fold changes
        if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
            # small pseudo-count (factor * min nonzero magnitude) avoids
            # division by zero before taking log2
            factor = 0.125
            fc_df = df2 + df2[df2 != 0].abs().min().min() * factor
            if param['control_condition'] in fc_df.columns:
                fc_df = (fc_df.div(fc_df.loc[:, fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
            else:
                fc_df = (fc_df.div(fc_df.loc[:, fc_df.columns[0]], axis=0)).apply(np.log2)
        else:
            # already log-scaled: subtract (i.e. divide) control and rescale to log2
            fc_df = df2
            if param['control_condition'] in fc_df.columns:
                fc_df = fc_df.div(fc_df.loc[:, fc_df.columns[param['control_condition']]], axis=0)
            else:
                fc_df = fc_df.div(fc_df.loc[:, fc_df.columns[0]], axis=0)
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df / np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df / np.log(2)
    elif oexpr['data']['type'] == 'ratio':
        # BUG FIX: was assigned to 'fc_cf', leaving fc_df undefined below
        fc_df = df2.apply(np.log2)
    elif oexpr['data']['type'] == 'log-ratio':
        # BUG FIX: was assigned to 'fc_cf', leaving fc_df undefined below
        fc_df = df2
        if oexpr['data']['scale'] == "log10":
            fc_df = fc_df / np.log10(2)
        elif oexpr['data']['scale'] == "ln":
            fc_df = fc_df / np.log(2)
    else:
        # do the same thing with simple level or untransformed
        if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
            factor = 0.125
            fc_df = df2 + df2[df2 != 0].abs().min().min() * factor
            if param['control_condition'] in fc_df.columns:
                fc_df = (fc_df.div(fc_df.loc[:, fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
            else:
                fc_df = (fc_df.div(fc_df.loc[:, fc_df.columns[0]], axis=0)).apply(np.log2)
        else:
            fc_df = df2
            if param['control_condition'] in fc_df.columns:
                fc_df = fc_df.div(fc_df.loc[:, fc_df.columns[param['control_condition']]], axis=0)
            else:
                fc_df = fc_df.div(fc_df.loc[:, fc_df.columns[0]], axis=0)
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df / np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df / np.log(2)

    self.logger.info("Compute cluster statistics")

    cl = {}
    afs = []
    cid = 1

    c_stat = pd.DataFrame()
    for cluster in fc['feature_clusters']:
        try:
            fs = cluster['id_to_pos'].keys()
        except KeyError:
            continue  # couldn't find feature_set
        fsn = "Cluster_{0}".format(cid)
        cid += 1
        c_stat.loc[fsn, 'size'] = len(fs)
        if 'meancor' in cluster:
            c_stat.loc[fsn, 'mcor'] = cluster['meancor']
        # else: mcor stays NaN for now
        # TODO: Add mean cor calculation later

        if 'quantile' in param:
            # quantile of per-feature standard deviations of the fold changes
            c_stat.loc[fsn, 'stdstat'] = fc_df.loc[fs, ].std(axis=1).quantile(float(param['quantile']))
        else:
            c_stat.loc[fsn, 'stdstat'] = fc_df.loc[fs, ].std(axis=1).quantile(0.75)

        if df3.loc[fs, ].shape[0] < 1:  # empty
            # NOTE(review): c_stat already holds a row for this cluster even
            # though it is skipped here, so c_stat may reference clusters
            # absent from cl -- preserved from the original; verify.
            continue
        cl[fsn] = fs

    # now we have statistics; let's subselect clusters
    min_features = 200
    if 'min_features' in param:
        min_features = param['min_features']

    c_stat.loc[:, 'nmcor'] = c_stat.loc[:, 'mcor'] / c_stat.loc[:, 'mcor'].max()
    c_stat.loc[:, 'nstdstat'] = c_stat.loc[:, 'stdstat'] / c_stat.loc[:, 'stdstat'].max()

    # cluster weight = correlation term + weighted variability term
    if 'use_norm_weight' in param and param['use_norm_weight'] != 0:
        if 'quantile_weight' in param:
            c_stat.loc[:, 'weight'] = c_stat.loc[:, 'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:, 'nstdstat']
        else:
            c_stat.loc[:, 'weight'] = c_stat.loc[:, 'nmcor'] + 1.0 * c_stat.loc[:, 'nstdstat']
    else:
        if 'quantile_weight' in param:
            c_stat.loc[:, 'weight'] = c_stat.loc[:, 'mcor'] + float(param['quantile_weight']) * c_stat.loc[:, 'stdstat']
        else:
            c_stat.loc[:, 'weight'] = c_stat.loc[:, 'mcor'] + 0.1 * c_stat.loc[:, 'stdstat']

    c_stat.sort_values('weight', inplace=True, ascending=False)
    pprint(c_stat)

    # accumulate the top clusters (by weight) until min_features is reached
    centroids = pd.DataFrame()
    for i in range(c_stat.shape[0]):
        fsn = c_stat.index[i]
        fs = cl[fsn]
        if i != 0 and len(afs) + len(fs) > min_features:
            break

        afs.extend(fs)

        c1 = df3.loc[fs, ].sum(axis=0)
        c1 = c1 / np.sqrt(c1.pow(2).sum())  # unit-length cluster centroid
        if centroids.shape[0] < 1:
            centroids = c1.to_frame(fsn).T
        else:
            centroids.loc[fsn] = c1
    pprint(centroids)

    if len(cl.keys()) == 0:
        raise Exception("No feature ids were mapped to dataset or no clusters were selected")

    # dataset centroid
    dc = df3.loc[afs, ].sum(axis=0)
    dc = dc / np.sqrt(dc.pow(2).sum())

    self.logger.info("Ordering Centroids and Data")
    # the most far away cluster centroid from dataset centroid
    fc = (centroids * dc).sum(axis=1).idxmin()
    # the most far away centroid centroid from fc
    ffc = (centroids * centroids.loc[fc, ]).sum(axis=1).idxmin()

    # major direction to order on unit ball space
    md = centroids.loc[ffc, ] - centroids.loc[fc, ]

    # unnormalized component of projection to the major direction
    # (ignored md quantities because it is the same to all)
    corder = (centroids * md).sum(axis=1).sort_values()  # cluster order
    coidx = corder.index
    dorder = (df3.loc[afs, ] * md).sum(axis=1).sort_values()  # data order

    # get first fs table
    fig_properties = {"xlabel": "Conditions", "ylabel": "Features",
                      "xlog_mode": "none", "ylog_mode": "none",
                      "title": "Log Fold Changes", "plot_type": "heatmap",
                      'ygroup': []}
    fig_properties['ygtick_labels'] = coidx.tolist()

    if 'fold_change' in param and param['fold_change'] == 1:
        frange = 2
        if 'fold_change_range' in param:
            frange = float(param['fold_change_range'])
        final = fc_df.loc[dorder.loc[cl[coidx[0]], ].index, ]

        fig_properties['ygroup'].append(final.shape[0])

        for i in range(1, len(coidx)):
            tf = fc_df.loc[dorder.loc[cl[coidx[i]], ].index, ]
            fig_properties['ygroup'].append(tf.shape[0])
            final = final.append(tf)

        if 'fold_cutoff' in param and param['fold_cutoff'] == 1:
            # hard clamp to [-frange, frange]
            final[final > frange] = frange
            final[final < -frange] = -frange
        else:
            # rescale each row to [-frange, frange]
            fc_df0b = final.sub(final.min(axis=1), axis=0)
            final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
    else:
        final = df2.loc[dorder.loc[cl[coidx[0]], ].index, ]

        fig_properties['ygroup'].append(final.shape[0])

        for i in range(1, len(coidx)):
            tf = df2.loc[dorder.loc[cl[coidx[i]], ].index, ]
            fig_properties['ygroup'].append(tf.shape[0])
            final = final.append(tf)

    ## build float data table for the heatmap
    fdt = {'row_labels': [], 'column_labels': [], "data": [[]]}
    fdt['data'] = final.T.as_matrix().tolist()  # make sure Transpose
    fdt['row_labels'] = final.columns.tolist()
    fdt['column_labels'] = final.index.tolist()
    # TODO: Add group label later
    fdt['id'] = "{0}.fdt".format(param['out_figure_object_name'])

    self.logger.info("Saving the results")
    sstatus = ws.save_objects({'workspace': param['workspace_name'],
                               'objects': [{'type': 'MAK.FloatDataTable',
                                            'data': fdt,
                                            'name': "{0}.fdt".format(param['out_figure_object_name'])}]})
    data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
    fig_properties['data_ref'] = data_ref
    sstatus = ws.save_objects({'workspace': param['workspace_name'],
                               'objects': [{'type': 'CoExpression.FigureProperties',
                                            'data': fig_properties,
                                            'name': (param['out_figure_object_name'])}]})
    result = fig_properties
    #END view_heatmap

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method view_heatmap return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def const_coex_net_clust(self, ctx, args):
    """Construct a co-expression network and cluster it with WGCNA.

    Dumps the named ExpressionMatrix to a TSV file, runs the external
    coex_cluster tool, parses the cluster membership and per-cluster
    statistics it writes, saves a KBaseFeatureValues.FeatureClusters
    object, and records a KBaseReport.Report.

    :param ctx: service context dict; uses ctx['token'] and, when present,
        ctx['provenance']
    :param args: requires 'workspace_name' (may contain a $user_id
        template), 'object_name', 'out_object_name'; optional WGCNA knobs:
        'net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method',
        'minModuleSize', 'detectCutHeight'
    :returns: [result] with 'report_name' and 'report_ref'
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN const_coex_net_clust
    # Scratch directories may already exist; narrowed from a bare except.
    for scratch_dir in (self.RAWEXPR_DIR, self.CLSTR_DIR, self.FINAL_DIR):
        try:
            os.makedirs(scratch_dir)
        except OSError:
            pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    param = args

    # Resolve the (possibly $user_id-templated) workspace name.
    auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
    user_id = auth_client.get_user(token)
    workspace_name_t = Template(param['workspace_name'])
    workspace_name = workspace_name_t.substitute(user_id=user_id)

    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']
    provenance[0]['input_ws_objects'] = [workspace_name + '/' + param['object_name']]

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': workspace_name,
                            'name': param['object_name']}])[0]['data']

    self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)

    self.logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    ncol = len(expr['data']['col_ids'])

    # grouping information: one tab-separated group index per sample
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                        '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                        '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),
                        '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN)]

    # optional WGCNA tuning knobs passed straight through to the tool
    for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower',
              'clust_method', 'minModuleSize', 'detectCutHeight']:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        # R prints a harmless warning summary on stderr; anything else is fatal
        if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
            self.logger.info(stderr)
        else:
            self.logger.error(stderr)
            raise Exception(stderr)

    # build index for gene list
    pos_index = {expr['data']['row_ids'][i]: i
                 for i in range(0, len(expr['data']['row_ids']))}

    # parse clustering results
    cid2genelist = {}
    cid2stat = {}
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            cluster, mcor, msec = line.rstrip().replace('"', '').split("\t")
            cid2stat[cluster] = [mcor, msec]
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            gene, cluster = line.rstrip().replace('"', '').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if len(cid2genelist) < 1:
        self.logger.error("Clustering failed")
        return error_report("Error: No cluster output", expr, self.__WS_URL,
                            workspace_name, provenance, ws)

    self.logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
        feature_clusters.append(
            {"meancor": float(cid2stat[cluster][0]),
             # Fix: msec previously copied index [0] (the mean correlation);
             # cid2stat stores [mcor, msec]
             "msec": float(cid2stat[cluster][1]),
             "id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]}})

    ## Upload Clusters
    feature_clusters = {"original_data": "{0}/{1}".format(workspace_name, param['object_name']),
                        "feature_clusters": feature_clusters}

    cl_info = ws.save_objects({'workspace': workspace_name,
                               'objects': [{'type': 'KBaseFeatureValues.FeatureClusters',
                                            'data': feature_clusters,
                                            'name': (param['out_object_name'])}]})[0]

    ## Create report object:
    report = "Clustering expression matrix using WGCNA on {0}".format(param['object_name'])
    reportObj = {
        'objects_created': [
            {
                'ref': "{0}/{1}/{2}".format(cl_info[6], cl_info[0], cl_info[4]),
                'description': 'WGCNA FeatureClusters'
            }],
        'text_message': report
    }

    # generate a unique name for the Method report
    reportName = 'WGCNA_Clusters_' + str(hex(uuid.getnode()))
    report_info = ws.save_objects({
        'id': cl_info[6],
        'objects': [
            {
                'type': 'KBaseReport.Report',
                'data': reportObj,
                'name': reportName,
                'meta': {},
                'hidden': 1,  # hide report objects from the data panel
                'provenance': provenance
            }
        ]})[0]

    result = {"report_name": reportName,
              "report_ref": "{0}/{1}/{2}".format(report_info[6],
                                                 report_info[0],
                                                 report_info[4])}
    #END const_coex_net_clust

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method const_coex_net_clust return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def filter_expression (args) : ### # download ws object and convert them to csv wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN')) lseries = wsd.get_object({'id' : args.inobj_id, 'type' : 'KBaseExpression.ExpressionSeries', 'workspace' : args.ws_id})['data'] if lseries is None: raise COEXException("Object {} not found in workspace {}".format(args.inobj_id, args.ws_id)) samples, sids, genome_id = {}, [], "" # assume only one genome id for gid in sorted(lseries['genome_expression_sample_ids_map'].keys()): genome_id = gid for samid in lseries['genome_expression_sample_ids_map'][gid]: sids.append({'ref': samid}) samples = wsd.get_objects(sids) break cif = open(args.exp_fn, 'w') header = ",".join([s['data']['source_id'] for s in samples]) cif.write(header + "\n") # find common gene list gids = set(samples[0]['data']['expression_levels'].keys()) # each sample has same gids for s in samples: gids = gids.intersection(set(s['data']['expression_levels'].keys())) for gid in sorted(gids): line = gid + "," line += ",".join([str(s['data']['expression_levels'][gid]) for s in samples]) cif.write(line + "\n") cif.close() sif = open(args.rp_smp_fn, 'w') sample = ",".join(map(str, range(len(samples)))) sif.write(sample + "\n") sif.close() ### # execute filtering flt_cmd_lst = ['coex_filter', "-i", args.exp_fn] if (args.method is not None): flt_cmd_lst.append('-m') flt_cmd_lst.append(args.method) if (args.p_value is not None): flt_cmd_lst.append('-p') flt_cmd_lst.append(args.p_value) if (args.num_genes is not None): flt_cmd_lst.append('-n') flt_cmd_lst.append(args.num_genes) if (args.flt_out_fn is not None): flt_cmd_lst.append('-o') flt_cmd_lst.append(args.flt_out_fn) if (args.rp_smp_fn is not None): flt_cmd_lst.append('-s') flt_cmd_lst.append(args.rp_smp_fn) p1 = Popen(flt_cmd_lst, stdout=PIPE) out_str = p1.communicate() # print output message for error tracking if out_str[0] is not None : print out_str[0] if out_str[1] is not None : print >> sys.stderr, 
out_str[1] flt_cmd = " ".join(flt_cmd_lst) ### # put it back to workspace elm = {}; fif = open(args.flt_out_fn, 'r') fif.readline(); # skip header nsamples = len(samples) for i in range(nsamples): elm[i] = {} for line in fif : line.strip(); values = line.split(',') gene_id = values[0].replace("\"", "") for i in range(nsamples): elm[i][gene_id] = float(values[i + 1]) data_list = []; sid_list =[]; for i in range(nsamples) : samples[i]['data']['expression_levels'] = elm[i] if samples[i]['data']['title'] is None: samples[i]['data']['title'] = " Filtered by coex-filter-genes" else : samples[i]['data']['title'] += " filtered by coex-filter-genes" if samples[i]['data']['description'] is None : samples[i]['data']['description'] = "Generated by " + flt_cmd else : samples[i]['data']['description'] += " Generated by " + flt_cmd samples[i]['data']['id']+=".filtered"; samples[i]['data']['source_id']+=".filtered"; data_list.append({'type' : 'KBaseExpression.ExpressionSample', 'data' : samples[i]['data'], 'name' : samples[i]['data']['id']}) sv_rst = wsd.save_objects({'workspace' : args.ws_id, 'objects' : data_list}) for i in range(nsamples):sid_list.append(str(sv_rst[i][6]) + "/" + str(sv_rst[i][0]) + "/" + str(sv_rst[i][4])) data_list = []; # assume only one genome id lseries['genome_expression_sample_ids_map'][genome_id] = sid_list lseries['title'] += " filtered by coex_filter for " + genome_id lseries['source_id'] += ".filtered" lseries['id'] = args.outobj_id data_list.append({'type' : 'KBaseExpression.ExpressionSeries', 'data' : lseries, 'name' : lseries['id'], 'meta' : {'org.series' : args.inobj_id}}) wsd.save_objects({'workspace' : args.ws_id, 'objects' : data_list}) if(args.del_tmps is "true") : os.remove(args.exp_fn) os.remove(args.rp_smp_fn) os.remove(args.flt_out_fn)
def view_heatmap(self, ctx, args):
    """Build an ordered multi-cluster heatmap from a FeatureClusters object.

    Loads the FeatureClusters object named in ``args['object_name']`` and its
    originating ExpressionMatrix, converts the matrix to log2 fold changes,
    scores and subselects clusters, orders clusters/rows along the dominant
    direction on the unit sphere, and saves a MAK.FloatDataTable plus a
    CoExpression.FigureProperties object describing the heatmap.

    Returns ``[fig_properties]`` (a single-element list holding a dict).
    """
    # ctx is the context object
    # return variables are: result
    #BEGIN view_heatmap
    # Best-effort creation of the working directories; already-exists is fine.
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(self.FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Loading data")
    token = ctx['token']
    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token
    param = args

    # Resolve the per-user workspace name (workspace_name may contain a
    # $user_id template placeholder).
    auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
    user_id = auth_client.get_user(token)
    workspace_name_t = Template(param['workspace_name'])
    workspace_name = workspace_name_t.substitute(user_id=user_id)

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    fc = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
    if 'original_data' not in fc:
        raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix")
    oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0]

    df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids'])

    # L2 normalization
    df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

    # type - ? level, ratio, log-ratio <---> "untransformed"
    # scale - ? probably: raw, ln, log2, log10
    self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] ))

    # do default behavior
    # NOTE(review): this "default" fc_df computation is unconditionally
    # overwritten by the type/scale dispatch below (see "now fc_df will be
    # reset"); it appears to be dead work kept as a fallback.
    factor = 0.125
    fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
    # NOTE(review): fc_df.columns[param['control_condition']] indexes the
    # column Index with a label rather than a position — presumably
    # param['control_condition'] was meant to be used directly as the column
    # label (fc_df.loc[:, param['control_condition']]); verify against callers.
    if param['control_condition'] in fc_df.columns:
        fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
    else:
        fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)

    # now fc_df will be reset
    if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes
        if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
            # raw counts: shift by a fraction of the smallest nonzero value to
            # avoid division by zero, then take log2 ratios vs. the control.
            factor = 0.125
            fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
            if param['control_condition'] in fc_df.columns:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
            else:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
        else:
            # already log-scaled levels: subtract (divide) control, then
            # convert the log base to 2 where needed.
            fc_df = df2
            if param['control_condition'] in fc_df.columns:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0))
            else:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df/np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df/np.log(2)
            else:
                pass
    elif oexpr['data']['type'] == 'ratio':
        fc_df = df2.apply(np.log2)
    elif oexpr['data']['type'] == 'log-ratio':
        # convert to log2 ratios
        fc_df = df2
        if oexpr['data']['scale'] == "log10":
            fc_df = fc_df/np.log10(2)
        elif oexpr['data']['scale'] == "ln":
            fc_df = fc_df/np.log(2)
        else:
            pass
    else: # do the same thing with simple level or untransformed
        if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
            factor = 0.125
            fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
            if param['control_condition'] in fc_df.columns:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
            else:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
        else:
            fc_df = df2
            if param['control_condition'] in fc_df.columns:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0))
            else:
                fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df/np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df/np.log(2)
            else:
                pass

    self.logger.info("Compute cluster statistics")

    # Per-cluster bookkeeping: cl maps cluster name -> feature ids; c_stat
    # accumulates size / mean correlation / spread statistics per cluster.
    cl = {}
    afs = [];
    cid = 1;
    c_stat = pd.DataFrame()
    for cluster in fc['feature_clusters']:
        try:
            fs = cluster['id_to_pos'].keys()
        except:
            continue # couldn't find feature_set
        fsn = "Cluster_{0}".format(cid)
        cid +=1
        c_stat.loc[fsn,'size'] = len(fs)
        if 'meancor' in cluster:
            c_stat.loc[fsn,'mcor'] = cluster['meancor']
        else:
            pass
            # TODO: Add mean cor calculation later
            #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN
        if 'quantile' in param:
            # enforcing quantile to be in [0 .. 1] range
            qt = float(param['quantile'])
            if qt > 1.0:
                qt = 1.0
            if qt < 0.0:
                qt = 0.0
            c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(qt)
        else:
            c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75)
        c1 = df3.loc[fs,].sum(axis=0)
        if df3.loc[fs,].shape[0] < 1: # empty
            continue
        cl[fsn] = fs
        #afs.extend(fs)
        #c1 = df3.loc[fs,].sum(axis=0)
        #c1 = c1 / np.sqrt(c1.pow(2).sum())
        #if(len(cl.keys()) == 1):
        #    centroids = c1.to_frame(fsn).T
        #else:
        #    centroids.loc[fsn] = c1

    # now we have centroids and statistics
    # let's subselect clusters
    min_features = 200
    if 'min_features' in param :
        min_features = param['min_features']

    # Normalized statistics (relative to the respective maxima), used for the
    # optional normalized weighting mode.
    c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max()
    c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max()

    if 'use_norm_weight' in param and param['use_norm_weight'] != 0:
        if 'quantile_weight' in param:
            c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat']
        else:
            c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0 * c_stat.loc[:,'nstdstat']
    else:
        if 'quantile_weight' in param:
            c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat']
        else:
            c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1 * c_stat.loc[:,'stdstat']

    # Take the highest-weight clusters until roughly min_features features
    # are accumulated, building an L2-normalized centroid for each.
    c_stat.sort_values('weight', inplace=True, ascending=False)
    pprint(c_stat)
    centroids = pd.DataFrame()
    for i in range(c_stat.shape[0]):
        fsn = c_stat.index[i]
        fs = cl[fsn]
        if i != 0 and len(afs) + len(fs) > min_features :
            break;
        afs.extend(fs)
        c1 = df3.loc[fs,].sum(axis=0)
        c1 = c1 / np.sqrt(c1.pow(2).sum())
        if(centroids.shape[0] < 1):
            centroids = c1.to_frame(fsn).T
        else:
            centroids.loc[fsn] = c1
    pprint(centroids)

    if len(cl.keys()) == 0:
        raise Exception("No feature ids were mapped to dataset or no clusters were selected")

    # dataset centroid
    dc = df3.loc[afs,].sum(axis=0)
    dc = dc / np.sqrt(dc.pow(2).sum())

    self.logger.info("Ordering Centroids and Data")
    # the most far away cluster centroid from dataset centroid
    # (note: rebinds fc, the FeatureClusters dict is no longer needed here)
    fc = (centroids * dc).sum(axis=1).idxmin()
    # the most far away centroid centroid from fc
    ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin()
    # major direction to order on unit ball space
    md = centroids.loc[ffc,] - centroids.loc[fc,]
    # unnormalized component of projection to the major direction (ignored md quantities because it is the same to all)
    corder = (centroids * md).sum(axis=1).sort_values() # cluster order
    coidx = corder.index
    dorder = (df3.loc[afs,] * md).sum(axis=1).sort_values() # data order

    # get first fs table
    fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []}
    fig_properties['ygtick_labels'] = coidx.tolist()
    if 'fold_change' in param and param['fold_change'] == 1:
        # heatmap of fold changes, optionally clipped or rescaled to
        # [-frange, +frange]
        frange = 2
        if 'fold_change_range' in param:
            frange = float(param['fold_change_range'])
        final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,]
        fig_properties['ygroup'].append(final.shape[0])
        for i in range(1,len(coidx)):
            tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,]
            fig_properties['ygroup'].append(tf.shape[0])
            final = final.append(tf)
        if 'fold_cutoff' in param and param['fold_cutoff'] == 1:
            # clip
            final[final > frange] = frange
            final[final < - frange] = - frange
        else:
            # rescale each row to [-frange, +frange]
            fc_df0b = final.sub(final.min(axis=1), axis=0)
            final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
    else:
        # heatmap of the raw expression values
        final=df2.loc[dorder.loc[cl[coidx[0]],].index,]
        fig_properties['ygroup'].append(final.shape[0])
        for i in range(1,len(coidx)):
            tf = df2.loc[dorder.loc[cl[coidx[i]],].index,]
            fig_properties['ygroup'].append(tf.shape[0])
            final = final.append(tf)

    ## loading pvalue distribution FDT
    fdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]};
    #fdt = OrderedDict(fdt)
    # Nan to None
    final = final.where(pd.notnull(final),None)
    fdt['data'] = final.T.as_matrix().tolist() # make sure Transpose
    fdt['row_labels'] = final.columns.tolist()
    fdt['column_labels'] = final.index.tolist()
    # TODO: Add group label later
    fdt['id'] = "{0}.fdt".format(param['out_figure_object_name'])

    self.logger.info("Saving the results")
    # Save the data table (hidden) first so its reference can be embedded in
    # the FigureProperties object.
    sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                           'data' : fdt,
                                                                           'hidden':1,
                                                                           'name' : "{0}.fdt".format(param['out_figure_object_name'])}]})
    data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
    fig_properties['data_ref'] = data_ref
    sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                           'data' : fig_properties,
                                                                           #'hidden':1,
                                                                           'name' : "{0}".format(param['out_figure_object_name'])}]})
    #'name' : "{0}.fp".format(param['out_figure_object_name'])}]})
    #mchp = {}
    #mchp['figure_obj'] = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
    #sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.MulticlusterHeatmapPlot',
    #                                                                      'data' : mchp,
    #                                                                      'name' : (param['out_figure_object_name'])}]})
    result = fig_properties
    #END view_heatmap

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method view_heatmap return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def calculate(self, ctx, input):
    # ctx is the context object
    # return variables are: output
    #BEGIN calculate
    ''' Compute reaction probabilities from a probabilistic annotation.

        The input dictionary must contain the following keys:
        probanno: Name of ProbAnno object to input
        probanno_workspace: Workspace from which to grab the ProbAnno object
        rxnprobs: Name of RxnProbs object
        rxnprobs_workspace: Workspace to which to save the RxnProbs object

        The following keys are optional:
        verbose: Print lots of messages on the progress of the algorithm
        template_model: Name of TemplateModel object
        template_workspace: Workspace from which to grab TemplateModel object

        @param ctx Current context object
        @param input Dictionary with input parameters for function
        @return Object info for RxnProbs object
        @raise WrongVersionError when ProbAnno object version number is invalid
        @raise ValueError when template_workspace input argument is not specified
    '''

    # Sanity check on input arguments: required keys must be present, optional
    # keys are filled in with their defaults.
    input = self._checkInputArguments(ctx, input,
                                      ["probanno", "probanno_workspace", "rxnprobs", "rxnprobs_workspace"],
                                      { "verbose" : False ,
                                        "template_model" : None,
                                        "template_workspace" : None
                                      }
                                      )

    # Make sure the static database files are ready.
    self._checkDatabaseFiles(ctx)

    # Set log level to INFO when verbose parameter is enabled.
    # (NOTE(review): the comment says INFO but the code sets log.DEBUG.)
    if input['verbose']:
        ctx.set_log_level(log.DEBUG)

    # Create a workspace client.
    wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])

    # Get the ProbAnno object from the specified workspace and confirm it is
    # of the expected type before using it.
    probannoObjectId = make_object_identity(input["probanno_workspace"], input["probanno"])
    objectList = wsClient.get_objects( [ probannoObjectId ] )
    probannoObject = objectList[0]
    if probannoObject['info'][2] != ProbAnnoType:
        message = "ProbAnno object type %s is not %s for object %s" %(probannoObject['info'][2], ProbAnnoType, probannoObject['info'][1])
        ctx.log_err(message)
        raise WrongVersionError(message)
    genome = probannoObject["data"]["genome"]

    # Create a temporary directory for storing intermediate files when debug is turned on.
    if ctx.get_log_level() >= log.DEBUG2:
        workFolder = tempfile.mkdtemp("", "calculate-%s-" %(genome), self.config["work_folder_path"])
        ctx.log_debug('Intermediate files saved in '+workFolder)
    else:
        workFolder = None

    # When a template model is specified, use it to build dictionaries for roles,
    # complexes, and reactions instead of retrieving from static database files.
    complexesToRoles = None
    reactionsToComplexes = None
    if input["template_model"] is not None or input["template_workspace"] is not None:
        # Both the model name and its workspace are needed; one without the
        # other is an error.
        if not(input["template_model"] is not None and input["template_workspace"] is not None) :
            message = "Template model workspace is required if template model ID is provided"
            ctx.log_err(message)
            raise ValueError(message)

        # Create a dictionary to map a complex to a list of roles and a dictionary
        # to map a reaction to a list of complexes.  The dictionaries are specific to
        # the specified template model instead of covering everything in the central
        # data model.
        complexesToRoles = dict()
        reactionsToComplexes = dict()

        # Get the list of RoleComplexReactions for the template model from the
        # fba modeling service.  The RoleComplexReactions structure has a list
        # of ComplexReactions structures for the given role.  And each ComplexReactions
        # structure has a list of reactions for the given complex.
        fbaClient = fbaModelServices(self.config['fbamodeling_url'], token=ctx['token'])
        roleComplexReactionsList = fbaClient.role_to_reactions( { 'templateModel': input['template_model'], 'workspace': input['template_workspace'] } )

        # Build the two dictionaries from the returned list.
        for rcr in roleComplexReactionsList:
            for complex in rcr['complexes']:
                complexId = re.sub(r'cpx0*(\d+)', r'kb|cpx.\1', complex['name']) # Convert ModelSEED format to KBase format
                if complexId in complexesToRoles:
                    complexesToRoles[complexId].append(rcr['name'])
                else:
                    complexesToRoles[complexId] = [ rcr['name'] ]
                for reaction in complex['reactions']:
                    reactionId = reaction['reaction']
                    if reactionId in reactionsToComplexes:
                        reactionsToComplexes[reactionId].append(complexId)
                    else:
                        reactionsToComplexes[reactionId] = [ complexId ]

    # Calculate per-gene role probabilities.
    roleProbs = self._rolesetProbabilitiesToRoleProbabilities(ctx, input, genome, probannoObject["data"]["roleset_probabilities"], workFolder)

    # Calculate whole cell role probabilities.
    # Note - eventually workFolder will be replaced with a rolesToReactions call
    totalRoleProbs = self._totalRoleProbabilities(ctx, input, genome, roleProbs, workFolder)

    # Calculate complex probabilities.
    complexProbs = self._complexProbabilities(ctx, input, genome, totalRoleProbs, workFolder, complexesToRequiredRoles = complexesToRoles)

    # Calculate reaction probabilities.
    reactionProbs = self._reactionProbabilities(ctx, input, genome, complexProbs, workFolder, rxnsToComplexes = reactionsToComplexes)

    # If the reaction probabilities were not calculated using the data from the fba modeling service
    # via the template model, we need to convert from the KBase ID format to the ModelSEED format.
    if input["template_model"] is None:
        reactionList = list()
        for index in range(len(reactionProbs)):
            reactionList.append(reactionProbs[index][0])
        EntityAPI = CDMI_EntityAPI(self.config["cdmi_url"])
        # Retry the CDMI lookup a few times; transient HTTP errors are
        # swallowed and the call is retried.
        # NOTE(review): if all 4 attempts raise HTTPError, reactionData is
        # never bound and the loop below fails with NameError; also a
        # short-count response (len mismatch) exhausts retries and the last
        # partial reactionData is used — confirm intended behavior.
        numAttempts = 4
        while numAttempts > 0:
            try:
                numAttempts -= 1
                reactionData = EntityAPI.get_entity_Reaction( reactionList, [ "source_id" ] )
                if len(reactionList) == len(reactionData):
                    numAttempts = 0
            except HTTPError as e:
                pass
        for index in range(len(reactionProbs)):
            rxnId = reactionProbs[index][0]
            reactionProbs[index][0] = reactionData[rxnId]['source_id']

    # Create a reaction probability object
    objectData = dict()
    objectData["genome"] = probannoObject["data"]["genome"]
    objectData['genome_workspace'] = probannoObject['data']['genome_workspace']
    if input["template_model"] is None:
        objectData['template_model'] = 'None'
    else:
        objectData["template_model"] = input["template_model"]
    if input["template_workspace"] is None:
        objectData['template_workspace'] = 'None'
    else:
        objectData["template_workspace"] = input["template_workspace"]
    objectData["probanno"] = input['probanno']
    objectData['probanno_workspace'] = input['probanno_workspace']
    objectData["id"] = input["rxnprobs"]
    objectData["reaction_probabilities"] = reactionProbs

    objectMetaData = { "num_reaction_probs": len(objectData["reaction_probabilities"]) }

    # Provenance: record when, by which service/method, and from which input
    # objects this RxnProbs object was derived.
    objectProvData = dict()
    objectProvData['time'] = timestamp(0)
    objectProvData['service'] = os.environ['KB_SERVICE_NAME']
    objectProvData['service_ver'] = ServiceVersion
    objectProvData['method'] = 'calculate'
    objectProvData['method_params'] = input.items()
    objectProvData['input_ws_objects'] = [ '%s/%s/%d' %(probannoObject['info'][7], probannoObject['info'][1], probannoObject['info'][4]) ]

    objectSaveData = dict();
    objectSaveData['type'] = RxnProbsType
    objectSaveData['name'] = input["rxnprobs"]
    objectSaveData['data'] = objectData
    objectSaveData['meta'] = objectMetaData
    objectSaveData['provenance'] = [ objectProvData ]
    objectInfo = wsClient.save_objects( { 'workspace': input["rxnprobs_workspace"], 'objects': [ objectSaveData ] } )
    output = objectInfo[0]
    #END calculate

    # At some point might do deeper type checking...
    if not isinstance(output, list):
        raise ValueError('Method calculate return value ' +
                         'output is not type list as required.')
    # return the results
    return [output]
def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams):
    """
    :param heatmapParams: instance of type "heatmapParams" -> structure:
       parameter "sample1" of String, parameter "sample2" of String,
       parameter "q_value_cutoff" of Double, parameter
       "log2_fold_change_cutoff" of Double, parameter "num_genes" of Long,
       parameter "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_expression_matrix_id1" of type "ws_expression_matrix_id" (@id ws
       KBaseFeatureValues.ExpressionMatrix), parameter
       "ws_expression_matrix_id2" of type "ws_expression_matrix_id" (@id ws
       KBaseFeatureValues.ExpressionMatrix), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    :returns: instance of type "ResultsToReport" -> structure: parameter
       "report_name" of String, parameter "report_ref" of String
    """
    # ctx is the context object
    # return variables are: returnVal
    # BEGIN create_interactive_heatmap_de_genes_old
    fparams = heatmapParams
    # BUG FIX: returnVal is now initialized up front.  The original bound it
    # only on the success/error-report paths, so the "workspace returned no
    # objects" early return raised NameError instead of returning.
    returnVal = {}

    # Set up workspace client
    user_token = ctx["token"]
    workspace = fparams["workspace_name"]
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Shared configuration handed to the script_util2 helpers.
    system_params = {
        "token": user_token,
        "ws_url": self.__WS_URL,
        "logger": self.__LOGGER,
        "shock_url": self.__SHOCK_URL,
        "hs_url": self.__HS_URL,
        "scratch": self.__SCRATCH,
        "rscripts": self.__RSCRIPTS,
        "workspace": workspace,
    }

    # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{"name": fparams["ws_cuffdiff_id"],
                                    "workspace": fparams["workspace_name"]}])

    # Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return [returnVal]

    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    # cuffdiff_dir = "/kb/module/work/nnc/cuffdiff"
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)
    # if (cuffdiff_dir is False):
    #     return returnVal

    # Parameters for the filtering step and for the R heatmap script.
    fparams["cuffdiff_dir"] = cuffdiff_dir
    fparams["infile"] = join(cuffdiff_dir, "gene_exp.diff")
    fparams["outfile"] = join(system_params["scratch"], "gene_exp.diff.filter")
    fparams["pairs"] = 1
    fparams["logModetmp"] = 2

    rparams = {}
    rparams["cuffdiff_dir"] = fparams["cuffdiff_dir"]
    rparams["outpng"] = join(system_params["scratch"], "heatmap.png")
    rparams["imageheight"] = 1600
    rparams["imagewidth"] = 800
    rparams["plotscript"] = join(system_params["rscripts"], "heatmapplotinteractive.R")
    rparams["include_replicates"] = 1
    rparams["pairs"] = fparams["pairs"]
    rparams["logMode"] = fparams["logModetmp"]
    rparams["removezeroes"] = 1
    rparams["outmatrix"] = join(system_params["scratch"], "outmatrix")

    reportObj = {}
    provenance = [{}]
    if "provenance" in ctx:
        provenance = ctx["provenance"]
    # add additional info to provenance here, in this case the input data object reference
    provenance[0]["input_ws_objects"] = [workspace + "/" + fparams["ws_cuffdiff_id"]]

    report = ""
    if fparams["pairs"] != 0:
        try:
            filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params)
            self.__LOGGER.info("matrix is " + filtered_matrix)
            fparams["infile"] = join(system_params["scratch"], "gene_exp.diff.filter")
            fparams["outfile"] = join(system_params["scratch"], "gene_exp.diff.filter.genelist")
            genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams)
            rparams["genelist"] = filtered_matrix
        except Exception:
            # Filtering failed (e.g. no genes passed the cutoffs): save an
            # explanatory report and return it instead of crashing.
            # (Was a bare "except:"; now logged instead of silently swallowed.)
            self.__LOGGER.exception("Expression matrix filtering failed")
            report += "There was an error in creating expression matrix"
            report += "No differentially expressed genes were found"
            report += "Please change / double check your filtering criteria"
            reportObj = {"objects_created": [], "text_message": report}

            reportName = "create_interactive_heatmap_de_genes_old_" + str(hex(uuid.getnode()))
            report_info = ws_client.save_objects(
                {
                    "workspace": fparams["workspace_name"],
                    "objects": [
                        {
                            "type": "KBaseReport.Report",
                            "data": reportObj,
                            "name": reportName,
                            "meta": {},
                            "hidden": 1,  # important!  make sure the report is hidden
                            "provenance": provenance,
                        }
                    ],
                }
            )[0]
            print ("saved Report: " + pformat(report_info))
            returnVal = {
                "report_name": reportName,
                "report_ref": str(report_info[6]) + "/" + str(report_info[0]) + "/" + str(report_info[4]),
            }
            return [returnVal]

    try:
        # Prepare output object.
        outjson = False
        roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams)

        # Run R script to run cummerbund json and update the cummerbund output json file
        # Prepare output object.
        outputobject = dict()

        # Prepare output plot list
        cummerbundplotset = []

        # List of plots to generate
        plotlist = [
            {
                "roptstr": roptstr_basic_heatmap_rep,
                "title": "Heatmap",
                "description": "Heatmap",
                "exp": fparams["ws_expression_matrix_id"],
            }
        ]
        fparams["cummerbundplotset"] = cummerbundplotset

        # Iterate through the plotlist and generate the images and json files.
        for plot in plotlist:
            fparams["title"] = plot["title"]
            fparams["description"] = plot["description"]
            status = script_util2.rplotanduploadinteractive(system_params, fparams, rparams, plot["roptstr"])
            if status == False:
                self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"])
            else:
                self.__LOGGER.info(status)
                outjson = status
                self.__LOGGER.info("xxxxxx1")
                with open("{0}/{1}".format(self.__SCRATCH, outjson), "r") as et2:
                    eo2 = json.load(et2)
                genome_ref = s_res[0]["data"]["genome_id"]
                eo2["type"] = "log2_level"
                eo2["genome_ref"] = genome_ref
                self.__LOGGER.info("xxxxxx2")
                self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot["exp"])
                res = ws_client.save_objects(
                    {
                        "workspace": workspace,
                        "objects": [
                            {"type": "KBaseFeatureValues.ExpressionMatrix", "data": eo2, "name": plot["exp"]}
                        ],
                    }
                )
                info = res[0]
                # BUG FIX: the original called the logger object itself
                # (self.__LOGGER("done uploading exp")), which raises
                # TypeError; use .info() like every other log call here.
                self.__LOGGER.info("done uploading exp")
                report = "Successfully created expression matrix"
                reportObj = {
                    "objects_created": [
                        {
                            "ref": str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]),
                            "description": "Expression matrix",
                        }
                    ],
                    "text_message": report,
                }
    except Exception:
        # (Was a bare "except:"; now logged instead of silently swallowed.)
        self.__LOGGER.exception("Generating expression matrix failed")
        report += "There was an error in generating expression matrix"
        reportObj = {"objects_created": [], "text_message": report}

    reportName = "create_interactive_heatmap_de_genes_" + str(hex(uuid.getnode()))
    report_info = ws_client.save_objects(
        {
            "workspace": fparams["workspace_name"],
            "objects": [
                {
                    "type": "KBaseReport.Report",
                    "data": reportObj,
                    "name": reportName,
                    "meta": {},
                    "hidden": 1,  # important!  make sure the report is hidden
                    "provenance": provenance,
                }
            ],
        }
    )[0]
    print ("saved Report: " + pformat(report_info))
    returnVal = {
        "report_name": reportName,
        "report_ref": str(report_info[6]) + "/" + str(report_info[0]) + "/" + str(report_info[4]),
    }
    # END create_interactive_heatmap_de_genes_old

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError(
            "Method create_interactive_heatmap_de_genes_old return value "
            + "returnVal is not type dict as required."
        )
    # return the results
    return [returnVal]
from biokbase.workspace.client import Workspace

# Copy two objects (68/4 and 68/5) from the "next" KBase deployment into
# workspace 9145 on the default deployment.
ws_client = Workspace()
ws_next_client = Workspace(url='https://next.kbase.us/services/ws')

fetched = ws_next_client.get_objects([{'objid' : '4', 'wsid' : '68'},
                                      {'objid' : '5', 'wsid' : '68'}])[0:2]

# Re-save each object under its original type with its original data.
save_specs = [{'type' : obj['info'][2], 'data': obj['data']} for obj in fetched]
ws_client.save_objects({'id': '9145', 'objects': save_specs})
def generate_cummerbund_plot2(self, ctx, cummerbundstatParams):
    """
    :param cummerbundstatParams: instance of type "cummerbundstatParams" ->
       structure: parameter "workspace" of String, parameter
       "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws
       KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter
       "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of
       type "ws_diffstat_output" (Differential stat workspace id)
    :returns: instance of type "ws_cummerbund_output" (@id ws
       KBaseRNASeq.cummerbund_output)
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plot2
    params = cummerbundstatParams
    # The method returns the name of the cummerbund output object it creates.
    returnVal = params['ws_cummerbund_output']

    # Set up workspace client
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file
    s_res = ws_client.get_objects([{
        'name': params['ws_cuffdiff_id'],
        'workspace': params['workspace']
    }])

    print "Getting genome info"
    # NOTE(review): s_res[0] is dereferenced here, but the "workspace returned
    # no objects" guard appears further down — an empty result would raise
    # IndexError before that check is ever reached.
    genome_ref = s_res[0]['data']['genome_id']
    # genome_ref = '2702/6/2'
    # genome_ref = '2702/26/1'
    # genome_ref = '2229/21/10'
    print genome_ref

    # Fetch the genome features and build an id -> function lookup used to
    # annotate the volcano-plot differential expression stats.
    gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token)
    genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}],
                                  "included_fields": ["scientific_name"],
                                  "included_feature_fields": ["id", "function", "type"
                                                              ]})["genomes"][0]["data"]

    genome_dict = {}
    features = genome['features']
    for feature in features:
        # NOTE: 'id' and 'function' shadow builtins; kept as-is for a
        # documentation-only change.
        id = feature['id']
        try:
            function = feature['function']
            if not function:
                function = 'Unknown'
        except:
            function = 'Unknown'
        genome_dict[id] = function

    # Check if workspace has data
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = script_util2.extract_cuffdiff_data(self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    if (cuffdiff_dir is False):
        return returnVal

    # Run R script to run cummerbund json and update the cummerbund output json file
    # Prepare output object.
    outputobject = dict()

    # Prepare output plot list
    cummerbundplotset = []

    # List of plots to generate
    plotlist = [
        {'file': "dispersionplot.R",
         'title': "Dispersion plot",
         'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM."},

        {'file': "fpkmscvplot.R",
         'title': "Genes CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data."},

        {'file': "isoformscvplot.R",
         'title': "Isoform CV plot",
         'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data.Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates."},

        {'file': "densityplot.R",
         'title': "Density plot",
         'description': "The density plot shows the distribution of FPKM scores across samples"},

        {'file': "csdensityrepplot.R",
         'title': "Replicates density plot",
         'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates"},

        {'file': "boxplot.R",
         'title': "Box plots",
         'description': "The box plots show the FPKM distribution across samples."},

        {'file': "boxrepplot.R",
         'title': "Box plots of replicates",
         'description': "The box plots of replicates show the FPKM distribution across sample replicates."},

        {'file': "pairwisescatterplots.R",
         'title': "Pairwise scatter plots",
         'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line."},

        {'file': "volcanomatrixplot.R",
         'title': "Volcano matrix plots",
         'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off."},

        {'file': "pcaplot.R",
         'title': "PCA plot",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions."},

        {'file': "pcarepplot.R",
         'title': "PCA plot including replicates",
         'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates."},

        {'file': "mdsplot.R",
         'title': "Multi-dimensional scaling plot",
         'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. "},

        {'file': "mdsrepplot.R",
         'title': "Multi-dimensional scaling plot including replicates",
         'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions."}
    ]

    # Iterate through the plotlist and generate the images and json files.
    for plot in plotlist:
        status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS,
                                             plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token,
                                             cummerbundplotset, plot['title'], plot['description'],
                                             cuffdiff_dir)
        if status == False:
            # Failed plots are logged and skipped; the remaining plots are
            # still generated.
            self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

    # Populate the output object
    outputobject['cummerbundplotSet'] = cummerbundplotset

    # TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

    res = ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.cummerbund_output",
            "data": outputobject,
            "name": params["ws_cummerbund_output"]}]
    })

    # Parse gene_exp.diff into volcano-plot stats (annotated with the gene
    # functions collected above) and save them as a separate object.
    infile = join(cuffdiff_dir, "gene_exp.diff")
    outfile = join(cuffdiff_dir, "gene_exp_diff.out")
    x = v.volcano_plot_data_parse_and_upload(infile, outfile, genome_dict)
    with open(outfile) as f:
        statdata = json.load(f)
    res = ws_client.save_objects({
        "workspace": params['workspace'],
        "objects": [{
            "type": "KBaseRNASeq.DifferentialExpressionStat",
            "data": statdata,
            "name": params["ws_diffstat_output"]}]
    })
    #END generate_cummerbund_plot2

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plot2 return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]