def test_loadGenome(self):
    ''' Load a test Genome object into the test workspace. '''

    # Create the test workspace.
    wsClient = Workspace(self._config["workspace_url"], token=self._token)
    try:
        # See if the workspace exists.
        wsInfo = wsClient.get_workspace_info({"workspace": self._config["test_ws"]})
    except WorkspaceServerError as e:
        # Hopefully this means the workspace does not exist.
        # (It could also mean someone messed up setting up the URLs.)
        traceback.print_exc(file=sys.stderr)
        wsInfo = wsClient.create_workspace({"workspace": self._config["test_ws"]})

    # We also need to put in a mapping and a biochemistry object somewhere.
    # To do this, I just create a "dependency workspace" and pull them from there.
    try:
        # See if the workspace exists.
        wsInfo = wsClient.get_workspace_info({"workspace": self._config["dependency_ws"]})
    except WorkspaceServerError as e:
        # Hopefully this means the workspace does not exist.
        # (It could also mean someone messed up setting up the URLs.)
        # traceback.print_exc(file=sys.stderr)
        depWsInfo = wsClient.create_workspace({"workspace": self._config["dependency_ws"]})

    # Load the ContigSet and Genome objects.
    testContigSet = json.load(open(self._config['contigset_file'], 'r'))
    contigSetSaveData = dict()
    contigSetSaveData['type'] = 'KBaseGenomes.ContigSet'
    contigSetSaveData['name'] = self._config['contigsetid']
    contigSetSaveData['data'] = testContigSet

    testGenome = json.load(open(self._config["genome_file"], "r"))
    genomeSaveData = dict()
    genomeSaveData['type'] = 'KBaseGenomes.Genome'
    genomeSaveData['name'] = self._config['genomeid']
    genomeSaveData['data'] = testGenome

    wsClient.save_objects({'workspace': self._config['test_ws'],
                           'objects': [genomeSaveData, contigSetSaveData]})
def handler(args):
    ###
    # Download the workspace object and convert it to CSV.
    wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN'))
    indata = wsd.get_object({'id': args.inobj_id,
                             #'type': 'KBaseExpression.ExpressionSeries',
                             'workspace': args.ws_id})['data']

    if indata is None:
        raise Exception("Object {} not found in workspace {}".format(
            args.inobj_id, args.ws_id))

    ###
    # Execute filtering.
    flt_cmd_lst = ['mys_example', "-i", "{}-{}".format(os.getpid(), args.exp_fn)]
    if args.method is not None:
        flt_cmd_lst.append('-m')
        flt_cmd_lst.append(args.method)
    if args.p_value is not None:
        flt_cmd_lst.append('-p')
        flt_cmd_lst.append(args.p_value)
    if args.num_genes is not None:
        flt_cmd_lst.append('-n')
        flt_cmd_lst.append(args.num_genes)
    if args.flt_out_fn is not None:
        flt_cmd_lst.append('-o')
        flt_cmd_lst.append("{}-{}".format(os.getpid(), args.flt_out_fn))

    p1 = Popen(flt_cmd_lst, stdout=PIPE)
    out_str = p1.communicate()
    # Print output messages for error tracking.
    if out_str[0] is not None:
        print out_str[0]
    if out_str[1] is not None:
        print >> sys.stderr, out_str[1]
    flt_cmd = " ".join(flt_cmd_lst)

    ###
    # Put the result back into the workspace.
    #fif = open("{}-{}".format(os.getpid(), args.flt_out_fn), 'r')
    #fif.readline()  # skip header

    # Assume only one genome id.
    outdata = {}
    outdata['key'] = indata['key']
    outdata['value'] = "{}{}".format(indata['value'], indata['value'])
    data_list = [{'type': 'MyService.PairString',
                  'data': outdata,
                  'name': args.outobj_id,
                  'meta': {'org.series': args.inobj_id}}]

    wsd.save_objects({'workspace': args.ws_id, 'objects': data_list})

    if args.del_tmps == "true":
        os.remove("{}-{}".format(os.getpid(), args.exp_fn))
        os.remove("{}-{}".format(os.getpid(), args.flt_out_fn))
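# Hypothetical invocation sketch for handler() above. The endpoint URL and all
# field values are placeholders, not values from the original script; the real
# script fills this namespace via argparse.
import argparse

example_args = argparse.Namespace(
    ws_url='https://kbase.us/services/ws',   # assumed workspace endpoint
    ws_id='my_workspace',
    inobj_id='my_input_object',
    exp_fn='expression.tsv',
    method=None,
    p_value=None,
    num_genes=None,
    flt_out_fn='filtered.tsv',
    outobj_id='my_output_object',
    del_tmps='true',
)
# handler(example_args)  # requires KB_AUTH_TOKEN in the environment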
def test_handles(self):
    wsName = self.generatePesudoRandomWorkspaceName()
    self.ws.set_permissions({'workspace': wsName,
                             'new_permission': 'w',
                             'users': [self.ctx2['user_id']]})
    temp_shock_file = "/kb/module/work/tmp/shock1.txt"
    with open(temp_shock_file, "w") as f1:
        f1.write("Test Shock Handle")
    token1 = self.ctx['token']
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token1)
    handle1 = dfu.file_to_shock({'file_path': temp_shock_file,
                                 'make_handle': 1})['handle']
    hid1 = handle1['hid']
    genome_name = "Genome.1"
    ws2 = Workspace(self.cfg['workspace-url'], token=token1)
    ws2.save_objects({'workspace': wsName,
                      'objects': [{'name': genome_name,
                                   'type': 'KBaseGenomes.Genome',
                                   'data': {'id': "qwerty",
                                            'scientific_name': "Qwerty",
                                            'domain': "Bacteria",
                                            'genetic_code': 11,
                                            'genbank_handle_ref': hid1}}]})
    genome = self.impl.get_genome_v1(
        self.ctx2, {'genomes': [{'ref': wsName + '/' + genome_name}]}
    )[0]['genomes'][0]['data']
    self.impl.save_one_genome_v1(self.ctx2, {'workspace': wsName,
                                             'name': genome_name,
                                             'data': genome})[0]
    genome = self.impl.get_genome_v1(
        self.ctx2, {'genomes': [{'ref': wsName + '/' + genome_name}]}
    )[0]['genomes'][0]['data']
    self.assertTrue('genbank_handle_ref' in genome)
    hid2 = genome['genbank_handle_ref']
    self.assertNotEqual(hid1, hid2)
def upload_workspace_data(cs, ws_url, source_ref, target_ws, obj_name):
    ws = Workspace(ws_url, token=TOKEN)
    type_ = ws.translate_from_MD5_types([CS_MD5_TYPE])[CS_MD5_TYPE][0]
    ws.save_objects(
        {'workspace': target_ws,
         'objects': [{'name': obj_name,
                      'type': type_,
                      'data': cs,
                      'provenance': [{'script': SCRIPT_NAME,
                                      'script_ver': __VERSION__,
                                      'input_ws_objects': [source_ref]}]}]})
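# Minimal usage sketch for upload_workspace_data(). The URL, reference, and
# names below are placeholders for illustration, not values from the original
# script; TOKEN, SCRIPT_NAME, CS_MD5_TYPE and __VERSION__ must already be
# defined at module level.
#
# cs = json.load(open('contigset.json'))
# upload_workspace_data(
#     cs,
#     'https://kbase.us/services/ws',   # ws_url
#     '12345/6/7',                      # source_ref of the object cs came from
#     'my_target_workspace',            # target_ws
#     'my_contigset')                   # obj_name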
def upload_narrative(nar_file, auth_token, user_id, url=ci_ws, set_public=False):
    """
    Uploads a Narrative from a downloaded object file. This file needs to be in
    JSON format, and it expects all data and info that is usually returned by
    the Workspace.get_objects method.

    Returns a dict of four elements:
        ws: the id of the workspace that was created
        obj: the id of the narrative object
        refstr: the above two joined together into an object ref string (for convenience)
        ref: a NarrativeRef built from the same ids
    """
    # Read the file.
    f = open(nar_file, "r")
    nar = json.loads(f.read())
    f.close()

    # Do some setup.
    current_nar_metadata = ws_metadata
    current_nar_metadata["narrative_nice_name"] = nar["data"]["metadata"]["name"]
    ws_client = Workspace(url=url, token=auth_token)

    # Create the new workspace for the narrative.
    ws_info = ws_client.create_workspace({
        "workspace": "{}:{}".format(user_id, str(time.time()).replace(".", "")),
        "meta": current_nar_metadata,
        "globalread": "r" if set_public else "n",
    })
    ws_id = ws_info[0]

    # Set up and save the narrative object.
    ws_save_obj = {
        "type": "KBaseNarrative.Narrative",
        "data": nar["data"],
        "name": nar["info"][1],
        "meta": nar["info"][10],
        "provenance": [
            {
                "script": "upload_narrative_test.py",
                "description": "Temporary Narrative uploaded for automated testing",
            }
        ],
    }
    obj_info = ws_client.save_objects({"id": ws_id, "objects": [ws_save_obj]})

    # Tweak the workspace's metadata to properly present its narrative.
    ws_client.alter_workspace_metadata(
        {"wsi": {"id": ws_id}, "new": {"narrative": obj_info[0][0]}}
    )
    return {
        "ws": ws_info[0],
        "obj": obj_info[0][0],
        "refstr": "{}/{}".format(ws_info[0], obj_info[0][0]),
        "ref": NarrativeRef({"wsid": ws_info[0], "objid": obj_info[0][0]}),
    }
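# Example call, assuming a Narrative JSON dump on disk and a valid auth token.
# The file name, token variable, and user id are hypothetical.
#
# result = upload_narrative('narrative_object.json', auth_token, 'some_user',
#                           set_public=False)
# print(result['refstr'])   # e.g. "123/1"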
def create(self, ctx, params):
    # ctx is the context object
    # return variables are: info
    #BEGIN create
    print('Creating KBase Report.')

    # Check that the basic parameters are set.
    if 'report' not in params:
        raise ValueError('Field "report" must be defined to save a report')
    if 'workspace_name' not in params:
        raise ValueError('Field "workspace_name" must be defined to save a report')

    # Set up proper provenance for the report.
    provenance = [{}]
    if 'provenance' in ctx:
        provenance = ctx['provenance']

    # Generate a random report name.
    reportName = 'report_' + str(uuid.uuid4())
    if 'prefix' in params:
        reportName = params['prefix'] + reportName
    print('Report name: ' + reportName)

    # Let any workspace errors just percolate up for now.
    ws = Workspace(self.workspaceURL, token=ctx['token'])
    report_info = ws.save_objects({
        'workspace': params['workspace_name'],
        'objects': [{
            'type': 'KBaseReport.Report',
            'data': params['report'],
            'name': reportName,
            'meta': {},
            'hidden': 1,
            'provenance': provenance
        }]
    })[0]

    info = {
        'ref': str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]),
        'name': report_info[1]
    }
    #END create

    # At some point might do deeper type checking...
    if not isinstance(info, dict):
        raise ValueError('Method create return value ' +
                         'info is not type dict as required.')
    # return the results
    return [info]
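# Several snippets in this file rebuild an object reference from the info tuple
# returned by save_objects (workspace id at index 6, object id at index 0,
# version at index 4). A small helper, written here purely as an illustration,
# keeps that convention in one place:
def obj_info_to_ref(info):
    """Build a 'wsid/objid/version' reference string from an object info tuple."""
    return '{0}/{1}/{2}'.format(info[6], info[0], info[4])

# e.g. info = ws.save_objects({...})[0]; ref = obj_info_to_ref(info)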
def manyHellos_runEach(self, ctx, task):
    """
    :param task: instance of type "ManyHellos_task" -> structure:
        parameter "msg" of String, parameter "job_number" of Long,
        parameter "workspace" of String
    :returns: instance of type "ManyHellos_runEachResult" (runEach()) ->
        structure: parameter "message" of String
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN manyHellos_runEach
    print("this is manyHellos_runEach...")
    pprint(["task is ", task])

    res = "{0}: {1}".format(task['job_number'], task['msg'])
    ws_client = Workspace(url=self.config['workspace-url'], token=ctx['token'])
    res_obj = ws_client.save_objects({
        "workspace": task['workspace'],
        "objects": [{
            'type': 'KBaseReport.Report',
            "data": {'objects_created': [], 'text_message': res},
            "name": "{0}_{1}.rpt".format(task['msg'], task['job_number']),
            "meta": {}
        }]
    })
    res = json.dumps(res_obj)
    print("exiting manyHellos_runEach(), res is", res)
    returnVal = {'message': res}
    #END manyHellos_runEach

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method manyHellos_runEach return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
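# Illustrative task payload for manyHellos_runEach(). The field names come from
# the type annotation in the docstring; the values, and the 'impl'/'ctx'
# objects, are assumptions for the sketch.
#
# task = {'msg': 'Hello', 'job_number': 3, 'workspace': 'my_workspace'}
# result = impl.manyHellos_runEach(ctx, task)[0]
# print(result['message'])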
def save_ws_object(obj):
    """Save an object to the workspace

    Parameters
    ----------
    obj : dict
        Object with the fields: type, data, and name.
        The type must be the full typespec
        (e.g. 'MetaboliteAtlas.Compound-0.3')

    Returns
    -------
    id : str
        Object workspace id
    """
    from biokbase.workspace.client import Workspace
    ws = Workspace(WS_URL)
    obj.setdefault('hidden', 0)
    wks = ws.list_workspaces({'excludeGlobal': 1})
    ws_id = [wk[-1] for wk in wks if wk[0] == os.environ['KB_WORKSPACE_ID']][0]
    save_objects_params = {'id': ws_id, 'objects': [obj]}
    return ws.save_objects(save_objects_params)[0][-2]
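# Hypothetical usage of save_ws_object(), assuming KB_WORKSPACE_ID is set in
# the environment. The typespec comes from the docstring above; the name and
# data fields are illustrative only.
#
# compound = {
#     'type': 'MetaboliteAtlas.Compound-0.3',
#     'name': 'my_compound',
#     'data': {'name': 'glucose', 'formula': 'C6H12O6'},
# }
# obj_id = save_ws_object(compound)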
def SetupRNASeqAnalysis(self, ctx, params):
    # ctx is the context object
    # return variables are: returnVal
    # BEGIN SetupRNASeqAnalysis
    user_token = ctx["token"]
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    out_obj = {k: v for k, v in params.iteritems()
               if k not in ("ws_id", "genome_id", "annotation_id") and v}
    pprint(out_obj)
    if "num_samples" in out_obj:
        out_obj["num_samples"] = int(out_obj["num_samples"])
    if "num_replicates" in out_obj:
        out_obj["num_replicates"] = int(out_obj["num_replicates"])
    if "genome_id" in params and params["genome_id"] is not None:
        out_obj["genome_id"] = script_util.get_obj_info(
            self.__LOGGER, self.__WS_URL, [params["genome_id"]],
            params["ws_id"], user_token)[0]
    if "annotation_id" in params and params["annotation_id"] is not None:
        g_ref = script_util.get_obj_info(
            self.__LOGGER, self.__WS_URL, [params["annotation_id"]],
            params["ws_id"], user_token)[0]
        out_obj["annotation_id"] = g_ref
    self.__LOGGER.info("Uploading RNASeq Analysis object {0} to workspace".format(
        out_obj["experiment_id"]))
    try:
        res = ws_client.save_objects(
            {"workspace": params["ws_id"],
             "objects": [{"type": "KBaseRNASeq.RNASeqAnalysis",
                          "data": out_obj,
                          "name": out_obj["experiment_id"]}]})
        returnVal = {"workspace": params["ws_id"],
                     "output": out_obj["experiment_id"]}
    except Exception, e:
        raise KBaseRNASeqException(
            "Error Saving the object to workspace {0},{1}".format(
                out_obj["experiment_id"], e))
def diff_p_distribution(self, ctx, args):
    # ctx is the context object
    # return variables are: result
    #BEGIN diff_p_distribution
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(self.FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': param['workspace_name'],
                            'name': param['object_name']}])[0]['data']

    cmd_download_cvt_tsv = [self.FVE_2_TSV,
                            '--workspace_service_url', self.__WS_URL,
                            '--workspace_name', param['workspace_name'],
                            '--object_name', param['object_name'],
                            '--working_directory', self.RAWEXPR_DIR,
                            '--output_file_name', self.EXPRESS_FN]

    # Need shell in this case because the java code depends on finding the
    # KBase token in the environment -- copied from FVE_2_TSV.
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    self.logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # Detect the number of columns.
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    # Force ANOVA if the number of samples is two.
    if ncol == 3:
        param['method'] = 'anova'

    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [self.COEX_FILTER,
                       '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                       '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                       '-m', param['method'],
                       '-n', '10',
                       '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                       '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),
                       '-t', 'y',
                       '-j', self.PVFDT_FN]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))
    if 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    ## Load the p-value distribution float data table.
    pvfdt = OrderedDict({'row_labels': [], 'column_labels': [], "data": [[]]})
    with open(self.PVFDT_FN, 'r') as myfile:
        pvfdt = json.load(myfile)
    data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
    pvfdt['id'] = data_obj_name

    fig_properties = {"xlabel": "-log2(p-value)",
                      "ylabel": "Number of features",
                      "xlog_mode": "-log2",
                      "ylog_mode": "none",
                      "title": "Histogram of P-values",
                      "plot_type": "histogram"}
    sstatus = ws.save_objects({'workspace': param['workspace_name'],
                               'objects': [{'type': 'MAK.FloatDataTable',
                                            'data': pvfdt,
                                            'name': data_obj_name}]})
    data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
    fig_properties['data_ref'] = data_ref

    sstatus = ws.save_objects({'workspace': param['workspace_name'],
                               'objects': [{'type': 'CoExpression.FigureProperties',
                                            'data': fig_properties,
                                            'name': param['out_figure_object_name']}]})
    result = fig_properties
    #END diff_p_distribution

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method diff_p_distribution return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def const_coex_net_clust(self, ctx, args):
    # ctx is the context object
    # return variables are: result
    #BEGIN const_coex_net_clust
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(self.CLSTR_DIR)
    except:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': param['workspace_name'],
                            'name': param['object_name']}])[0]['data']

    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token
    cmd_download_cvt_tsv = [self.FVE_2_TSV,
                            '--workspace_service_url', self.__WS_URL,
                            '--workspace_name', param['workspace_name'],
                            '--object_name', param['object_name'],
                            '--working_directory', self.RAWEXPR_DIR,
                            '--output_file_name', self.EXPRESS_FN]

    # Need shell in this case because the java code depends on finding the
    # KBase token in the environment -- copied from FVE_2_TSV.
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)
        #raise Exception(stderr)

    self.logger.info("Coexpression clustering analysis")

    ## Prepare sample file
    # Detect the number of columns.
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [self.COEX_CLUSTER,
                        '-t', 'y',
                        '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                        '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),
                        '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN)]

    for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower',
              'clust_method', 'minModuleSize', 'detectCutHeight']:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    #TODO: No error handling in narrative so we do graceful termination
    #if 'p_value' in param and 'num_features' in param:
    #    self.logger.error("Both of p_value and num_features cannot be defined together")
    #    sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
            self.logger.info(stderr)
        else:
            self.logger.error(stderr)
            raise Exception(stderr)

    # Build an index for the gene list.
    pos_index = {expr['data']['row_ids'][i]: i
                 for i in range(0, len(expr['data']['row_ids']))}

    # Parse the clustering results.
    cid2genelist = {}
    cid2stat = {}
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            cluster, mcor, msec = line.rstrip().replace('"', '').split("\t")
            cid2stat[cluster] = [mcor, msec]
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            gene, cluster = line.rstrip().replace('"', '').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if len(cid2genelist) < 1:
        self.logger.error("Clustering failed")
        return empty_results("Error: No cluster output", expr,
                             self.__WS_URL, param, self.logger, ws)
        #sys.exit(4)

    self.logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
        feature_clusters.append(
            {"meancor": float(cid2stat[cluster][0]),
             "msec": float(cid2stat[cluster][1]),
             "id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]}})

    ## Upload Clusters
    feature_clusters = {"original_data": "{0}/{1}".format(param['workspace_name'],
                                                          param['object_name']),
                        "feature_clusters": feature_clusters}

    ws.save_objects({'workspace': param['workspace_name'],
                     'objects': [{'type': 'KBaseFeatureValues.FeatureClusters',
                                  'data': feature_clusters,
                                  'name': param['out_object_name']}]})
    result = {'workspace_name': param['workspace_name'],
              'out_object_name': param['out_object_name']}
    #END const_coex_net_clust

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method const_coex_net_clust return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
def run_filter_genes(workspace_service_url=None, param_file=None,
                     level=logging.INFO, logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter.

    Args:
        workspace_service_url: A url for the KBase Workspace service
        param_file: parameter file
        object_name: Name of the object in the workspace
        level: Logging level, defaults to logging.INFO.

    Returns:
        Output is written back to the workspace.

    Authors:
        Shinjae Yoo
    """
    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    cmd_download_cvt_tsv = [FVE_2_TSV,
                            '--workspace_service_url', workspace_service_url,
                            '--workspace_name', param['workspace_name'],
                            '--object_name', param['object_name'],
                            '--working_directory', RAWEXPR_DIR,
                            '--output_file_name', EXPRESS_FN]

    # Need shell in this case because the java code depends on finding the
    # KBase token in the environment -- copied from FVE_2_TSV.
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # Detect the number of columns.
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [COEX_FILTER,
                       '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN),
                       '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN),
                       '-m', param['method'],
                       '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN),
                       '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN),
                       '-t', 'y']
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))  # Popen arguments must be strings
    if 'num_features' not in param and 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))
    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        sys.exit(2)
    #if 'p_value' in param and 'num_features' in param:
    #    logger.error("Both of p_value and num_features cannot be defined together")
    #    sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(fl)  # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{'workspace': param['workspace_name'],
                            'name': param['object_name']}])[0]['data']

    # Change workspace to be the referenced object's workspace_name because it
    # may not be in the same working ws due to referencing.
    cmd_upload_expr = [TSV_2_FVE,
                       '--workspace_service_url', workspace_service_url,
                       '--object_name', param['out_expr_object_name'],
                       '--working_directory', FINAL_DIR,
                       '--input_directory', FLTRD_DIR,
                       '--output_file_name', FINAL_FN]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new({"objects": [{'ref': expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws, obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' in expr:
        expr['description'] = "{0}, coex_filter by {1}".format(
            expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({'workspace': param['workspace_name'],
                     'objects': [{'type': 'KBaseFeatureValues.ExpressionMatrix',
                                  'data': expr,
                                  'name': param['out_expr_object_name']}]})

    ## Upload FeatureSet
    fs = {'description': 'Differentially expressed genes generated by {0}'.format(
              " ".join(cmd_coex_filter)),
          'elements': {}}

    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({'workspace': param['workspace_name'],
                     'objects': [{'type': 'KBaseCollections.FeatureSet',
                                  'data': fs,
                                  'name': param['out_fs_object_name']}]})
def filter_BlastOutput(self, ctx, params):
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN filter_BlastOutput
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    blast_outputs = ws_client.get_objects([{'name': params['in_id'],
                                            'workspace': params['ws_id']}])

    fs = {'elements': {}}
    fs['description'] = "FeatureSet from BlastOutput by "
    printedEvalue = False
    printedEntries = False
    if 'evalue' in params and params['evalue'] != "":
        fs['description'] += " E-value:{0}".format(params['evalue'])
        printedEvalue = True
    if 'entries' in params and (params['entries'] != "" or params['entries'] > 0):
        if printedEvalue:
            fs['description'] += ","
        fs['description'] += " # of entries :{0}".format(params['entries'])
        printedEntries = True
    if not printedEvalue and not printedEntries:
        fs['description'] += "no filtering"

    if len(blast_outputs) != 1:
        fs['description'] = "No such blast output object was found : {0}/{1}".format(
            params['ws_id'], params['in_id'])
    else:
        fm = {}
        f2g = {}
        for boid in blast_outputs[0]['data']['BlastOutput_iterations']['Iteration']:
            for hitd in boid['Iteration_hits']['Hit']:
                print hitd['Hit_def']
                ali = hitd['Hit_def'].find('#')
                if ali < 0:
                    continue  # skip hits without a '#' separator in Hit_def
                fid = hitd['Hit_def'][0:ali]
                gri = hitd['Hit_def'].find('#', ali + 1)
                if fid not in f2g:
                    f2g[fid] = {}
                if gri >= 0 and not gri == (ali + 1):
                    grid = hitd['Hit_def'][(ali + 1):gri]
                    f2g[fid][grid] = 1
                # Keep the best (lowest) e-value seen for each feature id.
                for hspd in hitd['Hit_hsps']['Hsp']:
                    if fid in fm:
                        if float(hspd['Hsp_evalue']) < fm[fid]:
                            fm[fid] = float(hspd['Hsp_evalue'])
                    else:
                        fm[fid] = float(hspd['Hsp_evalue'])

        fms = sorted(fm.items(), key=lambda x: x[1], reverse=False)
        bol = len(fms)
        if params['entries'] != "" or int(params['entries']) > 0:
            if int(params['entries']) < bol:
                bol = int(params['entries'])
        for i in range(bol):
            if fms[i][1] > float(params['evalue']):
                break
            if fms[i][0] in f2g:
                fs['elements'][fms[i][0]] = f2g[fms[i][0]].keys()
            else:
                fs['elements'][fms[i][0]] = []

    ws_client.save_objects(
        {"workspace": params['ws_id'],
         "objects": [{"type": "KBaseCollections.FeatureSet",
                      "data": fs,
                      "name": params['out_id']}]})
    #pprint(fs)
    returnVal = {'obj_name': params['out_id'], 'ws_id': params['ws_id']}
    #END filter_BlastOutput

    # At some point might do deeper type checking...
    if not isinstance(returnVal, dict):
        raise ValueError('Method filter_BlastOutput return value ' +
                         'returnVal is not type dict as required.')
    # return the results
    return [returnVal]
def create_expression_matrix(self, ctx, expressionMatrixParams):
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN create_expression_matrix
    params = expressionMatrixParams
    returnVal = params['ws_expression_matrix_id']

    # Set up workspace client.
    user_token = ctx['token']
    workspace = params['workspace_name']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Read the input cuffdiff workspace object json file and get a filehandle
    # for the cuffdiff tar file.
    s_res = ws_client.get_objects([{'name': params['ws_cuffdiff_id'],
                                    'workspace': params['workspace_name']}])

    # Check if the workspace has data.
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff")
    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    if cuffdiff_dir is False:
        return returnVal

    # Run the R script to get fpkmgenematrix.R and prepare the output object.
    outjson = False
    #outjson = "repfpkmgenematrix.R.matrix.txt.json"

    if params['include_replicates'] == 0:
        scriptfile = "fpkmgenematrix.R"
        outjson = script_util2.generate_and_upload_expression_matrix(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
            self.__SHOCK_URL, self.__HS_URL, user_token,
            cuffdiff_dir, self.__WS_URL, workspace)
    else:
        scriptfile = "repfpkmgenematrix.R"
        outjson = script_util2.generate_and_upload_expression_matrix(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, scriptfile,
            self.__SHOCK_URL, self.__HS_URL, user_token,
            cuffdiff_dir, self.__WS_URL, workspace)

    if outjson is False:
        self.__LOGGER.info("Creation of expression matrix failed")
        return returnVal

    with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et:
        eo = json.load(et)
    eo['type'] = 'untransformed'
    genome_ref = s_res[0]['data']['genome_id']
    #eo['genome_ref'] = genome_ref
    self.__LOGGER.info(workspace + self.__SCRATCH + outjson +
                       params['ws_expression_matrix_id'])
    ws_client.save_objects({'workspace': workspace,
                            'objects': [{'type': 'KBaseFeatureValues.ExpressionMatrix',
                                         'data': eo,
                                         'name': params['ws_expression_matrix_id']}]})
    #END create_expression_matrix

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method create_expression_matrix return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
def generate_cummerbund_plots(self, ctx, cummerbundParams):
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN generate_cummerbund_plots
    params = cummerbundParams
    returnVal = params['ws_cummerbund_output']

    # Set up workspace client.
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)

    # Read the input cuffdiff workspace object json file and get a filehandle
    # for the cuffdiff tar file.
    s_res = ws_client.get_objects([{'name': params['ws_cuffdiff_id'],
                                    'workspace': params['workspace_name']}])

    # Check if the workspace has data.
    if len(s_res) == 0:
        self.__LOGGER.info("Workspace did not return any objects")
        return returnVal

    cuffdiff_dir = script_util2.extract_cuffdiff_data(
        self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token)
    self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir)

    if cuffdiff_dir is False:
        return returnVal

    # Run R scripts against the cummerbund data and update the cummerbund
    # output json file. Prepare the output object and the output plot list.
    outputobject = dict()
    cummerbundplotset = []

    # List of plots to generate.
    plotlist = [
        {'file': "dispersionplot.R",
         'title': "Dispersion plot",
         'description': "Dispersion plot is the quality measure of the data. "
                        "It estimates deviation from threshold against counts in FPKM."},

        {'file': "fpkmscvplot.R",
         'title': "Genes CV plot",
         'description': "The squared coefficient of variation plot is a normalized "
                        "measure of cross-replicate variability that can be useful "
                        "for evaluating the quality of RNA-seq data."},

        {'file': "isoformscvplot.R",
         'title': "Isoform CV plot",
         'description': "The squared coefficient of variation plot is a normalized "
                        "measure of cross-replicate variability that can be useful "
                        "for evaluating the quality of RNA-seq data. Differences in "
                        "CV2 can result in lower numbers of differentially expressed "
                        "isoforms due to a higher degree of variability between "
                        "replicate fpkm estimates."},

        {'file': "densityplot.R",
         'title': "Density plot",
         'description': "The density plot shows the distribution of FPKM scores "
                        "across samples"},

        {'file': "csdensityrepplot.R",
         'title': "Replicates density plot",
         'description': "The replicates density plot shows the distribution of "
                        "FPKM scores across sample replicates"},

        {'file': "boxplot.R",
         'title': "Box plots",
         'description': "The box plots show the FPKM distribution across samples."},

        {'file': "boxrepplot.R",
         'title': "Box plots of replicates",
         'description': "The box plots of replicates show the FPKM distribution "
                        "across sample replicates."},

        {'file': "pairwisescatterplots.R",
         'title': "Pairwise scatter plots",
         'description': "The scatterplots show differences in gene expression "
                        "between two samples. If two samples are identical, all "
                        "genes will fall on the mid-line."},

        {'file': "volcanomatrixplot.R",
         'title': "Volcano matrix plots",
         'description': "Volcano matrix plot is a scatter plot that also identifies "
                        "differentially expressed genes (by color) between samples "
                        "based on log2 fold change cut off."},

        {'file': "pcaplot.R",
         'title': "PCA plot",
         'description': "Principal Component Analysis (PCA) is an informative "
                        "approach for dimensionality reduction for exploring the "
                        "relationship between sample conditions."},

        {'file': "pcarepplot.R",
         'title': "PCA plot including replicates",
         'description': "Principal Component Analysis (PCA) is an informative "
                        "approach for dimensionality reduction for exploring the "
                        "relationship between sample conditions including replicates."},

        {'file': "mdsplot.R",
         'title': "Multi-dimensional scaling plot",
         'description': "Multi-dimensional scaling plots are similar to PCA plots "
                        "and useful for determining the major sources of variation "
                        "in the dataset. "},

        {'file': "mdsrepplot.R",
         'title': "Multi-dimensional scaling plot including replicates",
         'description': "Multi-dimensional scaling plot including replicates are "
                        "similar to PCA plots and useful for determining the major "
                        "sources of variation in the dataset with replicates. These "
                        "can be useful to determine any systematic bias that may be "
                        "present between conditions."}
    ]

    #TODO.. Giving Rplot.pdf
    #   {'file': "dendrogramplot.R",
    #    'title': "Dendrogram",
    #    'description': "Dendrogram based on the JS (Jensen-Shannon divergence) distance"},
    #
    #   {'file': "dendrogramrepplot.R",
    #    'title': "Dendrogram including replicates",
    #    'description': "Dendrogram including replicates based on the JS "
    #                   "(Jensen-Shannon divergence) distance"},

    # Iterate through the plotlist and generate the images and json files.
    for plot in plotlist:
        status = script_util2.rplotandupload(
            self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'],
            self.__SHOCK_URL, self.__HS_URL, user_token,
            cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir)
        if status == False:
            self.__LOGGER.info("Problem generating image and json file - " + plot["file"])

    # Populate the output object.
    outputobject['cummerbundplotSet'] = cummerbundplotset
    #TODO: Need to figure out how to get rnaseq experiment id
    outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id"
    outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id']

    res = ws_client.save_objects({
        "workspace": params['workspace_name'],
        "objects": [{"type": "KBaseRNASeq.cummerbund_output",
                     "data": outputobject,
                     "name": params["ws_cummerbund_output"]}]})
    #END generate_cummerbund_plots

    # At some point might do deeper type checking...
    if not isinstance(returnVal, basestring):
        raise ValueError('Method generate_cummerbund_plots return value ' +
                         'returnVal is not type basestring as required.')
    # return the results
    return [returnVal]
# (This snippet is truncated at the top: it is the tail of a
# KBaseFile.PairedEndLibrary dict built from Shock upload results,
# followed by the save call.)
            'remote_md5': reverse_shock_file['file']['checksum']['md5']
        },
        'encoding': 'UTF8',
        'type': 'fastq',
        'size': reverse_shock_file['file']['size']
    },
    'interleaved': 0,
    'sequencing_tech': 'artificial reads'
}

ws = Workspace(WORKSPACE_URL, token=token)
new_obj_info = ws.save_objects({
    'workspace': 'msneddon:1448037540898',
    'objects': [{
        'type': 'KBaseFile.PairedEndLibrary',
        'data': paired_end_library,
        'name': 'test.reads',
        'meta': {},
        'provenance': [{
            'service': 'MegaHit',
            'method': 'test_megahit'
        }]
    }]
})
pprint(new_obj_info)
def calculate(self, ctx, input):
    # ctx is the context object
    # return variables are: output
    #BEGIN calculate
    '''
    Compute reaction probabilities from a probabilistic annotation.

    The input dictionary must contain the following keys:
    probanno: Name of ProbAnno object to input
    probanno_workspace: Workspace from which to grab the ProbAnno object
    rxnprobs: Name of RxnProbs object
    rxnprobs_workspace: Workspace to which to save the RxnProbs object

    The following keys are optional:
    verbose: Print lots of messages on the progress of the algorithm
    template_model: Name of TemplateModel object
    template_workspace: Workspace from which to grab TemplateModel object

    @param ctx Current context object
    @param input Dictionary with input parameters for function
    @return Object info for RxnProbs object
    @raise WrongVersionError when ProbAnno object version number is invalid
    @raise ValueError when template_workspace input argument is not specified
    '''

    # Sanity check on input arguments.
    input = self._checkInputArguments(
        ctx, input,
        ["probanno", "probanno_workspace", "rxnprobs", "rxnprobs_workspace"],
        {"verbose": False, "template_model": None, "template_workspace": None})

    # Make sure the static database files are ready.
    self._checkDatabaseFiles(ctx)

    # Set log level to INFO when verbose parameter is enabled.
    if input['verbose']:
        ctx.set_log_level(log.DEBUG)

    # Create a workspace client.
    wsClient = Workspace(self.config["workspace_url"], token=ctx['token'])

    # Get the ProbAnno object from the specified workspace.
    probannoObjectId = make_object_identity(input["probanno_workspace"], input["probanno"])
    objectList = wsClient.get_objects([probannoObjectId])
    probannoObject = objectList[0]
    if probannoObject['info'][2] != ProbAnnoType:
        message = "ProbAnno object type %s is not %s for object %s" % (
            probannoObject['info'][2], ProbAnnoType, probannoObject['info'][1])
        ctx.log_err(message)
        raise WrongVersionError(message)
    genome = probannoObject["data"]["genome"]

    # Create a temporary directory for storing intermediate files when debug is turned on.
    if ctx.get_log_level() >= log.DEBUG2:
        workFolder = tempfile.mkdtemp("", "calculate-%s-" % genome,
                                      self.config["work_folder_path"])
        ctx.log_debug('Intermediate files saved in ' + workFolder)
    else:
        workFolder = None

    # When a template model is specified, use it to build dictionaries for roles,
    # complexes, and reactions instead of retrieving from static database files.
    complexesToRoles = None
    reactionsToComplexes = None
    if input["template_model"] is not None or input["template_workspace"] is not None:
        if not (input["template_model"] is not None and
                input["template_workspace"] is not None):
            message = "Template model workspace is required if template model ID is provided"
            ctx.log_err(message)
            raise ValueError(message)

        # Create a dictionary to map a complex to a list of roles and a dictionary
        # to map a reaction to a list of complexes. The dictionaries are specific to
        # the specified template model instead of covering everything in the central
        # data model.
        complexesToRoles = dict()
        reactionsToComplexes = dict()

        # Get the list of RoleComplexReactions for the template model from the
        # fba modeling service. The RoleComplexReactions structure has a list
        # of ComplexReactions structures for the given role. And each ComplexReactions
        # structure has a list of reactions for the given complex.
        fbaClient = fbaModelServices(self.config['fbamodeling_url'], token=ctx['token'])
        roleComplexReactionsList = fbaClient.role_to_reactions(
            {'templateModel': input['template_model'],
             'workspace': input['template_workspace']})

        # Build the two dictionaries from the returned list.
        for rcr in roleComplexReactionsList:
            for complex in rcr['complexes']:
                # Convert ModelSEED format to KBase format.
                complexId = re.sub(r'cpx0*(\d+)', r'kb|cpx.\1', complex['name'])
                if complexId in complexesToRoles:
                    complexesToRoles[complexId].append(rcr['name'])
                else:
                    complexesToRoles[complexId] = [rcr['name']]
                for reaction in complex['reactions']:
                    reactionId = reaction['reaction']
                    if reactionId in reactionsToComplexes:
                        reactionsToComplexes[reactionId].append(complexId)
                    else:
                        reactionsToComplexes[reactionId] = [complexId]

    # Calculate per-gene role probabilities.
    roleProbs = self._rolesetProbabilitiesToRoleProbabilities(
        ctx, input, genome, probannoObject["data"]["roleset_probabilities"], workFolder)

    # Calculate whole cell role probabilities.
    # Note - eventually workFolder will be replaced with a rolesToReactions call.
    totalRoleProbs = self._totalRoleProbabilities(ctx, input, genome, roleProbs, workFolder)

    # Calculate complex probabilities.
    complexProbs = self._complexProbabilities(
        ctx, input, genome, totalRoleProbs, workFolder,
        complexesToRequiredRoles=complexesToRoles)

    # Calculate reaction probabilities.
    reactionProbs = self._reactionProbabilities(
        ctx, input, genome, complexProbs, workFolder,
        rxnsToComplexes=reactionsToComplexes)

    # If the reaction probabilities were not calculated using the data from the
    # fba modeling service via the template model, we need to convert from the
    # KBase ID format to the ModelSEED format.
    if input["template_model"] is None:
        reactionList = list()
        for index in range(len(reactionProbs)):
            reactionList.append(reactionProbs[index][0])
        EntityAPI = CDMI_EntityAPI(self.config["cdmi_url"])
        numAttempts = 4
        while numAttempts > 0:
            try:
                numAttempts -= 1
                reactionData = EntityAPI.get_entity_Reaction(reactionList, ["source_id"])
                if len(reactionList) == len(reactionData):
                    numAttempts = 0
            except HTTPError as e:
                pass
        for index in range(len(reactionProbs)):
            rxnId = reactionProbs[index][0]
            reactionProbs[index][0] = reactionData[rxnId]['source_id']

    # Create a reaction probability object.
    objectData = dict()
    objectData["genome"] = probannoObject["data"]["genome"]
    objectData['genome_workspace'] = probannoObject['data']['genome_workspace']
    if input["template_model"] is None:
        objectData['template_model'] = 'None'
    else:
        objectData["template_model"] = input["template_model"]
    if input["template_workspace"] is None:
        objectData['template_workspace'] = 'None'
    else:
        objectData["template_workspace"] = input["template_workspace"]
    objectData["probanno"] = input['probanno']
    objectData['probanno_workspace'] = input['probanno_workspace']
    objectData["id"] = input["rxnprobs"]
    objectData["reaction_probabilities"] = reactionProbs

    objectMetaData = {"num_reaction_probs": len(objectData["reaction_probabilities"])}

    objectProvData = dict()
    objectProvData['time'] = timestamp(0)
    objectProvData['service'] = os.environ['KB_SERVICE_NAME']
    objectProvData['service_ver'] = ServiceVersion
    objectProvData['method'] = 'calculate'
    objectProvData['method_params'] = input.items()
    objectProvData['input_ws_objects'] = ['%s/%s/%d' % (
        probannoObject['info'][7], probannoObject['info'][1], probannoObject['info'][4])]

    objectSaveData = dict()
    objectSaveData['type'] = RxnProbsType
    objectSaveData['name'] = input["rxnprobs"]
    objectSaveData['data'] = objectData
    objectSaveData['meta'] = objectMetaData
    objectSaveData['provenance'] = [objectProvData]
    objectInfo = wsClient.save_objects({'workspace': input["rxnprobs_workspace"],
                                        'objects': [objectSaveData]})
    output = objectInfo[0]
    #END calculate

    # At some point might do deeper type checking...
    if not isinstance(output, list):
        raise ValueError('Method calculate return value ' +
                         'output is not type list as required.')
    # return the results
    return [output]
def setUp(cls):
    token = environ.get('KB_AUTH_TOKEN', None)
    if token is None:
        sys.stderr.write("Error: Unable to run tests without authentication token!\n")
        sys.exit(1)
    token_file = open('ltest/script_test/token.txt', 'w')
    token_file.write(token)

    config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
    cls.cfg = {}
    config = ConfigParser()
    config.read(config_file)
    for nameval in config.items('CoExpression'):
        cls.cfg[nameval[0]] = nameval[1]

    auth_service_url = cls.cfg.get(
        'auth-service-url',
        "https://kbase.us/services/authorization/Sessions/Login")
    ws_url = cls.cfg['ws_url']
    auth_service_url_allow_insecure = cls.cfg['auth-service-url-allow-insecure']
    auth_client = _KBaseAuth(auth_service_url)
    user_id = auth_client.get_user(token)
    ws = Workspace(url=ws_url,
                   token=token,
                   auth_svc=auth_service_url,
                   trust_all_ssl_certificates=auth_service_url_allow_insecure)

    # Update input data in reverse order of references.
    ordered_file_list = [
        INPUT_META_DATA_DIR + '/test_diff_p_distribution_input_ref2.json',
        INPUT_META_DATA_DIR + '/test_diff_p_distribution_input_ref1.json',
        INPUT_META_DATA_DIR + '/test_diff_p_distribution_input.json',
        INPUT_META_DATA_DIR + '/test_view_heatmap_input_ref1.json',
        INPUT_META_DATA_DIR + '/test_view_heatmap_input.json',
        INPUT_META_DATA_DIR + '/test_coex_clust_input.json',
        INPUT_META_DATA_DIR + '/test_filter_genes_input.json'
    ]

    for filename in ordered_file_list:
        with open(filename, 'r') as infile:
            input_meta_data = json.load(infile)

        # Create a workspace that is local to the user if it does not exist.
        workspace_name_t = Template(str(input_meta_data['params'][0]['workspace_name']))
        workspace_name = workspace_name_t.substitute(user_id=user_id)
        print('workspace_name: ' + workspace_name)
        try:
            ws_info = ws.get_workspace_info({'workspace': workspace_name})
            print("workspace already exists: " + str(ws_info))
        except:
            ws_info = ws.create_workspace(
                {'workspace': workspace_name,
                 'description': 'Workspace for ' + str(input_meta_data['method'])})
            print("Created new workspace: " + str(ws_info))

        print('reading input file: ' + filename)
        object_name = str(input_meta_data['params'][0]['object_name'])
        print('object_name: ' + object_name)
        input_data_filename = INPUT_DATA_DIR + '/' + object_name + '.json'
        print('input data filename: ' + input_data_filename)
        with open(input_data_filename, 'r') as infile:
            input_data = json.load(infile)

        # Update the workspace name in the input data.
        input_data_str = json.dumps(input_data)
        input_data_t = Template(input_data_str)
        input_data_str = input_data_t.substitute(workspace_name=workspace_name)
        input_data = json.loads(input_data_str)
        print('type: ' + input_data[0]['info'][2])

        # Upload data (no effect if data already exists in workspace).
        print('uploading input data to workspace')
        ws.save_objects({'workspace': workspace_name,
                         'objects': [{'type': input_data[0]['info'][2],
                                      'data': input_data[0]['data'],
                                      'name': object_name}]})
        print('ws objects: ' + str(ws.list_objects({'workspaces': [workspace_name]})))
parser.add_argument('-o', '--out_id', help='Output workspace object name',
                    action='store', dest='outobj_id', default=None, required=True)
parser.add_argument('-l', '--support_dir', help='Support directory',
                    action='store', dest='sdir', default='lib', required=True)
parser.add_argument('-g', '--out_file', help='Output prefix or file name',
                    action='store', dest='otmp', default='outfile', required=True)
# for meta data
parser.add_argument('-i', '--in_id', help='Input Shock node id for meta',
                    action='store', dest='inobj_id', default='NotProvided', required=True)
parser.add_argument('-e', '--ext_type', help='External object type',
                    action='store', dest='etype', default=None, required=True)
parser.add_argument('-j', '--job_id', help='UJS job id',
                    action='store', dest='jid', default='NoJodID', required=False)

usage = parser.format_usage()
parser.description = desc1 + ' ' + usage + desc2
parser.usage = argparse.SUPPRESS
args = parser.parse_args()

kb_token = os.environ.get('KB_AUTH_TOKEN')

## main loop
jif = open("{}/{}".format(args.sdir, args.otmp), 'r')
data = json.loads(jif.read())
jif.close()

wsd = Workspace(url=args.ws_url, token=kb_token)
wsd.save_objects({'workspace': args.ws_id,
                  'objects': [{'type': 'Transform.Pair',
                               'data': data,
                               'name': args.outobj_id,
                               'meta': {'source_id': args.inobj_id,
                                        'source_type': args.etype,
                                        'ujs_job_id': args.jid}}]})
exit(0)
def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams): """ :param heatmapParams: instance of type "heatmapParams" -> structure: parameter "workspace" of String, parameter "sample1" of String, parameter "sample2" of String, parameter "q_value_cutoff" of Double, parameter "log2_fold_change_cutoff" of Double, parameter "num_genes" of Long, parameter "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter "ws_expression_matrix_id" of type "ws_expression_matrix_id" (@id ws KBaseFeatureValues.ExpressionMatrix), parameter "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws KBaseRNASeq.cummerbund_output) :returns: instance of type "ResultsToReport" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN create_interactive_heatmap_de_genes_old fparams = heatmapParams #returnVal = "ttt" #Set up workspace client user_token = ctx['token'] workspace = fparams['workspace'] ws_client = Workspace(url=self.__WS_URL, token=user_token) system_params = {} system_params['token'] = user_token system_params['ws_url'] = self.__WS_URL system_params['logger'] = self.__LOGGER system_params['shock_url'] = self.__SHOCK_URL system_params['hs_url'] = self.__HS_URL system_params['scratch'] = self.__SCRATCH system_params['rscripts'] = self.__RSCRIPTS system_params['workspace'] = workspace #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{ 'name' : fparams['ws_cuffdiff_id'], 'workspace' : fparams['workspace'] }]) #Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal cuffdiff_dir = join (self.__SCRATCH , "cuffdiffData/cuffdiff") cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token) #cuffdiff_dir = "/kb/module/work/cuffdiffData/cuffdiff" self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) #if (cuffdiff_dir is False): # return returnVal fparams['cuffdiff_dir'] = cuffdiff_dir fparams['infile'] = join (cuffdiff_dir, "gene_exp.diff") fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter") fparams['pairs']=1 fparams['logModetmp'] = 2 rparams = {} rparams['cuffdiff_dir'] = fparams['cuffdiff_dir'] rparams['outpng'] = join (system_params['scratch'], "heatmap.png") rparams['imageheight'] = 1600 rparams['imagewidth'] = 800 rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R") rparams['include_replicates'] = 1 rparams['pairs'] = fparams ['pairs'] rparams['logMode'] = fparams['logModetmp'] rparams['removezeroes'] = 1 rparams['outmatrix'] = join (system_params['scratch'], "outmatrix") reportObj = {} provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] # add additional info to provenance here, in this case the input data object reference provenance[0]['input_ws_objects']=[workspace+'/'+fparams['ws_cuffdiff_id']] report = "" if (fparams['pairs'] != 0): try: filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params) self.__LOGGER.info("matrix is " + filtered_matrix) fparams['infile'] = join (system_params['scratch'], "gene_exp.diff.filter") fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist") genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams) rparams['genelist'] = filtered_matrix except: report += 
"There was an error in creating expression matrix" report += "No differentially expressed genes were found" report += "Please change / double check your filtering criteria" reportObj = { 'objects_created':[], 'text_message':report } reportName = 'create_interactive_heatmap_de_genes_old_'+str(hex(uuid.getnode())) report_info = ws_client.save_objects({ 'workspace':fparams['workspace'], 'objects':[ { 'type':'KBaseReport.Report', 'data':reportObj, 'name':reportName, 'meta':{}, 'hidden':1, # important! make sure the report is hidden 'provenance':provenance } ] })[0] print('saved Report: '+pformat(report_info)) returnVal = { "report_name" : reportName,"report_ref" : str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]) } return [returnVal] try: # Prepare output object. outjson = False; roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic (rparams) # Run R script to run cummerbund json and update the cummerbund output json file # Prepare output object. outputobject=dict() # Prepare output plot list cummerbundplotset=[] # List of plots to generate plotlist = [ { 'roptstr': roptstr_basic_heatmap_rep, 'title': "Heatmap", 'description': "Heatmap", 'exp' : fparams['ws_expression_matrix_id'] } ] fparams['cummerbundplotset'] = cummerbundplotset # Iterate through the plotlist and generate the images and json files. for plot in plotlist: fparams['title'] = plot['title'] fparams['description'] = plot['description'] status = script_util2.rplotanduploadinteractive(system_params,fparams, rparams, plot['roptstr']) if status == False: self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"]) report = "Error: Please select a different cutoff criteria. None of the genes passed fold change and q-value-cutoff. " report += "Failed to create expression matrix with differentially expressed genes(" + fparams['ws_expression_matrix_id'] + "). No genes to show on heatmap." reportObj = { 'objects_created':[], 'text_message':report } reportName = 'create_interactive_heatmap_de_genes_old_'+str(hex(uuid.getnode())) report_info = ws_client.save_objects({ 'workspace':fparams['workspace'], 'objects':[ { 'type':'KBaseReport.Report', 'data':reportObj, 'name':reportName, 'meta':{}, 'hidden':1, # important! make sure the report is hidden 'provenance':provenance } ] })[0] print('saved Report: '+pformat(report_info)) returnVal = { "report_name" : reportName,"report_ref" : str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]) } return [returnVal] else: self.__LOGGER.info(status) outjson = status self.__LOGGER.info('5') with open("{0}/{1}".format(self.__SCRATCH , outjson),'r') as et2: eo2 = json.load(et2) genome_ref = s_res[0]['data']['genome_id'] eo2['type']='log2_level' eo2['genome_ref'] = genome_ref self.__LOGGER.info('3') self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp']) try: res = ws_client.save_objects({'workspace' : workspace, 'objects' : [{ 'type' : 'KBaseFeatureValues.ExpressionMatrix', 'data' : eo2, 'name' : plot['exp'] }]}) except: self.__LOGGER ("xxxx6") except: self.__LOGGER.info('6') report = "Successfully created expression matrix" reportObj = { 'objects_created':[], 'text_message':report } self.__LOGGER.info('7') reportName = 'create_interactive_heatmap_de_genes_old_'+str(hex(uuid.getnode())) report_info = ws_client.save_objects({ 'workspace':fparams['workspace'], 'objects':[ { 'type':'KBaseReport.Report', 'data':reportObj, 'name':reportName, 'meta':{}, 'hidden':1, # important! 
make sure the report is hidden 'provenance':provenance } ] })[0] print('saved Report: '+pformat(report_info)) returnVal = { "report_name" : reportName,"report_ref" : str(report_info[6]) + '/' + str(report_info[0]) + '/' + str(report_info[4]) } #END create_interactive_heatmap_de_genes_old # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method create_interactive_heatmap_de_genes_old return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal]
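A note on the report_ref strings assembled above: save_objects returns one object_info tuple per saved object, and positions 6, 0, and 4 of that tuple are the workspace id, object id, and version. A minimal helper (a sketch; the info_to_ref name is ours, not part of any KBase API):

def info_to_ref(info):
    # object_info tuple layout: (obj_id, name, type, save_date, version,
    #                            saved_by, wsid, workspace, chsum, size, meta)
    return "{0}/{1}/{2}".format(info[6], info[0], info[4])  # wsid/objid/version

# equivalent to the inline expression above:
# returnVal = {"report_name": reportName, "report_ref": info_to_ref(report_info)}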
def generate_cummerbund_plot2(self, ctx, cummerbundstatParams): """ :param cummerbundstatParams: instance of type "cummerbundstatParams" -> structure: parameter "workspace" of String, parameter "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws KBaseRNASeq.cummerbund_output), parameter "ws_diffstat_output" of type "ws_diffstat_output" (Differential stat workspace id) :returns: instance of type "ws_cummerbund_output" (@id ws KBaseRNASeq.cummerbund_output) """ # ctx is the context object # return variables are: returnVal #BEGIN generate_cummerbund_plot2 params = cummerbundstatParams returnVal = params['ws_cummerbund_output'] #Set up workspace client user_token = ctx['token'] ws_client = Workspace(url=self.__WS_URL, token=user_token) #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{ 'name' : params['ws_cuffdiff_id'], 'workspace' : params['workspace'] }]) print "Getting genome info" genome_ref = s_res[0]['data']['genome_id'] #genome_ref = '2702/6/2' #genome_ref = '2702/26/1' #genome_ref = '2229/21/10' print genome_ref gaapi = GenomeAnnotationAPI(self.callbackURL, token=user_token) genome = gaapi.get_genome_v1({"genomes": [{"ref": genome_ref}], "included_fields": ["scientific_name"], "included_feature_fields": ["id", "function", "type" ]})["genomes"][0]["data"] genome_dict = {} features = genome['features'] for feature in features: id = feature['id'] try: function = feature['function'] if not function: function = 'Unknown' except: function = 'Unknown' genome_dict[id] = function # Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token) self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) if (cuffdiff_dir is False): return returnVal # Run R script to run cummerbund json and update the cummerbund output json file # Prepare output object. outputobject=dict() # Prepare output plot list cummerbundplotset=[] # List of plots to generate plotlist = [ { 'file': "dispersionplot.R", 'title': "Dispersion plot", 'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM." }, { 'file': "fpkmscvplot.R", 'title': "Genes CV plot", 'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data." }, { 'file': "isoformscvplot.R", 'title': "Isoform CV plot", 'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data. Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates." }, { 'file': "densityplot.R", 'title': "Density plot", 'description': "The density plot shows the distribution of FPKM scores across samples" }, { 'file': "csdensityrepplot.R", 'title': "Replicates density plot", 'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates" }, { 'file': "boxplot.R", 'title': "Box plots", 'description': "The box plots show the FPKM distribution across samples."
}, { 'file': "boxrepplot.R", 'title': "Box plots of replicates", 'description': "The box plots of replicates show the FPKM distribution across sample replicates." }, { 'file': "pairwisescatterplots.R", 'title': "Pairwise scatter plots", 'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line." }, { 'file': "volcanomatrixplot.R", 'title': "Volcano matrix plots", 'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cut off." }, { 'file': "pcaplot.R", 'title': "PCA plot", 'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions." }, { 'file': "pcarepplot.R", 'title': "PCA plot including replicates", 'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring teh relationship between sample conditions including replicates." }, { 'file': "mdsplot.R", 'title': "Multi-dimensional scaling plot", 'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. " }, { 'file': "mdsrepplot.R", 'title': "Multi-dimensional scaling plot including replicates", 'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions." } ] # Iterate through the plotlist and generate the images and json files. for plot in plotlist: status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir) if status == False: self.__LOGGER.info("Problem generating image and json file - " + plot["file"]) # Populate the output object outputobject['cummerbundplotSet'] = cummerbundplotset #TODO: Need to figure out how to get rnaseq experiment id outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id" outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id'] res = ws_client.save_objects({ "workspace":params['workspace'], "objects": [{ "type":"KBaseRNASeq.cummerbund_output", "data":outputobject, "name":params["ws_cummerbund_output"]}] }) infile = join(cuffdiff_dir, "gene_exp.diff") outfile = join(cuffdiff_dir, "gene_exp_diff.out") x=v.volcano_plot_data_parse_and_upload(infile,outfile, genome_dict) with open(outfile) as f: statdata = json.load(f) res = ws_client.save_objects({ "workspace":params['workspace'], "objects": [{ "type":"KBaseRNASeq.DifferentialExpressionStat", "data":statdata, "name":params["ws_diffstat_output"]}] }) #END generate_cummerbund_plot2 # At some point might do deeper type checking... if not isinstance(returnVal, basestring): raise ValueError('Method generate_cummerbund_plot2 return value ' + 'returnVal is not type basestring as required.') # return the results return [returnVal]
def run_coex_cluster(workspace_service_url=None, param_file=None, level=logging.INFO, logger=None): """ Narrative Job Wrapper script to execute coex_cluster2 Args: workspace_service_url: A url for the KBase Workspace service param_file: parameter file object_name: Name of the object in the workspace level: Logging level, defaults to logging.INFO. Returns: Output is written back in WS Authors: Shinjae Yoo """ try: os.makedirs(RAWEXPR_DIR) except: pass try: os.makedirs(CLSTR_DIR) except: pass try: os.makedirs(FINAL_DIR) except: pass if logger is None: logger = script_utils.stderrlogger(__file__) logger.info( "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = os.environ.get("KB_AUTH_TOKEN") with open(param_file) as paramh: param = json.load(paramh) from biokbase.workspace.client import Workspace ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN']) expr = ws.get_objects([{ 'workspace': param['workspace_name'], 'name': param['object_name'] }])[0]['data'] cmd_dowload_cvt_tsv = [ FVE_2_TSV, '--workspace_service_url', workspace_service_url, '--workspace_name', param['workspace_name'], '--object_name', param['object_name'], '--working_directory', RAWEXPR_DIR, '--output_file_name', EXPRESS_FN ] # need shell in this case because the java code is depending on finding the KBase token in the environment # -- copied from FVE_2_TSV tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) #raise Exception(stderr) logger.info("Coexpression clustering analysis") ## Prepare sample file # detect num of columns with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f: fl = f.readline() ncol = len(fl.split('\t')) with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1, ncol - 1): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_cluster cmd_coex_cluster = [ COEX_CLUSTER, '-t', 'y', '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o', "{0}/{1}".format(CLSTR_DIR, CLSTR_FN) ] for p in [ 'net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight' ]: if p in param: cmd_coex_cluster.append("--{0}".format(p)) cmd_coex_cluster.append(str(param[p])) #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination #if 'p_value' in param and 'num_features' in param: # logger.error("Both of p_value and num_features cannot be defined together"); # sys.exit(3) tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: if re.search( r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr): logger.info(stderr) else: logger.error(stderr) raise Exception(stderr) # build index for gene list pos_index = { expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids'])) } # parse clustering results cid2genelist = {} with open("{0}/{1}".format(CLSTR_DIR, CLSTR_FN), 'r') as glh: glh.readline() # skip header for line in glh: gene, cluster = line.replace('"', '').split("\t") if cluster not in cid2genelist: cid2genelist[cluster] = [] cid2genelist[cluster].append(gene) if (len(cid2genelist) < 1): logger.error("Clustering failed") return empty_results("Error: No cluster output", expr, 
workspace_service_url, param, logger, ws) #sys.exit(4) logger.info("Uploading the results onto WS") feature_clusters = [] for cluster in cid2genelist: feature_clusters.append({ "id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]} }) ## Upload Clusters feature_clusters = { "original_data": "{0}/{1}".format(param['workspace_name'], param['object_name']), "feature_clusters": feature_clusters } ws.save_objects({ 'workspace': param['workspace_name'], 'objects': [{ 'type': 'KBaseFeatureValues.FeatureClusters', 'data': feature_clusters, 'name': (param['out_object_name']) }] })
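For reference, the KBaseFeatureValues.FeatureClusters payload assembled above has this shape (toy values; id_to_pos maps each feature id to its row index in the source ExpressionMatrix):

example_feature_clusters = {
    'original_data': 'my_workspace/my_expression_matrix',  # hypothetical ws/name ref
    'feature_clusters': [
        {'id_to_pos': {'geneA': 0, 'geneB': 3}},  # one cluster
        {'id_to_pos': {'geneC': 1}},              # another cluster
    ],
}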
def mys_example(args): ### # download ws object and convert them to csv wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN')) indata = wsd.get_object({ 'id': args.inobj_id, #'type' : 'KBaseExpression.ExpressionSeries', 'workspace': args.ws_id })['data'] if indata is None: raise Exception("Object {} not found in workspace {}".format( args.inobj_id, args.ws_id)) ### # execute filtering flt_cmd_lst = [ 'mys_example', "-i", "{}-{}".format(os.getpid(), args.exp_fn) ] if (args.method is not None): flt_cmd_lst.append('-m') flt_cmd_lst.append(args.method) if (args.p_value is not None): flt_cmd_lst.append('-p') flt_cmd_lst.append(args.p_value) if (args.num_genes is not None): flt_cmd_lst.append('-n') flt_cmd_lst.append(args.num_genes) if (args.flt_out_fn is not None): flt_cmd_lst.append('-o') flt_cmd_lst.append("{}-{}".format(os.getpid(), args.flt_out_fn)) p1 = Popen(flt_cmd_lst, stdout=PIPE) out_str = p1.communicate() # print output message for error tracking if out_str[0] is not None: print out_str[0] if out_str[1] is not None: print >> sys.stderr, out_str[1] flt_cmd = " ".join(flt_cmd_lst) ### # put it back to workspace #fif = open("{}-{}".format(os.getpid(),args.flt_out_fn), 'r') #fif.readline(); # skip header # assume only one genome id outdata = {} outdata['key'] = indata['key'] outdata['value'] = "{}{}".format(indata['value'], indata['value']) data_list = [] data_list.append({ 'type': 'MyService.PairString', 'data': outdata, 'name': args.outobj_id, 'meta': { 'org.series': args.inobj_id } }) wsd.save_objects({'workspace': args.ws_id, 'objects': data_list}) if (args.del_tmps == "true"): # string equality, not 'is' (identity) os.remove("{}-{}".format(os.getpid(), args.exp_fn)) os.remove("{}-{}".format(os.getpid(), args.flt_out_fn))
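One subtlety in mys_example above: communicate() only returns data for streams that were piped, so out_str[1] is always None here because stderr was never redirected. A sketch of capturing both streams, reusing the same names as the snippet:

p1 = Popen(flt_cmd_lst, stdout=PIPE, stderr=PIPE)
out_str, err_str = p1.communicate()
if out_str:
    print out_str
if err_str:
    print >> sys.stderr, err_str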
def generate_cummerbund_plots(self, ctx, cummerbundParams): # ctx is the context object # return variables are: returnVal #BEGIN generate_cummerbund_plots params = cummerbundParams returnVal = params['ws_cummerbund_output'] #Set up workspace client user_token = ctx['token'] ws_client = Workspace(url=self.__WS_URL, token=user_token) #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{ 'name': params['ws_cuffdiff_id'], 'workspace': params['workspace_name'] }]) # Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal cuffdiff_dir = script_util2.extract_cuffdiff_data( self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token) self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) if (cuffdiff_dir is False): return returnVal # Run R script to run cummerbund json and update the cummerbund output json file # Prepare output object. outputobject = dict() # Prepare output plot list cummerbundplotset = [] # List of plots to generate plotlist = [{ 'file': "dispersionplot.R", 'title': "Dispersion plot", 'description': "Dispersion plot is the quality measure of the data. It estimates deviation from threshold against counts in FPKM." }, { 'file': "fpkmscvplot.R", 'title': "Genes CV plot", 'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data." }, { 'file': "isoformscvplot.R", 'title': "Isoform CV plot", 'description': "The squared coefficient of variation plot is a normalized measure of cross-replicate variability that can be useful for evaluating the quality of RNA-seq data. Differences in CV2 can result in lower numbers of differentially expressed isoforms due to a higher degree of variability between replicate fpkm estimates." }, { 'file': "densityplot.R", 'title': "Density plot", 'description': "The density plot shows the distribution of FPKM scores across samples" }, { 'file': "csdensityrepplot.R", 'title': "Replicates density plot", 'description': "The replicates density plot shows the distribution of FPKM scores across sample replicates" }, { 'file': "boxplot.R", 'title': "Box plots", 'description': "The box plots show the FPKM distribution across samples." }, { 'file': "boxrepplot.R", 'title': "Box plots of replicates", 'description': "The box plots of replicates show the FPKM distribution across sample replicates." }, { 'file': "pairwisescatterplots.R", 'title': "Pairwise scatter plots", 'description': "The scatterplots show differences in gene expression between two samples. If two samples are identical, all genes will fall on the mid-line." }, { 'file': "volcanomatrixplot.R", 'title': "Volcano matrix plots", 'description': "Volcano matrix plot is a scatter plot that also identifies differentially expressed genes (by color) between samples based on log2 fold change cutoff." }, { 'file': "pcaplot.R", 'title': "PCA plot", 'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring the relationship between sample conditions." }, { 'file': "pcarepplot.R", 'title': "PCA plot including replicates", 'description': "Principal Component Analysis (PCA) is an informative approach for dimensionality reduction for exploring the relationship between sample conditions including replicates."
}, { 'file': "mdsplot.R", 'title': "Multi-dimensional scaling plot", 'description': "Multi-dimensional scaling plots are similar to PCA plots and useful for determining the major sources of variation in the dataset. " }, { 'file': "mdsrepplot.R", 'title': "Multi-dimensional scaling plot including replicates", 'description': "Multi-dimensional scaling plot including replicates are similar to PCA plots and useful for determining the major sources of variation in the dataset with replicates. These can be useful to determine any systematic bias that may be present between conditions." }] #TODO.. Giving Rplot.pdf # { 'file': "dendrogramplot.R", # 'title': "Dendrogram", # 'description': "Dendrogram based on the JS (Jensen-Shannon divergence) distance" }, # # { 'file': "dendrogramrepplot.R", # 'title': "Dendrogram including replicates", # 'description': "Dendrogram including replicates based on the JS (Jensen-Shannon divergence) distance" }, # Iterate through the plotlist and generate the images and json files. for plot in plotlist: status = script_util2.rplotandupload( self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir) if status == False: self.__LOGGER.info( "Problem generating image and json file - " + plot["file"]) # Populate the output object outputobject['cummerbundplotSet'] = cummerbundplotset #TODO: Need to figure out how to get rnaseq experiment id outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id" outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id'] res = ws_client.save_objects({ "workspace": params['workspace_name'], "objects": [{ "type": "KBaseRNASeq.cummerbund_output", "data": outputobject, "name": params["ws_cummerbund_output"] }] }) #END generate_cummerbund_plots # At some point might do deeper type checking... if not isinstance(returnVal, basestring): raise ValueError('Method generate_cummerbund_plots return value ' + 'returnVal is not type basestring as required.') # return the results return [returnVal]
#!/usr/bin/env python from biokbase.workspace.client import Workspace import os, sys, json Token = os.environ['KB_AUTH_TOKEN'] Workspace_URL = 'https://appdev.kbase.us/services/ws' WSClient = Workspace(url=Workspace_URL, token=Token) print(WSClient.ver()) with open('MSD_v1.0_Biochem.json', "r") as read_file: data = json.load(read_file) results = WSClient.save_objects({ 'workspace': 'kbase', 'objects': [{ 'type': 'KBaseBiochem.Biochemistry', 'data': data, 'name': 'default' }] })
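An optional follow-up check, sketched with the same client: confirm the object landed where expected by asking for its info tuple. get_object_info_new is the same call used elsewhere in this collection.

info = WSClient.get_object_info_new({
    'objects': [{'workspace': 'kbase', 'name': 'default'}]
})[0]
print(info[2], info[6], info[0], info[4])  # type, wsid, objid, version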
def gl2networks (args) : ### # download ws object and convert them to csv wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN')) raw_data = wsd.get_object({'id' : args.inobj_id, 'workspace' : args.ws_id})['data'] gl = [ gr[2] for gr in raw_data['genes']] gl_str = "'" + "','".join(gl)+ "'" sql = "SELECT DISTINCT af1.to_link, af2.to_link, f1.source_id, f2.source_id, af1.strength, ig.from_link FROM IsGroupingOf ig, AssociationFeature af1, AssociationFeature af2, Feature f1, Feature f2 WHERE ig.to_link = af1.from_link and af1.from_link = af2.from_link and (af1.to_link IN ({}) AND af2.to_link IN ({}) ) AND af1.to_link < af2.to_link AND f1.id = af1.to_link AND f2.id = af2.to_link".format(gl_str, gl_str) nc = Node() datasets = []; try: con = mdb.connect(args.db_host, args.db_user, args.db_pass, args.db_name); cur = con.cursor() cur.execute(sql) edge = cur.fetchone() dsid = set() while( edge is not None): nc.add_edge(edge[4], edge[5], edge[0], 'GENE', edge[1], 'GENE', 0.0, edge[2], edge[3]); dsid.add(edge[5]); edge = cur.fetchone() ds_str = "'" + "','".join(dsid)+ "'" cur.execute("SELECT id, association_type, data_source, description , df.to_link, sr.from_link FROM AssociationDataset, IsDatasetFor df, IsSourceForAssociationDataset sr WHERE id = df.from_link and id = sr.to_link and id IN({})".format(ds_str)) ds = cur.fetchone() while( ds is not None): datasets.append ( { 'network_type' : ds[1], 'taxons' : [ ds[4] ], 'source_ref' : ds[5], 'name' : ds[0], 'id' : ds[0], 'description' : ds[3], 'properties' : { } }) ds = cur.fetchone() # generate Networks object net_object = { 'datasets' : datasets, 'nodes' : nc.nodes, 'edges' : nc.edges, 'user_annotations' : {"genes" :",".join(gl) }, 'name' : 'GeneList Internal Network', 'id' : args.outobj_id, 'properties' : { 'graphType' : 'edu.uci.ics.jung.graph.SparseMultigraph' } } # Store results object into workspace wsd.save_objects({'workspace' : args.ws_id, 'objects' : [{'type' : 'KBaseNetworks.Network', 'data' : net_object, 'name' : args.outobj_id, 'meta' : {'org_obj_id' : args.inobj_id, 'org_ws_id' : args.ws_id}}]}) except mdb.Error, e: print "Error %d: %s" % (e.args[0],e.args[1]) sys.exit(1)
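The IN (...) clauses in gl2networks are assembled by string concatenation, which breaks on ids containing quotes. MySQLdb can bind the values itself; a sketch of the first query rewritten with placeholders (gl is the gene list built above):

placeholders = ",".join(["%s"] * len(gl))
sql = ("SELECT DISTINCT af1.to_link, af2.to_link, f1.source_id, f2.source_id, "
       "af1.strength, ig.from_link "
       "FROM IsGroupingOf ig, AssociationFeature af1, AssociationFeature af2, "
       "Feature f1, Feature f2 "
       "WHERE ig.to_link = af1.from_link AND af1.from_link = af2.from_link "
       "AND af1.to_link IN ({0}) AND af2.to_link IN ({0}) "
       "AND af1.to_link < af2.to_link AND f1.id = af1.to_link "
       "AND f2.id = af2.to_link").format(placeholders)
cur.execute(sql, tuple(gl) + tuple(gl))  # one bound value per %s; the list appears twice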
usage = parser.format_usage() parser.description = desc1 + ' ' + usage + desc2 parser.usage = argparse.SUPPRESS args = parser.parse_args() kb_token = os.environ.get('KB_AUTH_TOKEN') ## main loop jif = open("{}/{}".format(args.sdir, args.otmp), 'r') # mode belongs to open(), not format() data = json.loads(jif.read()) jif.close() wsd = Workspace(url=args.ws_url, token=kb_token) wsd.save_objects({ 'workspace': args.ws_id, 'objects': [{ 'type': 'Transform.Pair', 'data': data, 'name': args.outobj_id, 'meta': { 'source_id': args.inobj_id, 'source_type': args.etype, 'ujs_job_id': args.jid } }] }) exit(0)
def create_interactive_heatmap_de_genes(self, ctx, interactiveHeatmapParams): # ctx is the context object # return variables are: returnVal #BEGIN create_interactive_heatmap_de_genes fparams = interactiveHeatmapParams #returnVal = "ttt" #Set up workspace client user_token = ctx['token'] workspace = fparams['workspace_name'] ws_client = Workspace(url=self.__WS_URL, token=user_token) system_params = {} system_params['token'] = user_token system_params['ws_url'] = self.__WS_URL system_params['logger'] = self.__LOGGER system_params['shock_url'] = self.__SHOCK_URL system_params['hs_url'] = self.__HS_URL system_params['scratch'] = self.__SCRATCH system_params['rscripts'] = self.__RSCRIPTS system_params['workspace'] = workspace #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{ 'name' : fparams['ws_cuffdiff_id'], 'workspace' : fparams['workspace_name'] }]) #Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal cuffdiff_dir = join (self.__SCRATCH , "cuffdiffData/cuffdiff") cuffdiff_dir = script_util2.extract_cuffdiff_data (self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token) #cuffdiff_dir = "/kb/module/work/nnc/cuffdiff" self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) #if (cuffdiff_dir is False): # return returnVal fparams['cuffdiff_dir'] = cuffdiff_dir fparams['infile'] = join (cuffdiff_dir, "gene_exp.diff") fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter") filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params) self.__LOGGER.info("matrix is " + filtered_matrix) fparams['infile'] = join (system_params['scratch'], "gene_exp.diff.filter") fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist") genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step(fparams) # Prepare output object. outjson = False; rparams = {} rparams['genelist'] = filtered_matrix rparams['cuffdiff_dir'] = fparams['cuffdiff_dir'] rparams['outpng'] = join (system_params['scratch'], "heatmap.png") rparams['imageheight'] = 1600 rparams['imagewidth'] = 800 rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R") rparams['include_replicates'] = 1 rparams['outmatrix'] = join (system_params['scratch'], "outmatrix") roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic (rparams) # Run R script to run cummerbund json and update the cummerbund output json file # Prepare output object. outputobject=dict() # Prepare output plot list cummerbundplotset=[] # List of plots to generate plotlist = [ { 'roptstr': roptstr_basic_heatmap_rep, 'title': "Heatmap", 'description': "Heatmap", 'exp' : fparams['ws_expression_matrix_id'] } ] fparams['cummerbundplotset'] = cummerbundplotset # Iterate through the plotlist and generate the images and json files. 
for plot in plotlist: fparams['title'] = plot['title'] fparams['description'] = plot['description'] status = script_util2.rplotanduploadinteractive(system_params,fparams, rparams, plot['roptstr']) if status == False: self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"]) else: self.__LOGGER.info(status) outjson = status with open("{0}/{1}".format(self.__SCRATCH , outjson),'r') as et2: eo2 = json.load(et2) genome_ref = s_res[0]['data']['genome_id'] eo2['type']='untransformed' #eo2['genome_ref'] = genome_ref self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp']) ws_client.save_objects({'workspace' : workspace, 'objects' : [{ 'type' : 'KBaseFeatureValues.ExpressionMatrix', 'data' : eo2, 'name' : plot['exp'] }]}) returnVal = fparams['ws_expression_matrix_id'] #END create_interactive_heatmap_de_genes # At some point might do deeper type checking... if not isinstance(returnVal, basestring): raise ValueError('Method create_interactive_heatmap_de_genes return value ' + 'returnVal is not type basestring as required.') # return the results return [returnVal]
from biokbase.workspace.client import Workspace ws_client = Workspace() ws_next_client = Workspace(url='https://next.kbase.us/services/ws') a, b = ws_next_client.get_objects([{'objid' : 4, 'wsid' : 68}, {'objid' : 5, 'wsid' : 68}])[0:2] a_params = {'type' : a['info'][2], 'data': a['data']} b_params = {'type' : b['info'][2], 'data': b['data']} ws_client.save_objects({'id': 9145, 'objects': [a_params, b_params]}) # 'id', 'wsid' and 'objid' are numeric ids and must be ints, not strings
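ObjectIdentity also accepts a single 'ref' string of the form 'wsid/objid' or 'wsid/objid/version', which is more compact than separate objid/wsid fields. The same fetch, sketched:

a, b = ws_next_client.get_objects([{'ref': '68/4'}, {'ref': '68/5'}])[0:2]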
kb_token = os.environ.get('KB_AUTH_TOKEN') wsd = Workspace(url=args.ws_url, token=kb_token) gids = [ re.sub(r"_ContigSet.jsonp$","", f) for f in listdir(".") if f.endswith("_ContigSet.jsonp")] ## main loop for gid in gids: # store contigset first jif = open("{}/{}_ContigSet.jsonp".format(".", gid), 'r') # mode belongs to open(), not format() data = json.loads(jif.read()) jif.close() wsd.save_objects({'workspace':args.ws_id, 'objects' : [ { 'type' : 'KBaseGenomes.ContigSet', 'data' : data, 'name' : "{}-{}_cs".format(args.outobj_id, gid), 'meta' : { 'source_id' : args.inobj_id, 'source_type' : args.etype, 'ujs_job_id' : args.jid} } ]}) jif = open("{}/{}.jsonp".format(".", gid), 'r') data = json.loads(jif.read()) jif.close() data['contigset_ref'] = "{}/{}-{}_cs".format(args.ws_id,args.outobj_id, gid) wsd.save_objects({'workspace':args.ws_id, 'objects' : [ { 'type' : 'KBaseGenomes.Genome', 'data' : data, 'name' : "{}-{}_gn".format(args.outobj_id, gid), 'meta' : { 'source_id' : args.inobj_id, 'source_type' : args.etype, 'ujs_job_id' : args.jid} } ]}) exit(0)
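The contigset_ref written above is a workspace/name style reference, which the workspace resolves when the genome is saved. An equivalent that makes the pinned version explicit in the code (a sketch; cs_obj stands for the ContigSet save dict from the loop):

cs_info = wsd.save_objects({'workspace': args.ws_id, 'objects': [cs_obj]})[0]
data['contigset_ref'] = "{0}/{1}/{2}".format(cs_info[6], cs_info[0], cs_info[4])  # wsid/objid/version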
def run_filter_genes(workspace_service_url=None, param_file=None, level=logging.INFO, logger=None): """ Narrative Job Wrapper script to execute coex_filter Args: workspace_service_url: A url for the KBase Workspace service param_file: parameter file object_name: Name of the object in the workspace level: Logging level, defaults to logging.INFO. Returns: Output is written back in WS Authors: Shinjae Yoo """ try: os.makedirs(RAWEXPR_DIR) except: pass try: os.makedirs(FLTRD_DIR) except: pass try: os.makedirs(FINAL_DIR) except: pass if logger is None: logger = script_utils.stderrlogger(__file__) logger.info( "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = os.environ.get("KB_AUTH_TOKEN") with open(param_file) as paramh: param = json.load(paramh) from biokbase.workspace.client import Workspace ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN']) expr = ws.get_objects([{ 'workspace': param['workspace_name'], 'name': param['object_name'] }])[0]['data'] cmd_dowload_cvt_tsv = [ FVE_2_TSV, '--workspace_service_url', workspace_service_url, '--workspace_name', param['workspace_name'], '--object_name', param['object_name'], '--working_directory', RAWEXPR_DIR, '--output_file_name', EXPRESS_FN ] # need shell in this case because the java code is depending on finding the KBase token in the environment # -- copied from FVE_2_TSV tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) logger.info("Identifying differentially expressed genes") ## Prepare sample file # detect num of columns with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f: fl = f.readline() ncol = len(fl.split('\t')) # force to use ANOVA if the number of sample is two if (ncol == 3): param['method'] = 'anova' with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1, ncol - 1): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_filter cmd_coex_filter = [ COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y' ] if 'num_features' in param: cmd_coex_filter.append("-n") cmd_coex_filter.append(str(param['num_features'])) if 'p_value' in param: cmd_coex_filter.append("-p") cmd_coex_filter.append(str(param['p_value'])) if 'p_value' not in param and 'num_features' not in param: logger.error("One of p_value or num_features must be defined") return empty_results("One of p_value or num_features must be defined", expr, workspace_service_url, param, logger, ws) #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination #if 'p_value' in param and 'num_features' in param: # logger.error("Both of p_value and num_features cannot be defined together"); # sys.exit(3) tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) ## Header correction try: with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff: fe = ff.readlines() with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff: ff.write( fl ) # use original first line that has correct header information 
fe.pop(0) ff.writelines(fe) except: logger.error("Output was not found") return empty_results("Increase p_value or specify num_features", expr, workspace_service_url, param, logger, ws) ## checking genelist with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh: gl = glh.readlines() gl = [x.strip('\n') for x in gl] if (len(gl) < 1): logger.error("No genes are selected") return empty_results("Increase p_value or specify num_features", expr, workspace_service_url, param, logger, ws) #sys.exit(4) ## Upload FVE # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing # Updates: change missing genome handling strategy by copying reference to working workspace cmd_upload_expr = [ TSV_2_FVE, '--workspace_service_url', workspace_service_url, '--object_name', param['out_expr_object_name'], '--working_directory', FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name', FINAL_FN ] tmp_ws = param['workspace_name'] if 'genome_ref' in expr: obj_infos = ws.get_object_info_new( {"objects": [{ 'ref': expr['genome_ref'] }]})[0] if len(obj_infos) < 1: logger.error("Couldn't find {0} from the workspace".format( expr['genome_ref'])) raise Exception("Couldn't find {0} from the workspace".format( expr['genome_ref'])) #tmp_ws = "{0}".format(obj_infos[7]) logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7], obj_infos[1])) if obj_infos[7] != param['workspace_name']: #we need to copy it from the other workspace try: logger.info( "trying to copy the referenced genome object : {0}".format( expr['genome_ref'])) ws.copy_object({ 'from': { 'ref': expr['genome_ref'] }, 'to': { 'workspace': param['workspace_name'], 'name': obj_infos[1] } }) # add genome_object_name only after successful copy cmd_upload_expr.append('--genome_object_name') cmd_upload_expr.append(obj_infos[1]) except: # no permission or any issues... then, give up providing genome reference logger.info("".join(traceback.format_exc())) pass else: # it is local... 
we can simply add reference without copying genome cmd_upload_expr.append('--genome_object_name') cmd_upload_expr.append(obj_infos[1]) # updated ws name cmd_upload_expr.append('--workspace_name') cmd_upload_expr.append(tmp_ws) logger.info(" ".join(cmd_upload_expr)) tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et: eo = json.load(et) if 'description' not in expr: expr['description'] = "Filtered Expression Matrix" expr['description'] += " : Filtered by '{1}' method ".format( expr['description'], param['method']) if 'feature_mapping' in expr and 'feature_mapping' in eo: expr['feature_mapping'] = eo['feature_mapping'] expr['data'] = eo['data'] ws.save_objects({ 'workspace': param['workspace_name'], 'objects': [{ 'type': 'KBaseFeatureValues.ExpressionMatrix', 'data': expr, 'name': (param['out_expr_object_name']) }] }) ## Upload FeatureSet fs = {'elements': {}} fs['description'] = "FeatureSet identified by filtering method '{0}' ".format( param['method']) fs['description'] += "from {0}/{1}".format(param['workspace_name'], param['object_name']) for g in gl: if 'genome_ref' in expr: fs['elements'][g] = [expr['genome_ref']] else: fs['elements'][g] = [] ws.save_objects({ 'workspace': param['workspace_name'], 'objects': [{ 'type': 'KBaseCollections.FeatureSet', 'data': fs, 'name': (param['out_fs_object_name']) }] })
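For reference, the KBaseCollections.FeatureSet saved above ends up shaped like this (toy values; each selected feature id maps to the genome refs it is tied to, or an empty list when the matrix carried no genome_ref):

example_fs = {
    'description': "FeatureSet identified by filtering method 'anova' from my_ws/my_matrix",
    'elements': {
        'geneA': ['123/4/1'],  # matrix had a genome_ref
        'geneB': [],           # no genome reference available
    },
}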
reagent['compartment_ref'] = '~/compartments/id/' + cpd[ 'compartmentId'] reagent['coefficient'] = cpd['stoich'] reagent[ 'isCofactor'] = 0 # @todo Is this set separately from value in compound? rxn['reagents'].append(reagent) del rxn['equation'] # Remove after converting to reagent format biochem['reactions'].append(rxn) # Add the compartments from the compartments file. Required fields: id, name, # and hierarchy. compartments = helper.readCompartmentsFile(args.compartmentfile, includeLinenum=False) for index in range(len(compartments)): biochem['compartments'].append(compartments[index]) # Save the Biochemistry object to the specified workspace. wsClient = Workspace(args.wsurl) objectSaveData = dict() objectSaveData['type'] = 'KBaseBiochem.Biochemistry-4.0' objectSaveData['name'] = args.id objectSaveData['data'] = biochem # objectSaveData['meta'] = objectMetaData # objectSaveData['provenance'] = [ objectProvData ] objectInfo = wsClient.save_objects({ 'workspace': args.workspace, 'objects': [objectSaveData] }) exit(0)
def view_heatmap(self, ctx, args): # ctx is the context object # return variables are: result #BEGIN view_heatmap try: os.makedirs(self.RAWEXPR_DIR) except: pass try: os.makedirs(self.FLTRD_DIR) except: pass try: os.makedirs(self.FINAL_DIR) except: pass if self.logger is None: self.logger = script_utils.stderrlogger(__file__) result = {} self.logger.info("Loading data") token = ctx['token'] eenv = os.environ.copy() eenv['KB_AUTH_TOKEN'] = token param = args auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL) user_id = auth_client.get_user(token) workspace_name_t = Template(param['workspace_name']) workspace_name = workspace_name_t.substitute(user_id=user_id) from biokbase.workspace.client import Workspace ws = Workspace(url=self.__WS_URL, token=token) fc = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data'] if 'original_data' not in fc: raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix") oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0] df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids']) # L2 normalization df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0) # type - ? level, ratio, log-ratio <---> "untransformed" # scale - ? probably: raw, ln, log2, log10 self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] )) # do default behavior factor = 0.125 fc_df = df2 + df2[df2 !=0].abs().min().min() * factor if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2) # now fc_df will be reset if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0": factor = 0.125 fc_df = df2 + df2[df2 !=0].abs().min().min() * factor if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2) else: fc_df = df2 if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)) if oexpr['data']['scale'] == "log10": fc_df = fc_df/np.log10(2) elif oexpr['data']['scale'] == "ln": fc_df = fc_df/np.log(2) else: pass elif oexpr['data']['type'] == 'ratio': fc_df = df2.apply(np.log2) elif oexpr['data']['type'] == 'log-ratio': fc_df = df2 if oexpr['data']['scale'] == "log10": fc_df = fc_df/np.log10(2) elif oexpr['data']['scale'] == "ln": fc_df = fc_df/np.log(2) else: pass else: # do the same thing with simple level or untransformed if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0": factor = 0.125 fc_df = df2 + df2[df2 !=0].abs().min().min() * factor if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2) else: fc_df = df2 if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)) else: fc_df = 
(fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)) if oexpr['data']['scale'] == "log10": fc_df = fc_df/np.log10(2) elif oexpr['data']['scale'] == "ln": fc_df = fc_df/np.log(2) else: pass self.logger.info("Compute cluster statistics") cl = {} afs = []; cid = 1; c_stat = pd.DataFrame() for cluster in fc['feature_clusters']: try: fs = cluster['id_to_pos'].keys() except: continue # couldn't find feature_set fsn = "Cluster_{0}".format(cid) cid +=1 c_stat.loc[fsn,'size'] = len(fs) if 'meancor' in cluster: c_stat.loc[fsn,'mcor'] = cluster['meancor'] else: pass # TODO: Add mean cor calculation later #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN if 'quantile' in param: # enforcing quantile to be in [0 .. 1] range qt = float(param['quantile']) if qt > 1.0: qt = 1.0 if qt < 0.0: qt = 0.0 c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(qt) else: c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75) c1 = df3.loc[fs,].sum(axis=0) if df3.loc[fs,].shape[0] < 1: # empty continue cl[fsn] = fs #afs.extend(fs) #c1 = df3.loc[fs,].sum(axis=0) #c1 = c1 / np.sqrt(c1.pow(2).sum()) #if(len(cl.keys()) == 1): # centroids = c1.to_frame(fsn).T #else: # centroids.loc[fsn] = c1 # now we have centroids and statistics # let's subselect clusters min_features = 200 if 'min_features' in param : min_features = param['min_features'] c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max() c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max() if 'use_norm_weight' in param and param['use_norm_weight'] != 0: if 'quantile_weight' in param: c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat'] else: c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0 * c_stat.loc[:,'nstdstat'] else: if 'quantile_weight' in param: c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat'] else: c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1 * c_stat.loc[:,'stdstat'] c_stat.sort_values('weight', inplace=True, ascending=False) pprint(c_stat) centroids = pd.DataFrame() for i in range(c_stat.shape[0]): fsn = c_stat.index[i] fs = cl[fsn] if i != 0 and len(afs) + len(fs) > min_features : break; afs.extend(fs) c1 = df3.loc[fs,].sum(axis=0) c1 = c1 / np.sqrt(c1.pow(2).sum()) if(centroids.shape[0] < 1): centroids = c1.to_frame(fsn).T else: centroids.loc[fsn] = c1 pprint(centroids) if len(cl.keys()) == 0: raise Exception("No feature ids were mapped to dataset or no clusters were selected") # dataset centroid dc = df3.loc[afs,].sum(axis=0) dc = dc / np.sqrt(dc.pow(2).sum()) self.logger.info("Ordering Centroids and Data") # the cluster centroid farthest from the dataset centroid fc = (centroids * dc).sum(axis=1).idxmin() # the centroid farthest from fc ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin() # major direction to order on unit ball space md = centroids.loc[ffc,] - centroids.loc[fc,] # unnormalized component of projection onto the major direction (the md magnitude is ignored because it is the same for all rows) corder = (centroids * md).sum(axis=1).sort_values() # cluster order coidx = corder.index dorder =(df3.loc[afs,] * md).sum(axis=1).sort_values() # data order # get first fs table fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []} fig_properties['ygtick_labels'] = coidx.tolist() if
'fold_change' in param and param['fold_change'] == 1: frange = 2 if 'fold_change_range' in param: frange = float(param['fold_change_range']) final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,] fig_properties['ygroup'].append(final.shape[0]) for i in range(1,len(coidx)): tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,] fig_properties['ygroup'].append(tf.shape[0]) final = final.append(tf) if 'fold_cutoff' in param and param['fold_cutoff'] == 1: final[final > frange] = frange final[final < - frange] = - frange else: fc_df0b = final.sub(final.min(axis=1), axis=0) final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange else: final=df2.loc[dorder.loc[cl[coidx[0]],].index,] fig_properties['ygroup'].append(final.shape[0]) for i in range(1,len(coidx)): tf = df2.loc[dorder.loc[cl[coidx[i]],].index,] fig_properties['ygroup'].append(tf.shape[0]) final = final.append(tf) ## loading pvalue distribution FDT fdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]}; #fdt = OrderedDict(fdt) # Nan to None final = final.where(pd.notnull(final),None) fdt['data'] = final.T.as_matrix().tolist() # make sure Transpose fdt['row_labels'] = final.columns.tolist() fdt['column_labels'] = final.index.tolist() # TODO: Add group label later fdt['id'] = "{0}.fdt".format(param['out_figure_object_name']) self.logger.info("Saving the results") sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable', 'data' : fdt, 'hidden':1, 'name' : "{0}.fdt".format(param['out_figure_object_name'])}]}) data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4]) fig_properties['data_ref'] = data_ref sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties', 'data' : fig_properties, #'hidden':1, 'name' : "{0}".format(param['out_figure_object_name'])}]}) #'name' : "{0}.fp".format(param['out_figure_object_name'])}]}) #mchp = {} #mchp['figure_obj'] = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4]) #sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.MulticlusterHeatmapPlot', # 'data' : mchp, # 'name' : (param['out_figure_object_name'])}]}) result = fig_properties #END view_heatmap # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method view_heatmap return value ' + 'result is not type dict as required.') # return the results return [result]
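The DataFrame-to-FloatDataTable conversion near the end of view_heatmap, factored into a helper for clarity (a sketch; DataFrame.values is the modern replacement for the deprecated as_matrix()):

import pandas as pd

def frame_to_fdt(df, fdt_id):
    df = df.where(pd.notnull(df), None)  # JSON has no NaN
    return {
        'id': fdt_id,
        'data': df.T.values.tolist(),        # transposed, matching the code above
        'row_labels': df.columns.tolist(),
        'column_labels': df.index.tolist(),
    }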
def create_interactive_heatmap_de_genes_old(self, ctx, heatmapParams): """ :param heatmapParams: instance of type "heatmapParams" -> structure: parameter "sample1" of String, parameter "sample2" of String, parameter "q_value_cutoff" of Double, parameter "log2_fold_change_cutoff" of Double, parameter "num_genes" of Long, parameter "ws_cuffdiff_id" of type "ws_cuffdiff_id" (@id ws KBaseRNASeq.RNASeqCuffdiffdifferentialExpression), parameter "ws_expression_matrix_id1" of type "ws_expression_matrix_id" (@id ws KBaseFeatureValues.ExpressionMatrix), parameter "ws_expression_matrix_id2" of type "ws_expression_matrix_id" (@id ws KBaseFeatureValues.ExpressionMatrix), parameter "ws_cummerbund_output" of type "ws_cummerbund_output" (@id ws KBaseRNASeq.cummerbund_output) :returns: instance of type "ResultsToReport" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: returnVal # BEGIN create_interactive_heatmap_de_genes_old fparams = heatmapParams # returnVal = "ttt" # Set up workspace client user_token = ctx["token"] workspace = fparams["workspace_name"] ws_client = Workspace(url=self.__WS_URL, token=user_token) system_params = {} system_params["token"] = user_token system_params["ws_url"] = self.__WS_URL system_params["logger"] = self.__LOGGER system_params["shock_url"] = self.__SHOCK_URL system_params["hs_url"] = self.__HS_URL system_params["scratch"] = self.__SCRATCH system_params["rscripts"] = self.__RSCRIPTS system_params["workspace"] = workspace # Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{"name": fparams["ws_cuffdiff_id"], "workspace": fparams["workspace_name"]}]) # Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff") cuffdiff_dir = script_util2.extract_cuffdiff_data( self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token ) # cuffdiff_dir = "/kb/module/work/nnc/cuffdiff" self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) # if (cuffdiff_dir is False): # return returnVal fparams["cuffdiff_dir"] = cuffdiff_dir fparams["infile"] = join(cuffdiff_dir, "gene_exp.diff") fparams["outfile"] = join(system_params["scratch"], "gene_exp.diff.filter") fparams["pairs"] = 1 fparams["logModetmp"] = 2 rparams = {} rparams["cuffdiff_dir"] = fparams["cuffdiff_dir"] rparams["outpng"] = join(system_params["scratch"], "heatmap.png") rparams["imageheight"] = 1600 rparams["imagewidth"] = 800 rparams["plotscript"] = join(system_params["rscripts"], "heatmapplotinteractive.R") rparams["include_replicates"] = 1 rparams["pairs"] = fparams["pairs"] rparams["logMode"] = fparams["logModetmp"] rparams["removezeroes"] = 1 rparams["outmatrix"] = join(system_params["scratch"], "outmatrix") reportObj = {} provenance = [{}] if "provenance" in ctx: provenance = ctx["provenance"] # add additional info to provenance here, in this case the input data object reference provenance[0]["input_ws_objects"] = [workspace + "/" + fparams["ws_cuffdiff_id"]] report = "" if fparams["pairs"] != 0: try: filtered_matrix = script_util2.filter_expression_matrix(fparams, system_params) self.__LOGGER.info("matrix is " + filtered_matrix) fparams["infile"] = join(system_params["scratch"], "gene_exp.diff.filter") fparams["outfile"] = join(system_params["scratch"], "gene_exp.diff.filter.genelist") genelist_filtered_matrix_file = 
script_util2.get_gene_list_from_filter_step(fparams) rparams["genelist"] = filtered_matrix except: report += "There was an error in creating expression matrix" report += "No differentially expressed genes were found" report += "Please change / double check your filtering criteria" reportObj = {"objects_created": [], "text_message": report} reportName = "create_interactive_heatmap_de_genes_old_" + str(hex(uuid.getnode())) report_info = ws_client.save_objects( { "workspace": fparams["workspace_name"], "objects": [ { "type": "KBaseReport.Report", "data": reportObj, "name": reportName, "meta": {}, "hidden": 1, # important! make sure the report is hidden "provenance": provenance, } ], } )[0] print ("saved Report: " + pformat(report_info)) returnVal = { "report_name": reportName, "report_ref": str(report_info[6]) + "/" + str(report_info[0]) + "/" + str(report_info[4]), } return [returnVal] try: # Prepare output object. outjson = False roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic(rparams) # Run R script to run cummerbund json and update the cummerbund output json file # Prepare output object. outputobject = dict() # Prepare output plot list cummerbundplotset = [] # List of plots to generate plotlist = [ { "roptstr": roptstr_basic_heatmap_rep, "title": "Heatmap", "description": "Heatmap", "exp": fparams["ws_expression_matrix_id"], } ] fparams["cummerbundplotset"] = cummerbundplotset # Iterate through the plotlist and generate the images and json files. for plot in plotlist: fparams["title"] = plot["title"] fparams["description"] = plot["description"] status = script_util2.rplotanduploadinteractive(system_params, fparams, rparams, plot["roptstr"]) if status == False: self.__LOGGER.info("Problem generating image and json file - " + plot["roptstr"]) else: self.__LOGGER.info(status) outjson = status self.__LOGGER.info("xxxxxx1") with open("{0}/{1}".format(self.__SCRATCH, outjson), "r") as et2: eo2 = json.load(et2) genome_ref = s_res[0]["data"]["genome_id"] eo2["type"] = "log2_level" eo2["genome_ref"] = genome_ref self.__LOGGER.info("xxxxxx2") self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot["exp"]) res = ws_client.save_objects( { "workspace": workspace, "objects": [ {"type": "KBaseFeatureValues.ExpressionMatrix", "data": eo2, "name": plot["exp"]} ], } ) info = res[0] self.__LOGGER.info("done uploading exp") report = "Successfully created expression matrix" reportObj = { "objects_created": [ { "ref": str(info[6]) + "/" + str(info[0]) + "/" + str(info[4]), "description": "Expression matrix", } ], "text_message": report, } except: report += "There was an error in generating expression matrix" reportObj = {"objects_created": [], "text_message": report} reportName = "create_interactive_heatmap_de_genes_" + str(hex(uuid.getnode())) report_info = ws_client.save_objects( { "workspace": fparams["workspace_name"], "objects": [ { "type": "KBaseReport.Report", "data": reportObj, "name": reportName, "meta": {}, "hidden": 1, # important! make sure the report is hidden "provenance": provenance, } ], } )[0] print ("saved Report: " + pformat(report_info)) returnVal = { "report_name": reportName, "report_ref": str(report_info[6]) + "/" + str(report_info[0]) + "/" + str(report_info[4]), } # END create_interactive_heatmap_de_genes_old # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError( "Method create_interactive_heatmap_de_genes_old return value " + "returnVal is not type dict as required." ) # return the results return [returnVal]
def filter_genes(self, ctx, args): # ctx is the context object # return variables are: result #BEGIN filter_genes try: os.makedirs(self.RAWEXPR_DIR) except: pass try: os.makedirs(self.FLTRD_DIR) except: pass try: os.makedirs(self.FINAL_DIR) except: pass if self.logger is None: self.logger = script_utils.stderrlogger(__file__) result = {} self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = ctx['token'] eenv = os.environ.copy() eenv['KB_AUTH_TOKEN'] = token param = args from biokbase.workspace.client import Workspace ws = Workspace(url=self.__WS_URL, token=token) expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data'] cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, '--workspace_name', param['workspace_name'], '--object_name', param['object_name'], '--working_directory', self.RAWEXPR_DIR, '--output_file_name', self.EXPRESS_FN ] # need shell in this case because the java code is depending on finding the KBase token in the environment # -- copied from FVE_2_TSV tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: self.logger.info(stdout) if stderr is not None and len(stderr) > 0: self.logger.info(stderr) self.logger.info("Identifying differentially expressed genes") ## Prepare sample file # detect num of columns with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f: fl = f.readline() ncol = len(fl.split('\t')) # force to use ANOVA if the number of sample is two if(ncol == 3): param['method'] = 'anova' with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1,ncol-1): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_filter cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y'] if 'num_features' in param: cmd_coex_filter.append("-n") cmd_coex_filter.append(str(param['num_features'])) if 'p_value' in param: cmd_coex_filter.append("-p") cmd_coex_filter.append(str(param['p_value'])) if 'p_value' not in param and 'num_features' not in param: self.logger.error("One of p_value or num_features must be defined"); return empty_results("One of p_value or num_features must be defined", expr,self.__WS_URL, param, self.logger, ws) #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination #if 'p_value' in param and 'num_features' in param: # self.logger.error("Both of p_value and num_features cannot be defined together"); # sys.exit(3) tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: self.logger.info(stdout) if stderr is not None and len(stderr) > 0: self.logger.info(stderr) ## Header correction try: with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'r') as ff: fe = ff.readlines() with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'w') as ff: ff.write(fl) # use original first line that has correct header information fe.pop(0) ff.writelines(fe) except: self.logger.error("Output was not found"); return empty_results("Increase p_value or specify num_features", expr,self.__WS_URL, param, 
self.logger, ws) ## checking genelist with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh: gl = glh.readlines() gl = [x.strip('\n') for x in gl] if(len(gl) < 1) : self.logger.error("No genes are selected") return empty_results("Increase p_value or specify num_features", expr,self.__WS_URL, param, self.logger, ws) #sys.exit(4) ## Upload FVE # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing # Updates: change missing genome handling strategy by copying reference to working workspace cmd_upload_expr = [self.TSV_2_FVE, '--workspace_service_url', self.__WS_URL, '--object_name', param['out_expr_object_name'], '--working_directory', self.FINAL_DIR, '--input_directory', self.FLTRD_DIR, '--output_file_name', self.FINAL_FN ] tmp_ws = param['workspace_name'] if 'genome_ref' in expr: obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0] if len(obj_infos) < 1: self.logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref'])) raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref'])) #tmp_ws = "{0}".format(obj_infos[7]) self.logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7], obj_infos[1])) if obj_infos[7] != param['workspace_name']: #we need to copy it from the other workspace try: self.logger.info("trying to copy the referenced genome object : {0}".format(expr['genome_ref'])) ws.copy_object({'from' : {'ref' : expr['genome_ref']},'to' : {'workspace': param['workspace_name'], 'name' : obj_infos[1]}}) # add genome_object_name only after successful copy cmd_upload_expr.append('--genome_object_name') cmd_upload_expr.append(obj_infos[1]) except: # no permission or any issues... then, give up providing genome reference self.logger.info("".join(traceback.format_exc())) pass else: # it is local... 
we can simply add the reference without copying the genome cmd_upload_expr.append('--genome_object_name') cmd_upload_expr.append(obj_infos[1]) # updated ws name cmd_upload_expr.append('--workspace_name') cmd_upload_expr.append(tmp_ws) self.logger.info(" ".join(cmd_upload_expr)) tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True, env=eenv) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: self.logger.info(stdout) if stderr is not None and len(stderr) > 0: self.logger.info(stderr) with open("{0}/{1}".format(self.FINAL_DIR,self.FINAL_FN),'r') as et: eo = json.load(et) if 'description' not in expr: expr['description'] = "Filtered Expression Matrix" expr['description'] += " : Filtered by '{0}' method ".format(param['method']) if 'feature_mapping' in expr and 'feature_mapping' in eo: expr['feature_mapping'] = eo['feature_mapping'] expr['data'] = eo['data'] ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix', 'data' : expr, 'name' : (param['out_expr_object_name'])}]}) ## Upload FeatureSet fs ={'elements': {}} fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method']) fs['description'] += "from {0}/{1}".format(param['workspace_name'], param['object_name']) for g in gl: if 'genome_ref' in expr: fs['elements'][g] = [expr['genome_ref']] else: fs['elements'][g] = [] ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet', 'data' : fs, 'name' : (param['out_fs_object_name'])}]}) result = {'workspace_name' : param['workspace_name'], 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']} #END filter_genes # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method filter_genes return value ' + 'result is not type dict as required.') # return the results return [result]
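# Hedged sketch of the FeatureSet payload assembled above: 'elements' maps each
# gene id to a list of genome references (empty when no genome is attached).
# The gene list and reference below are placeholders, not real data.
genes = ["gene.1", "gene.2"]
genome_ref = "123/4/5"  # hypothetical workspace reference
feature_set = {
    "description": "FeatureSet identified by filtering method 'anova'",
    "elements": {g: [genome_ref] for g in genes},
}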
def generate_cummerbund_plots(self, ctx, cummerbundParams): # ctx is the context object # return variables are: returnVal #BEGIN generate_cummerbund_plots params = cummerbundParams returnVal = params['ws_cummerbund_output'] #Set up workspace client user_token = ctx['token'] ws_client = Workspace(url=self.__WS_URL, token=user_token) #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{ 'name' : params['ws_cuffdiff_id'], 'workspace' : params['workspace_name'] }]) # Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal # Get input data Shock Id and Filename. cuffdiff_shock_id = s_res[0]['data']['file']['id'] cuffdiff_file_name = s_res[0]['data']['file']['file_name'] #cuffdiff_file_name =None filesize = None # Download tar file dx = script_util.download_file_from_shock( self.__LOGGER, self.__SHOCK_URL, cuffdiff_shock_id, cuffdiff_file_name, self.__SCRATCH, filesize, user_token) #Decompress tar file and keep it in a directory tarfile = join(self.__SCRATCH, cuffdiff_file_name) dstnExtractFolder = join(self.__SCRATCH, "cuffdiffData") if not os.path.exists(dstnExtractFolder): os.makedirs(dstnExtractFolder) untarStatus = script_util2.untar_files(self.__LOGGER, tarfile, dstnExtractFolder) if untarStatus == False: self.__LOGGER.info("Problem extracting the archive") return returnVal foldersinExtractFolder = os.listdir(dstnExtractFolder) if len(foldersinExtractFolder) == 0: self.__LOGGER.info("Problem extracting the archive") return returnVal # Run R script to run cummerbund json and update the cummerbund output json file cuffdiff_dir = join(dstnExtractFolder, foldersinExtractFolder[0]) self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) # Prepare output object. outputobject=dict() # Prepare output plot list cummerbundplotset=[] # List of plots to generate plotlist = [ { 'file': "dispersionplot.R", 'title': "Dispersion plot", 'description': "Dispersion plot" }, { 'file': "pcaplot.R", 'title': "PCA plot", 'description': "PCA plot" }, { 'file': "fpkmscvplot.R", 'title': "FPKM SCV plot", 'description': "FPKM SCV plot" } ] # Iterate through the plotlist and generate the images and json files. for plot in plotlist: status = script_util2.rplotandupload(self.__LOGGER, self.__SCRATCH, self.__RSCRIPTS, plot['file'], self.__SHOCK_URL, self.__HS_URL, user_token, cummerbundplotset, plot['title'], plot['description'], cuffdiff_dir) if status == False: self.__LOGGER.info("Problem generating image and json file - " + plot["file"]) # Populate the output object outputobject['cummerbundplotSet'] = cummerbundplotset #TODO: Need to figure out how to get rnaseq experiment id outputobject['rnaseq_experiment_id'] = "rnaseq_experiment_id" outputobject['cuffdiff_input_id'] = params['ws_cuffdiff_id'] res = ws_client.save_objects({ "workspace":params['workspace_name'], "objects": [{ "type":"KBaseRNASeq.cummerbund_output", "data":outputobject, "name":params["ws_cummerbund_output"]}] }) #END generate_cummerbund_plots # At some point might do deeper type checking... if not isinstance(returnVal, basestring): raise ValueError('Method generate_cummerbund_plots return value ' + 'returnVal is not type basestring as required.') # return the results return [returnVal]
def view_heatmap(self, ctx, args): # ctx is the context object # return variables are: result #BEGIN view_heatmap try: os.makedirs(self.RAWEXPR_DIR) except: pass try: os.makedirs(self.FLTRD_DIR) except: pass try: os.makedirs(self.FINAL_DIR) except: pass if self.logger is None: self.logger = script_utils.stderrlogger(__file__) result = {} self.logger.info("Loading data") token = ctx['token'] eenv = os.environ.copy() eenv['KB_AUTH_TOKEN'] = token param = args from biokbase.workspace.client import Workspace ws = Workspace(url=self.__WS_URL, token=token) fc = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data'] if 'original_data' not in fc: raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix") oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0] df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids']) # cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, # '--workspace_name', oexpr['info'][7], # '--object_name', oexpr['info'][1], # '--working_directory', self.RAWEXPR_DIR, # '--output_file_name', self.EXPRESS_FN # ] # # # need shell in this case because the java code is depending on finding the KBase token in the environment # # -- copied from FVE_2_TSV # tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv) # stdout, stderr = tool_process.communicate() # # if stdout is not None and len(stdout) > 0: # self.logger.info(stdout) # # if stderr is not None and len(stderr) > 0: # self.logger.info(stderr) # # df = pd.read_csv("{0}/{1}".format(self.RAWEXPR_DIR,self.EXPRESS_FN), sep='\t') # df2 = df[df.columns[1:]] # rn = df[df.columns[0]] # df2.index = rn # L2 normalization df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0) # type - ? level, ratio, log-ratio <---> "untransformed" # scale - ? 
probably: raw, ln, log2, log10 self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] )) if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0": factor = 0.125 fc_df = df2 + df2[df2 !=0].abs().min().min() * factor if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,param['control_condition']], axis=0)).apply(np.log2) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2) else: fc_df = df2 if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,param['control_condition']], axis=0)) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)) if oexpr['data']['scale'] == "log10": fc_df = fc_df/np.log10(2) elif oexpr['data']['scale'] == "ln": fc_df = fc_df/np.log(2) else: pass elif oexpr['data']['type'] == 'ratio': fc_df = df2.apply(np.log2) elif oexpr['data']['type'] == 'log-ratio': fc_df = df2 if oexpr['data']['scale'] == "log10": fc_df = fc_df/np.log10(2) elif oexpr['data']['scale'] == "ln": fc_df = fc_df/np.log(2) else: pass else: # do the same thing with simple level or untransformed if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0": factor = 0.125 fc_df = df2 + df2[df2 !=0].abs().min().min() * factor if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,param['control_condition']], axis=0)).apply(np.log2) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2) else: fc_df = df2 if param['control_condition'] in fc_df.columns: fc_df = (fc_df.div(fc_df.loc[:,param['control_condition']], axis=0)) else: fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)) if oexpr['data']['scale'] == "log10": fc_df = fc_df/np.log10(2) elif oexpr['data']['scale'] == "ln": fc_df = fc_df/np.log(2) else: pass self.logger.info("Compute cluster statistics") cl = {} afs = []; cid = 1; c_stat = pd.DataFrame() for cluster in fc['feature_clusters']: try: fs = cluster['id_to_pos'].keys() except: continue # couldn't find feature_set fsn = "Cluster_{0}".format(cid) cid +=1 c_stat.loc[fsn,'size'] = len(fs) if 'meancor' in cluster: c_stat.loc[fsn,'mcor'] = cluster['meancor'] else: pass # TODO: Add mean cor calculation later #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN if 'quantile' in param: c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(float(param['quantile'])) else: c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75) c1 = df3.loc[fs,].sum(axis=0) if df3.loc[fs,].shape[0] < 1: # empty continue cl[fsn] = fs #afs.extend(fs) #c1 = df3.loc[fs,].sum(axis=0) #c1 = c1 / np.sqrt(c1.pow(2).sum()) #if(len(cl.keys()) == 1): # centroids = c1.to_frame(fsn).T #else: # centroids.loc[fsn] = c1 # now we have centroids and statistics # let's subselect clusters min_features = 200 if 'min_features' in param : min_features = param['min_features'] c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max() c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max() if 'use_norm_weight' in param and param['use_norm_weight'] != 0: if 'quantile_weight' in param: c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat'] else:
c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0 * c_stat.loc[:,'nstdstat'] else: if 'quantile_weight' in param: c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat'] else: c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1 * c_stat.loc[:,'stdstat'] c_stat.sort_values('weight', inplace=True, ascending=False) pprint(c_stat) centroids = pd.DataFrame() for i in range(c_stat.shape[0]): fsn = c_stat.index[i] fs = cl[fsn] if i != 0 and len(afs) + len(fs) > min_features : break; afs.extend(fs) c1 = df3.loc[fs,].sum(axis=0) c1 = c1 / np.sqrt(c1.pow(2).sum()) if(centroids.shape[0] < 1): centroids = c1.to_frame(fsn).T else: centroids.loc[fsn] = c1 pprint(centroids) if len(cl.keys()) == 0: raise Exception("No feature ids were mapped to dataset or no clusters were selected") # dataset centroid dc = df3.loc[afs,].sum(axis=0) dc = dc / np.sqrt(dc.pow(2).sum()) self.logger.info("Ordering Centroids and Data") # the most far away cluster centroid from dataset centroid fc = (centroids * dc).sum(axis=1).idxmin() # the most far away centroid centroid from fc ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin() # major direction to order on unit ball space md = centroids.loc[ffc,] - centroids.loc[fc,] # unnormalized component of projection to the major direction (ignored md quantities because it is the same to all) corder = (centroids * md).sum(axis=1).sort_values() # cluster order coidx = corder.index dorder =(df3.loc[afs,] * md).sum(axis=1).sort_values() # data order # get first fs table fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []} fig_properties['ygtick_labels'] = coidx.tolist() if 'fold_change' in param and param['fold_change'] == 1: frange = 2 if 'fold_change_range' in param: frange = float(param['fold_change_range']) final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,] fig_properties['ygroup'].append(final.shape[0]) for i in range(1,len(coidx)): tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,] fig_properties['ygroup'].append(tf.shape[0]) final = final.append(tf) if 'fold_cutoff' in param and param['fold_cutoff'] == 1: final[final > frange] = frange final[final < - frange] = - frange else: fc_df0b = final.sub(final.min(axis=1), axis=0) final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange else: final=df2.loc[dorder.loc[cl[coidx[0]],].index,] fig_properties['ygroup'].append(final.shape[0]) for i in range(1,len(coidx)): tf = df2.loc[dorder.loc[cl[coidx[i]],].index,] fig_properties['ygroup'].append(tf.shape[0]) final = final.append(tf) ## loading pvalue distribution FDT fdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]}; #fdt = OrderedDict(fdt) fdt['data'] = final.T.as_matrix().tolist() # make sure Transpose fdt['row_labels'] = final.columns.tolist() fdt['column_labels'] = final.index.tolist() # TODO: Add group label later fdt['id'] = "{0}.fdt".format(param['out_figure_object_name']) self.logger.info("Saving the results") sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'MAK.FloatDataTable', 'data' : fdt, 'name' : "{0}.fdt".format(param['out_figure_object_name'])}]}) data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4]) fig_properties['data_ref'] = data_ref sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'CoExpression.FigureProperties', 'data' : fig_properties, 'name' : 
(param['out_figure_object_name'])}]}) result = fig_properties #END view_heatmap # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method view_heatmap return value ' + 'result is not type dict as required.') # return the results return [result]
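# Hedged pandas sketch of the two core transforms in view_heatmap: row-wise L2
# normalization (each feature scaled to unit length) and log2 fold change
# against a control column. Toy values; the frame and labels are illustrative.
import numpy as np
import pandas as pd

df = pd.DataFrame([[1.0, 2.0, 4.0], [2.0, 2.0, 2.0]],
                  index=["g1", "g2"], columns=["c0", "c1", "c2"])
l2 = df.div(df.pow(2).sum(axis=1).pow(0.5), axis=0)  # unit-length rows
fc = df.div(df["c0"], axis=0).apply(np.log2)         # log2 ratio vs control c0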
def run_filter_genes(workspace_service_url=None, param_file=None, level=logging.INFO, logger=None): """ Narrative Job Wrapper script to execute coex_filter Args: workspace_service_url: A url for the KBase Workspace service param_file: parameter file object_name: Name of the object in the workspace level: Logging level, defaults to logging.INFO. Returns: Output is written back in WS Authors: Shinjae Yoo """ try: os.makedirs(RAWEXPR_DIR) except: pass try: os.makedirs(FLTRD_DIR) except: pass try: os.makedirs(FINAL_DIR) except: pass if logger is None: logger = script_utils.stderrlogger(__file__) logger.info( "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = os.environ.get("KB_AUTH_TOKEN") with open(param_file) as paramh: param = json.load(paramh) cmd_dowload_cvt_tsv = [ FVE_2_TSV, '--workspace_service_url', workspace_service_url, '--workspace_name', param['workspace_name'], '--object_name', param['object_name'], '--working_directory', RAWEXPR_DIR, '--output_file_name', EXPRESS_FN ] # need shell in this case because the java code is depending on finding the KBase token in the environment # -- copied from FVE_2_TSV tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) logger.info("Identifying differentially expressed genes") ## Prepare sample file # detect num of columns with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f: fl = f.readline() ncol = len(fl.split('\t')) with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1, ncol - 1): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_filter cmd_coex_filter = [ COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y' ] if 'num_features' in param: cmd_coex_filter.append("-n") cmd_coex_filter.append(param['num_features']) if 'num_features' not in param and 'p_value' in param: cmd_coex_filter.append("-p") cmd_coex_filter.append(param['p_value']) if 'p_value' not in param and 'num_features' not in param: logger.error("One of p_value or num_features must be defined") sys.exit(2) #if 'p_value' in param and 'num_features' in param: # logger.error("Both of p_value and num_features cannot be defined together"); # sys.exit(3) tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) ## Header correction with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff: fe = ff.readlines() with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff: ff.write( fl) # use original first line that has correct header information fe.pop(0) ff.writelines(fe) ## Upload FVE from biokbase.workspace.client import Workspace ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN']) expr = ws.get_objects([{ 'workspace': param['workspace_name'], 'name': param['object_name'] }])[0]['data'] # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing cmd_upload_expr = [ TSV_2_FVE, '--workspace_service_url', workspace_service_url, '--object_name', 
param['out_expr_object_name'], '--working_directory', FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name', FINAL_FN ] tmp_ws = param['workspace_name'] if 'genome_ref' in expr: cmd_upload_expr.append('--genome_object_name') obj_infos = ws.get_object_info_new( {"objects": [{ 'ref': expr['genome_ref'] }]})[0] if len(obj_infos) < 1: logger.error("Couldn't find {0} from the workspace".format( expr['genome_ref'])) raise Exception("Couldn't find {0} from the workspace".format( expr['genome_ref'])) cmd_upload_expr.append(obj_infos[1]) tmp_ws = obj_infos[7] logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws, obj_infos[1])) # updated ws name cmd_upload_expr.append('--workspace_name') cmd_upload_expr.append(tmp_ws) tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: logger.info(stdout) if stderr is not None and len(stderr) > 0: logger.info(stderr) with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et: eo = json.load(et) if 'description' in expr: expr['description'] = "{0}, coex_filter by {1}".format( expr['description'], " ".join(cmd_coex_filter)) if 'feature_mapping' in expr: expr['feature_mapping'] = eo['feature_mapping'] expr['data'] = eo['data'] ws.save_objects({ 'workspace': param['workspace_name'], 'objects': [{ 'type': 'KBaseFeatureValues.ExpressionMatrix', 'data': expr, 'name': (param['out_expr_object_name']) }] }) ## Upload FeatureSet fs = { 'description': 'Differentially expressed genes generated by {0}'.format( " ".join(cmd_coex_filter)), 'elements': {} } with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh: gl = glh.readlines() gl = [x.strip('\n') for x in gl] for g in gl: if 'genome_ref' in expr: fs['elements'][g] = [expr['genome_ref']] else: fs['elements'][g] = [] ws.save_objects({ 'workspace': param['workspace_name'], 'objects': [{ 'type': 'KBaseCollections.FeatureSet', 'data': fs, 'name': (param['out_fs_object_name']) }] })
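# The sample file written above is a single line of tab-separated column
# indices that coex_filter uses as group labels. A minimal equivalent sketch;
# the path and column count are hypothetical:
ncol = 5  # number of columns detected in the expression TSV header
with open("sample.txt", "wt") as s:
    s.write("\t".join(str(j) for j in range(ncol - 1)) + "\n")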
def net_clust (args) : ### # download ws object and convert them to csv wsd = Workspace(url=args.ws_url, token=os.environ.get('KB_AUTH_TOKEN')) lseries = wsd.get_object({'id' : args.inobj_id, 'type' : 'KBaseExpression.ExpressionSeries', 'workspace' : args.ws_id})['data'] if lseries is None: raise COEXException("Object {} not found in workspace {}".format(args.inobj_id, args.ws_id)) samples, sids, genome_id = {}, [], "" # assume only one genome id for gid in sorted(lseries['genome_expression_sample_ids_map'].keys()): genome_id = gid for samid in lseries['genome_expression_sample_ids_map'][gid]: sids.append({'ref': samid}) samples = wsd.get_objects(sids) break cif = open(args.exp_fn, 'w') header = ",".join([s['data']['source_id'] for s in samples]) cif.write(header + "\n") gids = samples[0]['data']['expression_levels'].keys() # each sample has the same gids for gid in sorted(gids): line = gid + "," line += ",".join([str(s['data']['expression_levels'][gid]) for s in samples]) cif.write(line + "\n") cif.close() ### # generate network and cluster net_cmd_lst = ['coex_net', '-i', args.exp_fn] if (args.nmethod is not None): net_cmd_lst.append("-m") net_cmd_lst.append(args.nmethod) if (args.cut_off is not None): net_cmd_lst.append("-c") net_cmd_lst.append(args.cut_off) if (args.net_fn is not None): net_cmd_lst.append("-o") net_cmd_lst.append(args.net_fn) p1 = Popen(net_cmd_lst, stdout=PIPE) out_str = p1.communicate() # print output message for error tracking if out_str[0] is not None : print out_str[0] if out_str[1] is not None : print >> sys.stderr, out_str[1] net_cmd = " ".join(net_cmd_lst) clust_cmd_lst = ['coex_cluster2', '-i', args.exp_fn] if (args.cmethod is not None): clust_cmd_lst.append("-c") clust_cmd_lst.append(args.cmethod) if (args.nmethod is not None): clust_cmd_lst.append("-n") clust_cmd_lst.append(args.nmethod) if (args.k is not None): clust_cmd_lst.append("-s") clust_cmd_lst.append(args.k) if (args.clust_fn is not None): clust_cmd_lst.append("-o") clust_cmd_lst.append(args.clust_fn) p1 = Popen(clust_cmd_lst, stdout=PIPE) out_str = p1.communicate() if out_str[0] is not None : print out_str[0] if out_str[1] is not None : print >> sys.stderr, out_str[1] clust_cmd = " ".join(clust_cmd_lst) ### # Create network object #generate Networks datasets net_ds_id = args.inobj_id + ".net" clt_ds_id = args.inobj_id + ".clt" datasets = [ { 'network_type' : 'FUNCTIONAL_ASSOCIATION', 'taxons' : [ genome_id ], 'source_ref' : 'WORKSPACE', 'name' : net_ds_id, 'id' : net_ds_id, 'description' : "Coexpression network object of " + args.inobj_id, 'properties' : { 'original_data_type' : 'workspace', 'original_ws_id' : args.ws_id, 'original_obj_id' : args.inobj_id, 'coex_net_cmd' : net_cmd } }, { 'network_type' : 'FUNCTIONAL_ASSOCIATION', 'taxons' : [ genome_id ], 'source_ref' : 'WORKSPACE', 'name' : clt_ds_id, 'id' : clt_ds_id, 'description' : "Coexpression cluster object of " + args.inobj_id, 'properties' : { 'original_data_type' : 'workspace', 'original_ws_id' : args.ws_id, 'original_obj_id' : args.inobj_id, 'coex_clust_cmd' : clust_cmd } } ] # process coex network file nc = Node() cnf = open(args.net_fn,'r'); cnf.readline(); # skip header for line in cnf : line = line.strip(); line = line.replace('"','') values = line.split(',') if values[0] != values[1] : nc.add_edge(float(values[2]), net_ds_id, values[0], 'GENE', values[1], 'GENE', 0.0) # only add meaningful (non-self) edges # process coex cluster file cnf = open(args.clust_fn,'r') cnf.readline(); # skip header for line in cnf : line = line.strip(); line = line.replace('"','') values = line.split(',')
nc.add_edge(1.0, clt_ds_id, values[0], 'GENE', "cluster." + values[1], 'CLUSTER', 0.0) # generate Networks object net_object = { 'datasets' : datasets, 'nodes' : nc.nodes, 'edges' : nc.edges, 'user_annotations' : {}, 'name' : 'Coexpression Network', 'id' : args.outobj_id, 'properties' : { 'graphType' : 'edu.uci.ics.jung.graph.SparseMultigraph' } } # Store results object into workspace wsd.save_objects({'workspace' : args.ws_id, 'objects' : [{'type' : 'KBaseNetworks.Network', 'data' : net_object, 'name' : args.outobj_id, 'meta' : {'org_obj_id' : args.inobj_id, 'org_ws_id' : args.ws_id}}]}) if(args.del_tmps == "true") : os.remove(args.exp_fn) os.remove(args.net_fn) os.remove(args.clust_fn)
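# Hedged sketch of the coex_net CSV parsing above using the csv module, which
# handles the quote stripping automatically; the file name stands in for the
# coex_net output and the three-column layout mirrors the code above.
import csv

edges = []
with open("coex_net.csv") as cnf:
    reader = csv.reader(cnf)
    next(reader)  # skip the header row
    for a, b, w in reader:
        if a != b:  # drop self-loops, as net_clust does
            edges.append((a, b, float(w)))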
def create_interactive_heatmap_de_genes(self, ctx, interactiveHeatmapParams): # ctx is the context object # return variables are: returnVal #BEGIN create_interactive_heatmap_de_genes fparams = interactiveHeatmapParams #returnVal = "ttt" #Set up workspace client user_token = ctx['token'] workspace = fparams['workspace_name'] ws_client = Workspace(url=self.__WS_URL, token=user_token) system_params = {} system_params['token'] = user_token system_params['ws_url'] = self.__WS_URL system_params['logger'] = self.__LOGGER system_params['shock_url'] = self.__SHOCK_URL system_params['hs_url'] = self.__HS_URL system_params['scratch'] = self.__SCRATCH system_params['rscripts'] = self.__RSCRIPTS system_params['workspace'] = workspace #Read the input cuffdiff workspace object json file and get filehandle for cuffdiff tar file s_res = ws_client.get_objects([{ 'name': fparams['ws_cuffdiff_id'], 'workspace': fparams['workspace_name'] }]) #Check if workspace has data if len(s_res) == 0: self.__LOGGER.info("Workspace did not return any objects") return returnVal cuffdiff_dir = join(self.__SCRATCH, "cuffdiffData/cuffdiff") cuffdiff_dir = script_util2.extract_cuffdiff_data( self.__LOGGER, self.__SHOCK_URL, self.__SCRATCH, s_res, user_token) #cuffdiff_dir = "/kb/module/work/nnc/cuffdiff" self.__LOGGER.info("Cuffdiff folder = " + cuffdiff_dir) #if (cuffdiff_dir is False): # return returnVal fparams['cuffdiff_dir'] = cuffdiff_dir fparams['infile'] = join(cuffdiff_dir, "gene_exp.diff") fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter") filtered_matrix = script_util2.filter_expression_matrix( fparams, system_params) self.__LOGGER.info("matrix is " + filtered_matrix) fparams['infile'] = join(system_params['scratch'], "gene_exp.diff.filter") fparams['outfile'] = join(system_params['scratch'], "gene_exp.diff.filter.genelist") genelist_filtered_matrix_file = script_util2.get_gene_list_from_filter_step( fparams) # Prepare output object. outjson = False rparams = {} rparams['genelist'] = filtered_matrix rparams['cuffdiff_dir'] = fparams['cuffdiff_dir'] rparams['outpng'] = join(system_params['scratch'], "heatmap.png") rparams['imageheight'] = 1600 rparams['imagewidth'] = 800 rparams['plotscript'] = join(system_params['rscripts'], "heatmapplotinteractive.R") rparams['include_replicates'] = 1 rparams['outmatrix'] = join(system_params['scratch'], "outmatrix") roptstr_basic_heatmap_rep = script_util2.get_command_line_heatmap_basic( rparams) # Run R script to run cummerbund json and update the cummerbund output json file # Prepare output object. outputobject = dict() # Prepare output plot list cummerbundplotset = [] # List of plots to generate plotlist = [{ 'roptstr': roptstr_basic_heatmap_rep, 'title': "Heatmap", 'description': "Heatmap", 'exp': fparams['ws_expression_matrix_id'] }] fparams['cummerbundplotset'] = cummerbundplotset # Iterate through the plotlist and generate the images and json files. 
for plot in plotlist: fparams['title'] = plot['title'] fparams['description'] = plot['description'] status = script_util2.rplotanduploadinteractive( system_params, fparams, rparams, plot['roptstr']) if status == False: self.__LOGGER.info( "Problem generating image and json file - " + plot["roptstr"]) else: self.__LOGGER.info(status) outjson = status with open("{0}/{1}".format(self.__SCRATCH, outjson), 'r') as et2: eo2 = json.load(et2) genome_ref = s_res[0]['data']['genome_id'] eo2['type'] = 'untransformed' #eo2['genome_ref'] = genome_ref self.__LOGGER.info(workspace + self.__SCRATCH + outjson + plot['exp']) ws_client.save_objects({ 'workspace': workspace, 'objects': [{ 'type': 'KBaseFeatureValues.ExpressionMatrix', 'data': eo2, 'name': plot['exp'] }] }) returnVal = fparams['ws_expression_matrix_id'] #END create_interactive_heatmap_de_genes # At some point might do deeper type checking... if not isinstance(returnVal, basestring): raise ValueError( 'Method create_interactive_heatmap_de_genes return value ' + 'returnVal is not type basestring as required.') # return the results return [returnVal]
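# Compatibility note: the isinstance checks against 'basestring' in this file
# are Python 2 only ('basestring' is undefined in Python 3). A hedged shim for
# running the same checks under either interpreter:
try:
    string_types = basestring  # Python 2
except NameError:
    string_types = str         # Python 3
# ... then: isinstance(returnVal, string_types)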
def run_Coveringarray(self, ctx, params): """ This example function accepts any number of parameters and returns results in a KBaseReport :param params: instance of mapping from String to unspecified object :returns: instance of type "ReportResults" -> structure: parameter "report_name" of String, parameter "report_ref" of String """ # ctx is the context object # return variables are: output #BEGIN run_Coveringarray # for each 'container_object' iterate for each option, # sum object options and number of objects to create strength, and factors numbers strength = 2 valueList = [] nameList = {} sampleSize = 0 # turn namelist into a dict, assign name with value of len(opt2) at assignment time # [params] # [container_object] # [variable length x1,x2,xn] # [name] # [values] # [variable length x1,x2,xn] # container_object is a list due to 'allow multiple' = true # each entry in container_object list has its own grouping of settings 1,2,3 # all entry forms are free form text boxes associated with a known id # of measuring strenth and factors through volume of known id, # and pairs through combinations of known id, # user input never is used in the program to keep track of order # after coveringarray output is obtained, the container object list is used # to swap id with text form entries. # strength = params["strength"] strength = int(params['option_0']) #try catch for lack in media object failure if params['input_media'] == "" or params['input_media'] is None: # flake8 change try: for setting in params['container_object']: if setting['option_1'] != "": nameList[setting['option_1']] = len(setting['option_2']) for option in setting['option_2']: valueList.append(option) except: print("Failed to read in non-media input") # each params["container_object"][x] is a has a list with a name # and another list of strings else: #try catch for media object retreival failure try: medianame = params['workspace_name']+"/"+str(params['input_media']) media = self.dfu.get_objects({'object_refs': [medianame]})['data'][0]['data'] # print('\n\n ======' + str(media.items()) + '=======\n\n') # for modnames in params['container_object'] # if modnames['option_0'] == compound['name'] # compo print(media['id']) mediaComps = media.get("mediacompounds") # print('\n\n ======' + str(mediaComps.items()) + '=======\n\n') crefMatch = 0 print("\n\n==cref match init"+"==\n\n") except: print("Media read in failure") try: if params['evaluation_options'] == 'append_media': print("\n\n== Append Element Mode ==\n\n") for compound in mediaComps: cref = compound['compound_ref'].split("/")[-1] nameList[cref] = 2 valueList.append(compound['maxFlux']) valueList.append(-100) for setting in params['container_object']: if setting['option_1'] != "": nameList[setting['option_1']] = len(setting['option_2']) for option in setting['option_2']: valueList.append(option) except: print("Append media option failure") try: if params['evaluation_options'] == 'overwrite_media': ow = 0 print("\n\n== Overwrite Media Elements Mode ==\n\n") for compound in mediaComps: ow = 0 cref = compound['compound_ref'].split("/")[-1] for setting in params['container_object']: if cref == setting['option_1']: ow = 1 nameList[cref] = len(setting['option_2']) for value in setting['option_2']: valueList.append(value) if ow == 0: nameList[cref] = 2 valueList.append(compound['maxFlux']) valueList.append(-100) except: print("Overwrite media option failure") try: if params['evaluation_options'] == 'isolate_media': print("\n\n== Isolate Media Elements Mode ==\n\n") for compound in 
mediaComps: cref = compound['compound_ref'].split("/")[-1] for setting in params['container_object']: if cref == setting['option_1']: nameList[cref] = 2 valueList.append(compound['maxFlux']) valueList.append(-100) except: print("Isolate media option failure") sampleSize = len(nameList) print("\n\n== samplesize adjusted to " + str(sampleSize) + " ==\n\n") formattedParams = str(strength) + '\n' + str(sampleSize) + '\n' for name in nameList: formattedParams += str(nameList[name]) + ' 1\n' inputfile = open("inputfile.txt", 'w') inputfile.write(formattedParams) inputfile.close() inputfile = open("inputfile.txt", 'r') print("\n\n============== Formatted Input Begin ===============\n\n") for line in inputfile: print(line) inputfile.close() print("\n\n============== Formatted Input End ===============\n\n") try: os.system('/kb/module/./cover inputfile.txt -F') outputfile = open("anneal.out", 'r') rawout = " " for line in outputfile: rawout += line outputfile.close() outputfile = open("anneal.out", 'r') except: print("Wrapped cover tool failure") finaloutputText = " " trimmedOutFile = "" #if json out do this elif media out do that else matrixData = { "row_ids":[], "column_ids":[], "row_labels":['combinations'], "column_labels":['compounds'], "row_groups_ids":['1'], "column_groups_ids":['1'], "data":[[]] } for name in nameList: finaloutputText += name finaloutputText += " " matrixData["column_ids"].append(name) finaloutputText += "\n ==================== \n" # count by line instead, look for empty line followed by length 1 line to start matrixReadFlag = 0 outPutLead = 0 n=1 for line in outputfile: if outPutLead != 0 and matrixReadFlag == 10: matrixData["row_ids"].append('row'+str(n)) n+=1 for c in line.split(): if len(line) > 2 and c != str(outPutLead): finaloutputText += str(valueList[int(c)]) finaloutputText += "," trimmedOutFile += str(valueList[int(c)]) trimmedOutFile += "," else: finaloutputText += c finaloutputText += "," trimmedOutFile += c trimmedOutFile += "," finaloutputText = finaloutputText[:-1] finaloutputText += "\n" if matrixReadFlag == 3: outPutLead = line print(outPutLead) print("\n\n" + line + "\n\n") finaloutputText += "Sample Size: " + outPutLead + " \n" matrixReadFlag = 10 if(line == "\n" and len(line) == 1): matrixReadFlag += 1 matrixData["data"]=[[] for i in range(len(matrixData["row_ids"]))] listversion = [n.strip() for n in trimmedOutFile.split(',')] for row in range(len(matrixData["row_ids"])): for column in range(len(matrixData["column_ids"])): matrixData["data"][row].append(listversion[column+(row)*len(matrixData["column_ids"])]) if params['evaluation_options'] == 'isolate_media': unchangedmedialist = [] for compound in mediaComps: cref = compound['compound_ref'].split("/")[-1] if cref not in matrixData['column_ids']: unchangedmedialist.append([cref,compound['maxFlux']]) for item in unchangedmedialist: matrixData['column_ids'].append(item[0]) for row in matrixData["data"]: row.append(item[1]) #replace finaloutput text script with sourcing from matrixdata print("\n\n\n FINAL OUTPUT\n" + finaloutputText + "\nFINAL OUTPUT \n\n\n" + rawout) if params['output_media'] is not None or params['output_json_check'] == 1: workspaceClient = Workspace(self.workspaceURL,token = ctx['token']) #try catch for json object creation try: matrixObject = workspaceClient.save_objects({'workspace': params['workspace_name'], 'objects': [{'name':params['output_media'], 'type':'MAK.StringDataTable', 'data': matrixData}] }) except: print("JSON out object creation") test_media = { 
'mediacompounds':[{'compound_ref':'testref1','concentration':100,'minFlux':0,'maxFlux':0},{'compound_ref':'testref2','concentration':100,'minFlux':100,'maxFlux':100}], 'isMinimal':0, 'isDefined':0, 'type':'Undefined', 'name':'testname', 'id':'testid' } # def __copy__(self): # return MediaCompound(self.compound_ref,self.concentration,self.minFlux,self.maxFlux) #def __deepcopy__(self,memo): # return MediaCompound(copy.deepcopy(self.compound_ref,self.concentration,self.minFlux,self.maxFlux,memo)) ##IDEAS 10/28/21: give a default value for compound_reference. ##remove deepcopies and pass reference to preserve original object #call workspace save on each piece before assembling def make_compound(compound_ref,concentration,minFlux,maxFlux): mediaCompound = { 'compound_ref': "489/6/8/"+"compounds/"+"id/"+compound_ref, ##KBaseBiochem.Biochemistry.compounds.*.id 'concentration':concentration, #first section is the workspace name (KBaseBiochem -> workspace name); I think it uses the workspace client get_objects2 in order to fetch, check the get_objects2 API! 'minFlux':minFlux, 'maxFlux':maxFlux } #potential reason for "reference cpdxxx unrecognized" error: media object metadata shows "null" for the extracted ids field and no data about the compounds return mediaCompound #CAUSES: reference data/pointers lost in the media creation process: solution, more deepcopies if params['output_media'] is not None and params['output_media_check'] == 1: media_compounds_data = [] media_data = {} media_data_list = [] for index1, case in enumerate(matrixData['data']): media_compounds_data = []##BELOW ISSUE: On CDG tests, sizes go from 20 -> 9 on 3 compound isolations, why are ~50% of reactions changing to sub 0? for index2, compound in enumerate(case): ###BELOW: Maybe? Is object creation tied to the test suite? I don't remember... if float(compound) > 0: ##Compound filtering for trimmed makeups will conflict with the test suite's expected outcome of the Covering Array Tool media_compound = make_compound(matrixData['column_ids'][index2],.001,-100,float(compound)) media_compounds_data.append(copy.deepcopy(media_compound)) media_data = { 'mediacompounds':copy.deepcopy(media_compounds_data), 'isMinimal':0, 'isDefined':0, 'type':'Undefined', 'name':params['output_media']+str(index1), 'id':params['output_media']+str(index1), 'sourceid':params['output_media']+str(index1) } media_data_list.append(copy.deepcopy(media_data)) for index,media in enumerate(media_data_list): try: workspaceClient.save_objects({'workspace': params['workspace_name'], 'objects': [{'name':media['name'], 'type':'KBaseBiochem.Media', 'data': media}] }) except: print("\n\n ERROR TRACE: \n\n" + traceback.format_exc()+'\n\n') print("KBaseBiochem.Media output object creation failure") print("Media " + str(media['name']) + " Keys:\n" + str(media.keys())+'\n') print("Media " + str(media['name']) + " Values:\n" +'\n') for x,value in enumerate(media['mediacompounds']): print("Media compound "+ str(x) + ": "+ str(media['mediacompounds'][x]) +"\n") print("Other media properties: "+str(media['isMinimal'])+ ' '+ str(media['isDefined'])+' ' + str(media['type'])+' ' + media['name']+' ' + media['id'])
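# Hedged illustration of what the wrapped 'cover' tool guarantees: in a
# strength-2 covering array, every pair of factors exhibits every combination
# of levels in at least one row. A tiny hand-made binary example, checked
# with itertools (toy data, independent of the tool's actual output format):
from itertools import combinations, product

rows = [(0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 0)]  # 3 binary factors, 4 runs
for i, j in combinations(range(3), 2):
    seen = {(r[i], r[j]) for r in rows}
    assert seen == set(product([0, 1], repeat=2))  # all 4 level pairs covered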
def diff_p_distribution(self, ctx, args): # ctx is the context object # return variables are: result #BEGIN diff_p_distribution try: os.makedirs(self.RAWEXPR_DIR) except: pass try: os.makedirs(self.FLTRD_DIR) except: pass try: os.makedirs(self.FINAL_DIR) except: pass if self.logger is None: self.logger = script_utils.stderrlogger(__file__) result = {} self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = ctx['token'] eenv = os.environ.copy() eenv['KB_AUTH_TOKEN'] = token param = args auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL) user_id = auth_client.get_user(token) workspace_name_t = Template(param['workspace_name']) workspace_name = workspace_name_t.substitute(user_id=user_id) from biokbase.workspace.client import Workspace ws = Workspace(url=self.__WS_URL, token=token) expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data'] self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN) self.logger.info("Identifying differentially expressed genes") ## Prepare sample file # detect num of columns ncol = len(expr['data']['col_ids']) # force to use ANOVA if the number of sample is two if(ncol == 3): param['method'] = 'anova' with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1,ncol): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_filter cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), '-m', param['method'], '-n', '10', '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y', '-j', self.PVFDT_FN] if 'num_features' in param: cmd_coex_filter.append("-n") cmd_coex_filter.append(str(param['num_features'])) if 'p_value' in param: cmd_coex_filter.append("-p") cmd_coex_filter.append(str(param['p_value'])) tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: self.logger.info(stdout) if stderr is not None and len(stderr) > 0: self.logger.info(stderr) ## loading pvalue distribution FDT pvfdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]}; pvfdt = OrderedDict(pvfdt) with open(self.PVFDT_FN, 'r') as myfile: pvfdt = json.load(myfile) data_obj_name = "{0}.fdt".format(param['out_figure_object_name']) pvfdt['id'] = data_obj_name fig_properties = {"xlabel" : "-log2(p-value)", "ylabel" : "Number of features", "xlog_mode" : "-log2", "ylog_mode" : "none", "title" : "Histogram of P-values", "plot_type" : "histogram"} sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable', 'data' : pvfdt, 'name' : data_obj_name}]}) data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4]) fig_properties['data_ref'] = data_ref sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties', 'data' : fig_properties, 'name' : (param['out_figure_object_name'])}]}) result = fig_properties #END diff_p_distribution # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method diff_p_distribution return value ' + 'result is not type dict as required.') # return the results return [result]
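# Sketch of the figure the FloatDataTable above feeds: a histogram of
# -log2(p-values). Toy p-values; numpy only, no plotting dependency assumed.
import numpy as np

pvals = np.array([0.5, 0.05, 0.01, 0.001])
scores = -np.log2(pvals)
counts, bin_edges = np.histogram(scores, bins=4)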
def filter_genes(self, ctx, args): # ctx is the context object # return variables are: result #BEGIN filter_genes try: os.makedirs(self.RAWEXPR_DIR) except: pass try: os.makedirs(self.FLTRD_DIR) except: pass try: os.makedirs(self.FINAL_DIR) except: pass if self.logger is None: self.logger = script_utils.stderrlogger(__file__) result = {} self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = ctx['token'] eenv = os.environ.copy() eenv['KB_AUTH_TOKEN'] = token param = args auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL) user_id = auth_client.get_user(token) workspace_name_t = Template(param['workspace_name']) workspace_name = workspace_name_t.substitute(user_id=user_id) provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] provenance[0]['input_ws_objects']=[workspace_name+'/'+param['object_name']] from biokbase.workspace.client import Workspace ws = Workspace(url=self.__WS_URL, token=token) expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data'] self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN) self.logger.info("Identifying differentially expressed genes") ## Prepare sample file # detect num of columns ncol = len(expr['data']['col_ids']) # force to use ANOVA if the number of sample is two if(ncol == 3): param['method'] = 'anova' with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1,ncol): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_filter cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y'] if 'num_features' in param: cmd_coex_filter.append("-n") cmd_coex_filter.append(str(param['num_features'])) if 'p_value' in param: cmd_coex_filter.append("-p") cmd_coex_filter.append(str(param['p_value'])) if 'p_value' not in param and 'num_features' not in param: self.logger.error("One of p_value or num_features must be defined"); return error_report("One of p_value or num_features must be defined", expr,self.__WS_URL, workspace_name, provenance, ws) #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination #if 'p_value' in param and 'num_features' in param: # self.logger.error("Both of p_value and num_features cannot be defined together"); # sys.exit(3) tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: self.logger.info(stdout) if stderr is not None and len(stderr) > 0: self.logger.info(stderr) ## checking genelist with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh: gl = glh.readlines() gl = [x.strip('\n') for x in gl] if(len(gl) < 1) : self.logger.error("No genes are selected") return error_report("Increase p_value or specify num_features", expr,self.__WS_URL, workspace_name, provenance, ws) #sys.exit(4) ## Upload FVE if 'description' not in expr: expr['description'] = "Filtered Expression Matrix" expr['description'] += " : Filtered by '{1}' method ".format(expr['description'], param['method']) expr = self._subselectExp(expr, gl) ex_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix', 'data' : expr, 'name' : (param['out_expr_object_name'])}]})[0] ## Upload FeatureSet fs 
={'elements': {}} fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method']) fs['description'] += "from {0}/{1}".format(workspace_name, param['object_name']) for g in gl: if 'genome_ref' in expr: fs['elements'][g] = [expr['genome_ref']] else: fs['elements'][g] = [] fs_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseCollections.FeatureSet', 'data' : fs, 'name' : (param['out_fs_object_name'])}]})[0] ## Create report object: report = "Filtering expression matrix using {0} on {1}".format(param['method'],param['object_name']) reportObj = { 'objects_created':[{ 'ref':"{0}/{1}/{2}".format(fs_info[6], fs_info[0], fs_info[4]), 'description':'Filtered FeatureSet' }, { 'ref':"{0}/{1}/{2}".format(ex_info[6], ex_info[0], ex_info[4]), 'description':'Filtered ExpressionMatrix' }], 'text_message':report } # generate a unique name for the Method report reportName = 'FilterExpression_'+str(hex(uuid.getnode())) report_info = ws.save_objects({ 'id':ex_info[6], 'objects':[ { 'type':'KBaseReport.Report', 'data':reportObj, 'name':reportName, 'meta':{}, 'hidden':1, 'provenance':provenance } ] })[0] result = { "report_name" : reportName,"report_ref" : "{0}/{1}/{2}".format(report_info[6],report_info[0],report_info[4]) } #result = {'workspace_name' : workspace_name, 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']} #END filter_genes # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method filter_genes return value ' + 'result is not type dict as required.') # return the results return [result]
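# The report names above are derived from the host MAC address via
# uuid.getnode(), so they are stable per machine rather than globally unique;
# a sketch of the scheme (concurrent runs on one host could collide):
import uuid

report_name = "FilterExpression_" + str(hex(uuid.getnode()))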
def const_coex_net_clust(self, ctx, args): # ctx is the context object # return variables are: result #BEGIN const_coex_net_clust try: os.makedirs(self.RAWEXPR_DIR) except: pass try: os.makedirs(self.CLSTR_DIR) except: pass try: os.makedirs(self.FINAL_DIR) except: pass if self.logger is None: self.logger = script_utils.stderrlogger(__file__) result = {} self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV") token = ctx['token'] param = args auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL) user_id = auth_client.get_user(token) workspace_name_t = Template(param['workspace_name']) workspace_name = workspace_name_t.substitute(user_id=user_id) provenance = [{}] if 'provenance' in ctx: provenance = ctx['provenance'] provenance[0]['input_ws_objects']=[workspace_name+'/'+param['object_name']] from biokbase.workspace.client import Workspace ws = Workspace(url=self.__WS_URL, token=token) expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data'] eenv = os.environ.copy() eenv['KB_AUTH_TOKEN'] = token self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN) self.logger.info("Identifying differentially expressed genes") ## Prepare sample file # detect num of columns ncol = len(expr['data']['col_ids']) # grouping information with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s: s.write("0") for j in range(1,ncol): s.write("\t{0}".format(j)) s.write("\n") ## Run coex_cluster cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y', '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN) ] for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight']: if p in param: cmd_coex_cluster.append("--{0}".format(p)) cmd_coex_cluster.append(str(param[p])) #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination #if 'p_value' in param and 'num_features' in param: # self.logger.error("Both of p_value and num_features cannot be defined together"); # sys.exit(3) tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE) stdout, stderr = tool_process.communicate() if stdout is not None and len(stdout) > 0: self.logger.info(stdout) if stderr is not None and len(stderr) > 0: if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr): self.logger.info(stderr) else: self.logger.error(stderr) raise Exception(stderr) # build index for gene list pos_index ={expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids']))} # parse clustering results cid2genelist = {} cid2stat = {} with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN),'r') as glh: glh.readline() # skip header for line in glh: cluster, mcor, msec = line.rstrip().replace('"','').split("\t") cid2stat[cluster]= [mcor, msec] with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),'r') as glh: glh.readline() # skip header for line in glh: gene, cluster = line.rstrip().replace('"','').split("\t") if cluster not in cid2genelist: cid2genelist[cluster] = [] cid2genelist[cluster].append(gene) if(len(cid2genelist) < 1) : self.logger.error("Clustering failed") return error_report("Error: No cluster output", expr,self.__WS_URL, workspace_name, provenance, ws) #sys.exit(4) self.logger.info("Uploading the results onto WS") feature_clusters = [] for cluster in cid2genelist: feature_clusters.append( {"meancor": float(cid2stat[cluster][0]), "msec": 
float(cid2stat[cluster][1]), "id_to_pos" : { gene : pos_index[gene] for gene in cid2genelist[cluster]}}) ## Upload Clusters feature_clusters ={"original_data": "{0}/{1}".format(workspace_name,param['object_name']), "feature_clusters": feature_clusters} cl_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseFeatureValues.FeatureClusters', 'data' : feature_clusters, 'name' : (param['out_object_name'])}]})[0] ## Create report object: report = "Clustering expression matrix using WGCNA on {0}".format(param['object_name']) reportObj = { 'objects_created':[ { 'ref':"{0}/{1}/{2}".format(cl_info[6], cl_info[0], cl_info[4]), 'description':'WGCNA FeatureClusters' }], 'text_message':report } # generate a unique name for the Method report reportName = 'WGCNA_Clusters_'+str(hex(uuid.getnode())) report_info = ws.save_objects({ 'id':cl_info[6], 'objects':[ { 'type':'KBaseReport.Report', 'data':reportObj, 'name':reportName, 'meta':{}, 'hidden':1, 'provenance':provenance } ] })[0] result = { "report_name" : reportName,"report_ref" : "{0}/{1}/{2}".format(report_info[6],report_info[0],report_info[4]) } #result = {'workspace_name' : workspace_name, 'out_object_name' : param['out_object_name']} #result = {'workspace' : workspace_name, 'output' : param['out_object_name']} #END const_coex_net_clust # At some point might do deeper type checking... if not isinstance(result, dict): raise ValueError('Method const_coex_net_clust return value ' + 'result is not type dict as required.') # return the results return [result]
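# Hedged sketch of the cluster-file parsing above: a two-column TSV of
# gene/cluster pairs grouped into a dict of gene lists, written here with
# defaultdict as an alternative to the manual membership check. The path is
# a placeholder for the coex_cluster2 output.
from collections import defaultdict

cid2genelist = defaultdict(list)
with open("clusters.tsv") as glh:
    glh.readline()  # skip header
    for line in glh:
        gene, cluster = line.rstrip().replace('"', '').split("\t")
        cid2genelist[cluster].append(gene)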
def CreateRNASeqSampleSet(self, ctx, params):
    """
    :param params: instance of type "CreateRNASeqSampleSetParams" (FUNCTIONS
       used in the service) -> structure: parameter "ws_id" of String,
       parameter "sampleset_id" of String, parameter "sampleset_desc" of
       String, parameter "domain" of String, parameter "platform" of String,
       parameter "sample_ids" of list of String, parameter "condition" of
       list of String, parameter "source" of String, parameter
       "Library_type" of String, parameter "publication_id" of String,
       parameter "external_source_date" of String
    :returns: instance of type "RNASeqSampleSet" (Object to Describe the
       RNASeq SampleSet @optional platform num_replicates source
       publication_Id external_source_date sample_ids @metadata ws
       sampleset_id @metadata ws platform @metadata ws num_samples
       @metadata ws num_replicates @metadata ws length(condition)) ->
       structure: parameter "sampleset_id" of String, parameter
       "sampleset_desc" of String, parameter "domain" of String, parameter
       "platform" of String, parameter "num_samples" of Long, parameter
       "num_replicates" of Long, parameter "sample_ids" of list of String,
       parameter "condition" of list of String, parameter "source" of
       String, parameter "Library_type" of String, parameter
       "publication_Id" of String, parameter "external_source_date" of
       String
    """
    # ctx is the context object
    # return variables are: returnVal
    #BEGIN CreateRNASeqSampleSet
    user_token = ctx['token']
    ws_client = Workspace(url=self.__WS_URL, token=user_token)
    hs = HandleService(url=self.__HS_URL, token=user_token)
    try:
        ### Create the working dir for the method; change it to a function call
        # Note: the exclusion must be a tuple -- the original ('ws_id') was a
        # bare string, which turned the membership test into a substring check
        out_obj = {k: v for k, v in params.iteritems() if k not in ('ws_id',)}
        sample_ids = params["sample_ids"]
        out_obj['num_samples'] = len(sample_ids)

        ## Validation: the set must contain at least two samples
        if len(sample_ids) < 2:
            raise ValueError("This method requires two or more RNA-seq samples. "
                             "If you have only one read sample, run one of the "
                             "'Align Reads using Tophat/Bowtie2' methods directly "
                             "to get an alignment.")

        ## Validation: the number of conditions must equal the number of samples
        if len(params["condition"]) != out_obj['num_samples']:
            raise ValueError("Please specify a treatment label for each sample in "
                             "the RNA-seq SampleSet. Enter the same label for the "
                             "replicates of a sample type.")

        ## Validation: each reads object must match the declared library type
        if params["Library_type"] == 'PairedEnd':
            lib_type = 'KBaseAssembly.PairedEndLibrary'
        else:
            lib_type = 'KBaseAssembly.SingleEndLibrary'
        for i in sample_ids:
            s_info = ws_client.get_object_info_new(
                {"objects": [{'name': i, 'workspace': params['ws_id']}]})
            obj_type = s_info[0][2].split('-')[0]
            if obj_type != lib_type:
                raise ValueError("Library_type mentioned: {0}, but object '{1}' has "
                                 "type {2}. Please add only {0} typed objects in the "
                                 "Reads fields.".format(lib_type, i, obj_type))

        ## Update the provenance; make it a function later
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # Add additional info to the provenance here, in this case the input
        # data object references
        provenance[0]['input_ws_objects'] = [params['ws_id'] + '/' + sample
                                             for sample in sample_ids]

        # Saving RNASeqSampleSet to Workspace
        self.__LOGGER.info("Saving {0} object to workspace".format(params['sampleset_id']))
        res = ws_client.save_objects(
            {"workspace": params['ws_id'],
             "objects": [{"type": "KBaseRNASeq.RNASeqSampleSet",
                          "data": out_obj,
                          "name": out_obj['sampleset_id'],
                          "provenance": provenance}]})
        returnVal = out_obj
    except Exception, e:
        raise KBaseRNASeqException("Error Saving the object to workspace {0},{1}".format(
            out_obj['sampleset_id'], "".join(traceback.format_exc())))
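# A sketch of the params structure CreateRNASeqSampleSet expects, derived
# from the docstring above. The workspace, sample, and source names are
# invented for illustration:

example_params = {
    'ws_id': 'my_workspace',                   # target workspace
    'sampleset_id': 'heat_stress_sampleset',   # name for the saved object
    'sampleset_desc': 'Heat-stress time course',
    'domain': 'Bacteria',
    'platform': 'Illumina',
    'sample_ids': ['reads_0h', 'reads_6h'],    # existing reads objects in ws_id
    'condition': ['control', 'heat_6h'],       # one label per sample
    'source': 'NCBI SRA',
    'Library_type': 'PairedEnd',               # or 'SingleEnd'
    'publication_id': '',
    'external_source_date': '2015-01-01',
}
# Two samples with two condition labels passes both count checks; each entry
# in sample_ids must already be a KBaseAssembly.PairedEndLibrary object.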
def const_coex_net_clust(self, ctx, args):
    # ctx is the context object
    # return variables are: result
    #BEGIN const_coex_net_clust
    try:
        os.makedirs(self.RAWEXPR_DIR)
    except OSError:
        pass
    try:
        os.makedirs(self.CLSTR_DIR)
    except OSError:
        pass
    try:
        os.makedirs(self.FINAL_DIR)
    except OSError:
        pass

    if self.logger is None:
        self.logger = script_utils.stderrlogger(__file__)

    result = {}
    self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = ctx['token']

    param = args

    from biokbase.workspace.client import Workspace
    ws = Workspace(url=self.__WS_URL, token=token)
    expr = ws.get_objects([{'workspace': param['workspace_name'],
                            'name': param['object_name']}])[0]['data']

    eenv = os.environ.copy()
    eenv['KB_AUTH_TOKEN'] = token
    cmd_download_cvt_tsv = [self.FVE_2_TSV,
                            '--workspace_service_url', self.__WS_URL,
                            '--workspace_name', param['workspace_name'],
                            '--object_name', param['object_name'],
                            '--working_directory', self.RAWEXPR_DIR,
                            '--output_file_name', self.EXPRESS_FN]

    # shell=True is needed here because the Java code depends on finding the
    # KBase token in the environment -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_download_cvt_tsv),
                                    stderr=subprocess.PIPE, shell=True, env=eenv)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        self.logger.info(stderr)

    self.logger.info("Coexpression clustering analysis")

    ## Prepare the sample file: detect the number of columns in the matrix
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))
    with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_cluster
    cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                        '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN),
                        '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN)]
    for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower',
              'clust_method', 'minModuleSize', 'detectCutHeight']:
        if p in param:
            cmd_coex_cluster.append("--{0}".format(p))
            cmd_coex_cluster.append(str(param[p]))

    # TODO: there is no error handling in the narrative, so terminate
    # gracefully instead of exiting when both p_value and num_features are set
    tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        self.logger.info(stdout)
    if stderr is not None and len(stderr) > 0:
        if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
            self.logger.info(stderr)
        else:
            self.logger.error(stderr)
            raise Exception(stderr)

    # Build an index from gene id to its row position in the expression matrix
    pos_index = {expr['data']['row_ids'][i]: i
                 for i in range(0, len(expr['data']['row_ids']))}

    # Parse the clustering results into cluster id -> gene list
    # (rstrip keeps the trailing newline out of the cluster ids)
    cid2genelist = {}
    with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), 'r') as glh:
        glh.readline()  # skip header
        for line in glh:
            gene, cluster = line.rstrip("\n").replace('"', '').split("\t")
            if cluster not in cid2genelist:
                cid2genelist[cluster] = []
            cid2genelist[cluster].append(gene)

    if len(cid2genelist) < 1:
        self.logger.error("Clustering failed")
        return empty_results("Error: No cluster output", expr,
                             self.__WS_URL, param, self.logger, ws)

    self.logger.info("Uploading the results onto WS")
    feature_clusters = []
    for cluster in cid2genelist:
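        # Each cluster becomes one element of feature_clusters: an "id_to_pos"
        # mapping from gene id back to its row position in the original
        # expression matrix, which is how the FeatureClusters object ties
        # cluster members to the matrix rows.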
        feature_clusters.append(
            {"id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]}})

    ## Upload Clusters
    feature_clusters = {
        "original_data": "{0}/{1}".format(param['workspace_name'], param['object_name']),
        "feature_clusters": feature_clusters
    }
    ws.save_objects({'workspace': param['workspace_name'],
                     'objects': [{'type': 'KBaseFeatureValues.FeatureClusters',
                                  'data': feature_clusters,
                                  'name': param['out_object_name']}]})

    result = {'workspace_name': param['workspace_name'],
              'out_object_name': param['out_object_name']}
    #END const_coex_net_clust

    # At some point might do deeper type checking...
    if not isinstance(result, dict):
        raise ValueError('Method const_coex_net_clust return value ' +
                         'result is not type dict as required.')
    # return the results
    return [result]
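# A toy illustration of the data shapes the clustering code above produces.
# Gene ids and cluster labels are invented; only the structure mirrors the
# KBaseFeatureValues.FeatureClusters payload built in const_coex_net_clust.

pos_index = {'g1': 0, 'g2': 1, 'g3': 2}          # gene id -> matrix row
cid2genelist = {'1': ['g1', 'g3'], '2': ['g2']}  # cluster id -> gene list

feature_clusters = {
    'original_data': 'my_workspace/my_expression_matrix',  # ws/object names
    'feature_clusters': [
        {'id_to_pos': {g: pos_index[g] for g in genes}}
        for genes in cid2genelist.values()
    ],
}
# feature_clusters['feature_clusters'] ==
#     [{'id_to_pos': {'g1': 0, 'g3': 2}}, {'id_to_pos': {'g2': 1}}]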