class NarrativeManager: KB_CELL = 'kb-cell' KB_TYPE = 'type' KB_APP_CELL = 'kb_app' KB_FUNCTION_CELL = 'function_input' KB_OUTPUT_CELL = 'function_output' KB_ERROR_CELL = 'kb_error' KB_CODE_CELL = 'kb_code' KB_STATE = 'widget_state' DEBUG = False DATA_PALETTES_TYPES = DataPaletteTypes(False) def __init__(self, config, ctx, set_api_cache, dps_cache): self.narrativeMethodStoreURL = config['narrative-method-store'] self.set_api_cache = set_api_cache # DynamicServiceCache type self.dps_cache = dps_cache # DynamicServiceCache type self.token = ctx["token"] self.user_id = ctx["user_id"] self.ws = Workspace(config['workspace-url'], token=self.token) self.intro_md_file = config['intro-markdown-file'] # We switch DPs on only for internal Continuous Integration environment for now: if config['kbase-endpoint'].startswith("https://ci.kbase.us/"): self.DATA_PALETTES_TYPES = DataPaletteTypes(True) def list_objects_with_sets(self, ws_id=None, ws_name=None, workspaces=None, types=None, include_metadata=0): if not workspaces: if (not ws_id) and (not ws_name): raise ValueError( "One and only one of 'ws_id', 'ws_name', 'workspaces' " + "parameters should be set") workspaces = [self._get_workspace_name_or_id(ws_id, ws_name)] return self._list_objects_with_sets(workspaces, types, include_metadata) def _list_objects_with_sets(self, workspaces, types, include_metadata): type_map = None if types is not None: type_map = {key: True for key in types} processed_refs = {} data = [] if self.DEBUG: print("NarrativeManager._list_objects_with_sets: processing sets") t1 = time.time() set_ret = self.set_api_cache.call_method( "list_sets", [{ 'workspaces': workspaces, 'include_set_item_info': 1, 'include_raw_data_palettes': 1, 'include_metadata': include_metadata }], self.token) sets = set_ret['sets'] dp_data = set_ret.get('raw_data_palettes') dp_refs = set_ret.get('raw_data_palette_refs') for set_info in sets: # Process target_set_items = [] for set_item in set_info['items']: target_set_items.append(set_item['info']) if self._check_info_type(set_info['info'], type_map): data_item = { 'object_info': set_info['info'], 'set_items': { 'set_items_info': target_set_items } } data.append(data_item) processed_refs[set_info['ref']] = data_item if self.DEBUG: print(" (time=" + str(time.time() - t1) + ")") if self.DEBUG: print("NarrativeManager._list_objects_with_sets: loading ws_info") t2 = time.time() ws_info_list = [] #for ws in workspaces: if len(workspaces) == 1: ws = workspaces[0] ws_id = None ws_name = None if str(ws).isdigit(): ws_id = int(ws) else: ws_name = str(ws) ws_info_list.append( self.ws.get_workspace_info({ "id": ws_id, "workspace": ws_name })) else: ws_map = {key: True for key in workspaces} for ws_info in self.ws.list_workspace_info({'perm': 'r'}): if ws_info[1] in ws_map or str(ws_info[0]) in ws_map: ws_info_list.append(ws_info) if self.DEBUG: print(" (time=" + str(time.time() - t2) + ")") if self.DEBUG: print( "NarrativeManager._list_objects_with_sets: loading workspace objects" ) t3 = time.time() for info in WorkspaceListObjectsIterator( self.ws, ws_info_list=ws_info_list, list_objects_params={'includeMetadata': include_metadata}): item_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]) if item_ref not in processed_refs and self._check_info_type( info, type_map): data_item = {'object_info': info} data.append(data_item) processed_refs[item_ref] = data_item if self.DEBUG: print(" (time=" + str(time.time() - t3) + ")") if self.DEBUG: print( "NarrativeManager._list_objects_with_sets: processing 
DataPalettes" ) t5 = time.time() if dp_data is None or dp_refs is None: dps = self.dps_cache dp_ret = dps.call_method("list_data", [{ 'workspaces': workspaces, 'include_metadata': include_metadata }], self.token) dp_data = dp_ret['data'] dp_refs = dp_ret['data_palette_refs'] for item in dp_data: ref = item['ref'] if self._check_info_type(item['info'], type_map): data_item = None if ref in processed_refs: data_item = processed_refs[ref] else: data_item = {'object_info': item['info']} processed_refs[ref] = data_item data.append(data_item) dp_info = {} if 'dp_ref' in item: dp_info['ref'] = item['dp_ref'] if 'dp_refs' in item: dp_info['refs'] = item['dp_refs'] data_item['dp_info'] = dp_info if self.DEBUG: print(" (time=" + str(time.time() - t5) + ")") return {"data": data, 'data_palette_refs': dp_refs} def _check_info_type(self, info, type_map): if type_map is None: return True obj_type = info[2].split('-')[0] return type_map.get(obj_type, False) def copy_narrative(self, newName, workspaceRef, workspaceId): time_ms = int(round(time.time() * 1000)) newWsName = self.user_id + ':narrative_' + str(time_ms) # add the 'narrative' field to newWsMeta later. newWsMeta = {"is_temporary": "false", "narrative_nice_name": newName} # start with getting the existing narrative object. currentNarrative = self.ws.get_objects([{'ref': workspaceRef}])[0] if not workspaceId: workspaceId = currentNarrative['info'][6] # Let's prepare exceptions for clone the workspace. # 1) currentNarrative object: excluded_list = [{'objid': currentNarrative['info'][0]}] # 2) let's exclude objects of types under DataPalette handling: data_palette_type = "DataPalette.DataPalette" excluded_types = [data_palette_type] excluded_types.extend(self.DATA_PALETTES_TYPES.keys()) add_to_palette_list = [] dp_detected = False for obj_type in excluded_types: list_objects_params = {'type': obj_type} if obj_type == data_palette_type: list_objects_params['showHidden'] = 1 for info in WorkspaceListObjectsIterator( self.ws, ws_id=workspaceId, list_objects_params=list_objects_params): if obj_type == data_palette_type: dp_detected = True else: add_to_palette_list.append({ 'ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]) }) excluded_list.append({'objid': info[0]}) # clone the workspace EXCEPT for currentNarrative object + obejcts of DataPalette types: newWsId = self.ws.clone_workspace({ 'wsi': { 'id': workspaceId }, 'workspace': newWsName, 'meta': newWsMeta, 'exclude': excluded_list })[0] try: if dp_detected: self.dps_cache.call_method( "copy_palette", [{ 'from_workspace': str(workspaceId), 'to_workspace': str(newWsId) }], self.token) if len(add_to_palette_list) > 0: # There are objects in source workspace that have type under DataPalette handling # but these objects are physically stored in source workspace rather that saved # in DataPalette object. So they weren't copied by "dps.copy_palette". self.dps_cache.call_method("add_to_palette", [{ 'workspace': str(newWsId), 'new_refs': add_to_palette_list }], self.token) # update the ref inside the narrative object and the new workspace metadata. 
newNarMetadata = currentNarrative['info'][10] newNarMetadata['name'] = newName newNarMetadata['ws_name'] = newWsName newNarMetadata['job_info'] = json.dumps({ 'queue_time': 0, 'running': 0, 'completed': 0, 'run_time': 0, 'error': 0 }) currentNarrative['data']['metadata']['name'] = newName currentNarrative['data']['metadata']['ws_name'] = newWsName currentNarrative['data']['metadata']['job_ids'] = { 'apps': [], 'methods': [], 'job_usage': { 'queue_time': 0, 'run_time': 0 } } # save the shiny new Narrative so it's at version 1 newNarInfo = self.ws.save_objects({ 'id': newWsId, 'objects': [{ 'type': currentNarrative['info'][2], 'data': currentNarrative['data'], 'provenance': currentNarrative['provenance'], 'name': currentNarrative['info'][1], 'meta': newNarMetadata }] }) # now, just update the workspace metadata to point # to the new narrative object newNarId = newNarInfo[0][0] self.ws.alter_workspace_metadata({ 'wsi': { 'id': newWsId }, 'new': { 'narrative': str(newNarId) } }) return {'newWsId': newWsId, 'newNarId': newNarId} except: # let's delete copy of workspace so it's out of the way - it's broken self.ws.delete_workspace({'id': newWsId}) raise # continue raising previous exception def create_new_narrative(self, app, method, appparam, appData, markdown, copydata, importData, includeIntroCell): if app and method: raise ValueError( "Must provide no more than one of the app or method params") if (not importData) and copydata: importData = copydata.split(';') if (not appData) and appparam: appData = [] for tmp_item in appparam.split(';'): tmp_tuple = tmp_item.split(',') step_pos = None if tmp_tuple[0]: try: step_pos = int(tmp_tuple[0]) except ValueError: pass appData.append([step_pos, tmp_tuple[1], tmp_tuple[2]]) cells = None if app: cells = [{"app": app}] elif method: cells = [{"method": method}] elif markdown: cells = [{"markdown": markdown}] return self._create_temp_narrative(cells, appData, importData, includeIntroCell) def _get_intro_markdown(self): """ Creates and returns a cell with the introductory text included. """ # Load introductory markdown text with open(self.intro_md_file) as intro_file: intro_md = intro_file.read() return intro_md def _create_temp_narrative(self, cells, parameters, importData, includeIntroCell): # Migration to python of JavaScript class from https://github.com/kbase/kbase-ui/blob/4d31151d13de0278765a69b2b09f3bcf0e832409/src/client/modules/plugins/narrativemanager/modules/narrativeManager.js#L414 narr_id = int(round(time.time() * 1000)) workspaceName = self.user_id + ':narrative_' + str(narr_id) narrativeName = "Narrative." + str(narr_id) ws = self.ws ws_info = ws.create_workspace({ 'workspace': workspaceName, 'description': '' }) newWorkspaceInfo = ServiceUtils.workspaceInfoToObject(ws_info) [narrativeObject, metadataExternal ] = self._fetchNarrativeObjects(workspaceName, cells, parameters, includeIntroCell) objectInfo = ws.save_objects({ 'workspace': workspaceName, 'objects': [{ 'type': 'KBaseNarrative.Narrative', 'data': narrativeObject, 'name': narrativeName, 'meta': metadataExternal, 'provenance': [{ 'script': 'NarrativeManager.py', 'description': 'Created new ' + 'Workspace/Narrative bundle.' 
}], 'hidden': 0 }] })[0] objectInfo = ServiceUtils.objectInfoToObject(objectInfo) self._completeNewNarrative(newWorkspaceInfo['id'], objectInfo['id'], importData) return {'workspaceInfo': newWorkspaceInfo, 'narrativeInfo': objectInfo} def _fetchNarrativeObjects(self, workspaceName, cells, parameters, includeIntroCell): if not cells: cells = [] # fetchSpecs appSpecIds = [] methodSpecIds = [] specMapping = {'apps': {}, 'methods': {}} for cell in cells: if 'app' in cell: appSpecIds.append(cell['app']) elif 'method' in cell: methodSpecIds.append(cell['method']) nms = NarrativeMethodStore(self.narrativeMethodStoreURL, token=self.token) if len(appSpecIds) > 0: appSpecs = nms.get_app_spec({'ids': appSpecIds}) for spec in appSpecs: spec_id = spec['info']['id'] specMapping['apps'][spec_id] = spec if len(methodSpecIds) > 0: methodSpecs = nms.get_method_spec({'ids': methodSpecIds}) for spec in methodSpecs: spec_id = spec['info']['id'] specMapping['methods'][spec_id] = spec # end of fetchSpecs metadata = { 'job_ids': { 'methods': [], 'apps': [], 'job_usage': { 'queue_time': 0, 'run_time': 0 } }, 'format': 'ipynb', 'creator': self.user_id, 'ws_name': workspaceName, 'name': 'Untitled', 'type': 'KBaseNarrative.Narrative', 'description': '', 'data_dependencies': [] } cellData = self._gatherCellData(cells, specMapping, parameters, includeIntroCell) narrativeObject = { 'nbformat_minor': 0, 'cells': cellData, 'metadata': metadata, 'nbformat': 4 } metadataExternal = {} for key in metadata: value = metadata[key] if isinstance(value, basestring): metadataExternal[key] = value else: metadataExternal[key] = json.dumps(value) return [narrativeObject, metadataExternal] def _gatherCellData(self, cells, specMapping, parameters, includeIntroCell): cell_data = [] if includeIntroCell == 1: cell_data.append({ 'cell_type': 'markdown', 'source': self._get_intro_markdown(), 'metadata': {} }) for cell_pos, cell in enumerate(cells): if 'app' in cell: cell_data.append( self._buildAppCell(len(cell_data), specMapping['apps'][cell['app']], parameters)) elif 'method' in cell: cell_data.append( self._buildMethodCell( len(cell_data), specMapping['methods'][cell['method']], parameters)) elif 'markdown' in cell: cell_data.append({ 'cell_type': 'markdown', 'source': cell['markdown'], 'metadata': {} }) else: raise ValueError("cannot add cell #" + str(cell_pos) + ", unrecognized cell content") return cell_data def _buildAppCell(self, pos, spec, params): cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4()) cell = { 'cell_type': 'markdown', 'source': "<div id='" + cellId + "'></div>" + "\n<script>" + "$('#" + cellId + "').kbaseNarrativeAppCell({'appSpec' : '" + self._safeJSONStringify(spec) + "', 'cellId' : '" + cellId + "'});" + "</script>", 'metadata': {} } cellInfo = {} widgetState = [] cellInfo[self.KB_TYPE] = self.KB_APP_CELL cellInfo['app'] = spec if params: steps = {} for param in params: stepid = 'step_' + str(param[0]) if stepid not in steps: steps[stepid] = {} steps[stepid]['inputState'] = {} steps[stepid]['inputState'][param[1]] = param[2] state = { 'state': { 'step': steps } } widgetState.append(state) cellInfo[self.KB_STATE] = widgetState cell['metadata'][self.KB_CELL] = cellInfo return cell def _buildMethodCell(self, pos, spec, params): cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4()) cell = { 'cell_type': 'markdown', 'source': "<div id='" + cellId + "'></div>" + "\n<script>" + "$('#" + cellId + "').kbaseNarrativeMethodCell({'method' : '" + self._safeJSONStringify(spec) + "'});" + "</script>", 'metadata': {} } 
        cellInfo = {'method': spec,
                    'widget': spec['widgets']['input']}
        cellInfo[self.KB_TYPE] = self.KB_FUNCTION_CELL
        widgetState = []
        if params:
            wparams = {}
            for param in params:
                wparams[param[1]] = param[2]
            widgetState.append({'state': wparams})
        cellInfo[self.KB_STATE] = widgetState
        cell['metadata'][self.KB_CELL] = cellInfo
        return cell

    def _completeNewNarrative(self, workspaceId, objectId, importData):
        self.ws.alter_workspace_metadata({
            'wsi': {'id': workspaceId},
            'new': {'narrative': str(objectId), 'is_temporary': 'true'}
        })
        # copy_to_narrative:
        if not importData:
            return
        objectsToCopy = [{'ref': x} for x in importData]
        infoList = self.ws.get_object_info_new({
            'objects': objectsToCopy,
            'includeMetadata': 0
        })
        for item in infoList:
            objectInfo = ServiceUtils.objectInfoToObject(item)
            self.copy_object(objectInfo['ref'], workspaceId, None, None,
                             objectInfo)

    def _safeJSONStringify(self, obj):
        return json.dumps(self._safeJSONStringifyPrepare(obj))

    def _safeJSONStringifyPrepare(self, obj):
        # Escape quotes as HTML entities so the JSON string can be embedded
        # in a generated cell's markup without breaking the surrounding quotes.
        if isinstance(obj, basestring):
            return obj.replace("'", "&apos;").replace('"', "&quot;")
        elif isinstance(obj, list):
            for pos in range(len(obj)):
                obj[pos] = self._safeJSONStringifyPrepare(obj[pos])
        elif isinstance(obj, dict):
            obj_keys = list(obj.keys())
            for key in obj_keys:
                obj[key] = self._safeJSONStringifyPrepare(obj[key])
        else:
            pass  # it's boolean/int/float/None
        return obj

    def _get_workspace_name_or_id(self, ws_id, ws_name):
        ret = ws_name
        if not ret:
            ret = str(ws_id)
        return ret

    def copy_object(self, ref, target_ws_id, target_ws_name, target_name,
                    src_info):
        # There should be some logic related to DataPalettes
        if (not target_ws_id) and (not target_ws_name):
            raise ValueError("Neither target workspace ID nor name is defined")
        if not src_info:
            src_info_tuple = self.ws.get_object_info_new({
                'objects': [{'ref': ref}],
                'includeMetadata': 0
            })[0]
            src_info = ServiceUtils.objectInfoToObject(src_info_tuple)
        type_name = src_info['typeModule'] + '.' + src_info['typeName']
        type_config = self.DATA_PALETTES_TYPES.get(type_name)
        if type_config is not None:
            # Copy with DataPaletteService
            if target_name:
                raise ValueError(
                    "'target_name' cannot be defined for DataPalette copy")
            target_ws_name_or_id = self._get_workspace_name_or_id(
                target_ws_id, target_ws_name)
            self.dps_cache.call_method("add_to_palette", [{
                'workspace': target_ws_name_or_id,
                'new_refs': [{'ref': ref}]
            }], self.token)
            return {'info': src_info}
        else:
            if not target_name:
                target_name = src_info['name']
            obj_info_tuple = self.ws.copy_object({
                'from': {'ref': ref},
                'to': {
                    'wsid': target_ws_id,
                    'workspace': target_ws_name,
                    'name': target_name
                }
            })
            obj_info = ServiceUtils.objectInfoToObject(obj_info_tuple)
            return {'info': obj_info}

    def list_available_types(self, workspaces):
        data = self.list_objects_with_sets(workspaces=workspaces)['data']
        type_stat = {}
        for item in data:
            info = item['object_info']
            obj_type = info[2].split('-')[0]
            if obj_type in type_stat:
                type_stat[obj_type] += 1
            else:
                type_stat[obj_type] = 1
        return {'type_stat': type_stat}
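# --- Usage sketch (illustrative only) --------------------------------------
# A minimal example of how NarrativeManager might be driven from a calling
# SDK module. The config keys and the set_api_cache / dps_cache
# DynamicServiceCache objects are assumed to be supplied by that module; the
# workspace name and object reference used below are hypothetical.
#
#   nm = NarrativeManager(config, ctx, set_api_cache, dps_cache)
#
#   # List objects (including set items) visible in one workspace:
#   listing = nm.list_objects_with_sets(
#       ws_name='someuser:narrative_1500000000000',
#       types=['KBaseGenomes.Genome'])
#
#   # Copy an existing narrative into a fresh workspace:
#   copied = nm.copy_narrative(newName='My copy', workspaceRef='123/1/5',
#                              workspaceId=123)
#   # copied == {'newWsId': ..., 'newNarId': ...}
# ----------------------------------------------------------------------------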
class BallgownUtil: def __init__(self, config): self.ws_url = config["workspace-url"] self.callback_url = config['SDK_CALLBACK_URL'] self.token = config['KB_AUTH_TOKEN'] self.shock_url = config['shock-url'] self.dfu = DataFileUtil(self.callback_url) self.rau = ReadsAlignmentUtils(self.callback_url) self.fv = KBaseFeatureValues(self.callback_url) self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev') self.ws = Workspace(self.ws_url, token=self.token) self.scratch = config['scratch'] self.config = config def _xor(self, a, b): return bool(a) != bool(b) def _validate_run_ballgown_app_params(self, params): """ _validate_run_ballgown_app_params: validates params passed to run_ballgown_app method """ log('start validating run_ballgown_app params') # check for required parameters for p in ['expressionset_ref', 'diff_expression_matrix_set_suffix', 'workspace_name']: if p not in params: raise ValueError('"{}" parameter is required, but missing'.format(p)) run_all_combinations = params.get('run_all_combinations') condition_pair_subset = params.get('condition_pair_subset') if not self._xor(run_all_combinations, condition_pair_subset): error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' " error_msg += "or provide subset of condition pairs. Don't provide both, or neither." raise ValueError(error_msg) def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _generate_html_report(self, result_directory, params, diff_expression_matrix_set_ref): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') for file in glob.glob(os.path.join(result_directory, '*.tsv')): shutil.copy(file, output_directory) # volcano_plot exists only if there are two condition groups for file in glob.glob(os.path.join(result_directory, '*.png')): shutil.copy(file, output_directory) diff_expr_set = self.ws.get_objects2({'objects': [{'ref': diff_expression_matrix_set_ref[ 'diffExprMatrixSet_ref']}]})['data'][0] diff_expr_set_data = diff_expr_set['data'] diff_expr_set_info = diff_expr_set['info'] diff_expr_set_name = diff_expr_set_info[1] overview_content = '' overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet' overview_content += ' Object</th></tr>' overview_content += '<tr><td>{} ({})'.format(diff_expr_set_name, diff_expression_matrix_set_ref[ 'diffExprMatrixSet_ref']) overview_content += '</td></tr></table>' overview_content += '<p><br/></p>' overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix' overview_content += ' Object</th><th></th><th></th><th></th></tr>' overview_content += '<tr><th>Differential Expression Matrix Name</th>' overview_content += '<th>Condition 1</th>' overview_content += '<th>Condition 2</th>' overview_content += '</tr>' for item in diff_expr_set_data['items']: item_diffexprmatrix_object = self.ws.get_objects2({'objects': [{'ref': item['ref']}]})[ 'data'][0] item_diffexprmatrix_info = item_diffexprmatrix_object['info'] item_diffexprmatrix_data = item_diffexprmatrix_object['data'] diffexprmatrix_name = item_diffexprmatrix_info[1] overview_content += '<tr><td>{} ({})</td>'.format(diffexprmatrix_name, item['ref']) overview_content += 
'<td>{}</td>'.format(item_diffexprmatrix_data. get('condition_mapping').keys()[0]) overview_content += '<td>{}</td>'.format(item_diffexprmatrix_data. get('condition_mapping').values()[0]) overview_content += '</tr>' overview_content += '</table>' # visualization image_content = '' for image in glob.glob(output_directory + "/*.png"): image = image.replace(output_directory + '/', '') caption = image.replace(output_directory + '/', '').replace('.png', '') image_content += '<p style="text-align:center"><img align="center" src="{}" ' \ 'width="600" height="400"></a><a target="_blank"><br>' \ '<p align="center">{}</p></p>'.format( image, caption) with open(result_file_path, 'w') as result_file: with open(os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace('<p>Overview_Content</p>', overview_content) report_template = report_template.replace('<p>Image Gallery</p>', image_content) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({'file_path': output_directory, 'pack': 'zip'})['shock_id'] html_report.append({'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Ballgown App'}) return html_report def _generate_output_file_list(self, result_directory): """ _generate_output_file_list: zip result files and generate file_links for report """ log('Start packing result files') output_files = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file = os.path.join(output_directory, 'ballgown_result.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for root, dirs, files in os.walk(result_directory): for file in files: if not (file.endswith('.zip') or file.endswith('.png') or file.endswith('.DS_Store')): zip_file.write(os.path.join(root, file), file) output_files.append({'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'File(s) generated by Ballgown App'}) return output_files def _generate_report(self, params, result_directory, diff_expression_matrix_set_ref): """ _generate_report: generate summary report """ log('creating report') output_files = self._generate_output_file_list(result_directory) output_html_files = self._generate_html_report( result_directory, params, diff_expression_matrix_set_ref) report_params = { 'message': '', 'workspace_name': params.get('workspace_name'), 'file_links': output_files, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': 'kb_ballgown_report_' + str(uuid.uuid4())} kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = {'report_name': output['name'], 'report_ref': output['ref']} return report_output def get_sample_dir_group_file(self, mapped_expression_ids, condition_labels): ngroups = 0 group_name_indices = {} group_counts = {} for group in condition_labels: if not group in group_name_indices: group_name_indices[group] = ngroups ngroups = ngroups + 1 if not group in group_counts: group_counts[group] = 1 else: group_counts[group] = group_counts[group] + 1 # checks for proper ballgown execution: if ngroups < 2: raise Exception("At least two condition groups are needed for this analysis. 
") for group in condition_labels: if group_counts[group] < 2: raise Exception( "Condition group {0} has less than 2 members; ballgown will not run. " "At least two condition groups are needed for this analysis. ".format(group)) group_file_dir = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(group_file_dir) try: condition_labels_uniqued = list(set(condition_labels)) sgf_name = os.path.join(group_file_dir, 'sample_dir_group_file_' + condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1]) sgf = open(sgf_name, "w") except Exception: raise Exception( "Can't open file {0} for writing {1}".format( sgf_name, traceback.format_exc())) index = 0 # condition label index for ii in mapped_expression_ids: for alignment_id, expression_id in ii.items(): expression_object = self.ws.get_objects2( {'objects': [{'ref': expression_id}]})['data'][0] handle_id = expression_object['data']['file']['hid'] expression_name = expression_object['info'][1] expression_dir = os.path.join(group_file_dir, expression_name) self._mkdir_p(expression_dir) print('expression_name: ' + str(expression_dir) + ' ' + str(group_name_indices[condition_labels[index]])) sgf.write("{0} {1}\n".format(expression_dir, group_name_indices[condition_labels[index]])) self.dfu.shock_to_file({'handle_id': handle_id, 'file_path': expression_dir, 'unpack': 'unpack'}) required_files = [ 'e2t.ctab', 'e_data.ctab', 'i2t.ctab', 'i_data.ctab', 't_data.ctab'] for file in glob.glob(expression_dir + '/*'): if not os.path.basename(file) in required_files: os.remove(file) index += 1 return sgf_name def _cleanup(self, directory=None): """ Clean up after the job. At the moment this just means removing the working directory, but later could mean other things. """ try: # it would not delete if fold is not empty shutil.rmtree(directory, ignore_errors=True) # need to iterate each entry except IOError as e: log("Unable to remove working directory {0}".format(directory)) raise def _setupWorkingDir(self, directory=None): """ Clean up an existing workingdir and create a new one """ try: if os.path.exists(directory): self._cleanup(directory) os.mkdir(directory) except IOError: log("Unable to setup working dir {0}".format(directory)) raise def _check_intron_measurements(self, sample_dir_group_table_file): """ Check if intron measurements files are non-empty :param sample_dir_group_table_file: :return: """ log('checking for intron level measurements... ') file = open(sample_dir_group_table_file, 'r') textFileLines = file.readlines() for line in textFileLines: expr_dir = line.split()[0] log(expr_dir) i2t_file = open(os.path.join(expr_dir, 'i2t.ctab'), 'r') if len(i2t_file.readlines()) <= 1: # only header line exists raise Exception("No intron measurements found! Input expressions are possibly " "from a prokaryote. Ballgown functions only on eukaryotic data." " Consider using DeSeq2 or CuffDiff instead of BallGown.") idata_file = open(os.path.join(expr_dir, 'i_data.ctab'), 'r') if len(idata_file.readlines()) <= 1: # only header line exists raise Exception("No intron measurements found! Input expressions are possibly " "from a prokaryote. Ballgown functions only on eukaryotic data." 
" Consider using DeSeq2 or CuffDiff instead of BallGown") def run_ballgown_diff_exp(self, rscripts_dir, sample_dir_group_table_file, ballgown_output_dir, output_csv, volcano_plot_file ): """ Make R call to execute the system :param rscripts_dir: :param sample_dir_group_table_file: :param ballgown_output_dir: sample_group_table is a listing of output Stringtie subdirectories, (full path specification) paired with group label (0 or 1), ie /path/WT_rep1_stringtie 0 /path/WT_rep2_stringtie 0 /path/EXP_rep1_stringtie 1 /path/EXP_rep2_stringtie 1 (order doesn't matter, but the directory-group correspondance does) :param output_csv: :param volcano_plot_file: :return: """ # check if intron-level expression measurements are present self._check_intron_measurements(sample_dir_group_table_file) rcmd_list = ['Rscript', os.path.join(rscripts_dir, 'ballgown_fpkmgenematrix.R'), '--sample_dir_group_table', sample_dir_group_table_file, '--output_dir', ballgown_output_dir, '--output_csvfile', output_csv, '--volcano_plot_file', volcano_plot_file ] rcmd_str = " ".join(str(x) for x in rcmd_list) log("rcmd_string is {0}".format(rcmd_str)) openedprocess = subprocess.Popen(rcmd_str, shell=True) openedprocess.wait() # Make sure the openedprocess.returncode is zero (0) if openedprocess.returncode != 0: log("R script did not return normally, return code - " + str(openedprocess.returncode)) raise Exception("Rscript failure") def load_diff_expr_matrix(self, ballgown_output_dir, output_csv): """ Reads csv diff expr matrix file from Ballgown and returns as a dictionary of rows with the gene as key. Each key gives a row of length three corresponding to fold_change, pval and qval in string form - can include 'NA' :param ballgown_output_dir :param output_csv: :return: """ diff_matrix_file = os.path.join(ballgown_output_dir, output_csv) if not os.path.isfile(diff_matrix_file): raise Exception("differential expression matrix csvfile {0} doesn't exist!".format( diff_matrix_file)) n = 0 dm = {} with open(diff_matrix_file, "r") as csv_file: csv_rows = csv.reader(csv_file, delimiter="\t", quotechar='"') for row in csv_rows: n = n + 1 if (n == 1): if (row != ['id', 'fc', 'pval', 'qval']): raise Exception( "did not get expected column heading from {0}".format( diff_matrix_file)) else: if (len(row) != 4): raise Exception( "did not get 4 elements in row {0} of csv file {1} ".format( n, diff_matrix_file)) key = row[0] # put in checks for NA or numeric for row[1] through 4 if (key in dm): raise Exception( "duplicate key {0} in row {1} of csv file {2} ".format( key, n, diff_matrix_file)) dm[key] = row[1:5] return dm def _transform_expression_set_data(self, expression_set_data): """ The stitch to connect KBaseSets.ExpressionSet-2.0 type data to the older KBaseRNASeq.RNASeqExpressionSet-3.0 that the implementation depends on. 
This is done by doing a dive into the nested alignment object ref and getting the required data :param expression_set_data: :return: transformed expression_set_data """ transform = dict() # get genome id expression_ref = expression_set_data['items'][0]['ref'] wsid, objid, ver = expression_ref.split('/') expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}]) transform['genome_id'] = expression_obj[0]['data']['genome_id'] # get sampleset_id #alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0] #wsid, objid, ver = alignment_ref.split('/') #alignment_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}]) #transform['sampleset_id'] = alignment_obj[0]['data']['sampleset_id'] # build mapped_expression_ids mapped_expression_ids = list() for item in expression_set_data['items']: expression_ref = item['ref'] wsid, objid, ver = expression_ref.split('/') expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}]) alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0] mapped_expression_ids.append({alignment_ref: expression_ref}) transform['mapped_expression_ids'] = mapped_expression_ids return transform def _build_condition_label_list(self, mapped_expression_ids): """ Extracts the condition labels from each expression in the specified expression set data and builds a list of condition labels :param expression_set_data: expression set data :return: list of condition labels whose order resembles the expression order in the expression data """ condition_labels = list() for ii in mapped_expression_ids: for alignment_id, expression_id in ii.items(): expression_object = self.ws.get_objects2( {'objects': [{'ref': expression_id}]})['data'][0] condition_labels.append(expression_object['data']['condition']) return condition_labels def _update_output_file_header(self, output_file): """ Modify header of output file (required by DifferentialExpressionUtils) :param output_file: :return: """ f = open(output_file, 'r') filedata = f.read() f.close() modified_output = filedata.replace( '"id"\t"fc"\t"pval"\t"qval"', 'gene_id\tlog2_fold_change\tp_value\tq_value') f = open(output_file, 'w') f.write(modified_output) f.close() def _check_input_labels(self, condition_pair_subset, available_condition_labels): """ _check_input_labels: check input condition pairs """ checked = True # example struct: [{u'condition': u'hy5'}, {u'condition': u'WT'}] condition_values = set() for condition in condition_pair_subset: condition_values.add(condition['condition']) if len(condition_values) < 2: error_msg = 'At least two unique conditions must be specified. ' raise ValueError(error_msg) for condition in condition_pair_subset: label = condition['condition'].strip() if label not in available_condition_labels: error_msg = 'Condition label "{}" is not a valid condition. 
'.format(label) error_msg += 'Must be one of "{}"'.format(available_condition_labels) raise ValueError(error_msg) return checked def run_ballgown_app(self, params): """ run_ballgown_app: run Ballgown app (https://www.bioconductor.org/packages/release/bioc/html/ballgown.html) required params: expressionset_ref: ExpressionSet object reference diff_expression_matrix_set_suffix: suffix to KBaseSets.DifferetialExpressionMatrixSet name condition_labels: conditions for expression set object alpha_cutoff: q value cutoff fold_change_cutoff: fold change cutoff workspace_name: the name of the workspace it gets saved to optional params: fold_scale_type: one of ["linear", "log2+1", "log10+1"] return: result_directory: folder path that holds all files generated by run_deseq2_app diff_expression_matrix_set_ref: generated KBaseSets.DifferetialExpressionMatrixSet object reference report_name: report name generated by KBaseReport report_ref: report reference generated by KBaseReport """ log('--->\nrunning BallgownUtil.run_ballgown_app\n' + 'params:\n{}'.format(json.dumps(params, indent=1))) self._validate_run_ballgown_app_params(params) expressionset_ref = params.get('expressionset_ref') expression_set_info = self.ws.get_object_info3({ "objects": [{"ref": expressionset_ref}]})['infos'][0] expression_object_type = expression_set_info[2] # set output object name differential_expression_suffix = params['diff_expression_matrix_set_suffix'] expression_name = expression_set_info[1] if re.match('.*_[Ee]xpression$', expression_name): params['diff_expression_matrix_set_name'] = re.sub( '_[Ee]xpression$', differential_expression_suffix, expression_name) if re.match('.*_[Ee]xpression_[Ss]et$', expression_name): params['diff_expression_matrix_set_name'] = re.sub( '_[Ee]xpression_[Ss]et$', differential_expression_suffix, expression_name) else: params['diff_expression_matrix_set_name'] = expression_name + \ differential_expression_suffix log('--->\nexpression object type: \n' + '{}'.format(expression_object_type)) if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type): expression_set_data = self.ws.get_objects2( {'objects': [{'ref': expressionset_ref}]})['data'][0]['data'] elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type): expression_set_data = self.ws.get_objects2( {'objects': [{'ref': expressionset_ref}]})['data'][0]['data'] expression_set_data = self._transform_expression_set_data(expression_set_data) mgroup = MultiGroup(self.ws) pairwise_mapped_expression_ids = mgroup.build_pairwise_groups( expression_set_data['mapped_expression_ids']) ballgown_output_dir = os.path.join(self.scratch, "ballgown_out") log("ballgown output dir is {0}".format(ballgown_output_dir)) self._setupWorkingDir(ballgown_output_dir) # get set of all condition labels available_condition_labels = \ self._build_condition_label_list(expression_set_data['mapped_expression_ids']) if params.get('run_all_combinations'): requested_condition_labels = available_condition_labels else: # get set of user specified condition labels condition_pair_subset = params.get('condition_pair_subset') if self._check_input_labels(condition_pair_subset, available_condition_labels): requested_condition_labels = list() # example: [{u'condition': u'hy5'}, {u'condition': u'WT'}] for condition in condition_pair_subset: if condition.get('condition').strip() not in requested_condition_labels: requested_condition_labels.append(condition.get('condition').strip()) log("User requested pairwise combinations from condition label list : " + 
str(requested_condition_labels)) diff_expr_files = list() for mapped_expression_ids in pairwise_mapped_expression_ids: print('processing pairwise combination: ') pprint(mapped_expression_ids) print('with condtion labels: ') condition_labels = self._build_condition_label_list(mapped_expression_ids) pprint(condition_labels) # skip if condition labels in this pairwise combination don't exist in # set of user requested condition labels skip = False for condition in condition_labels: if condition not in requested_condition_labels: log("skipping " + str(condition_labels)) skip = True if skip: continue sample_dir_group_file = self.get_sample_dir_group_file(mapped_expression_ids, condition_labels) log("about to run_ballgown_diff_exp") rscripts_dir = '/kb/module/rscripts' condition_labels_uniqued = list() for condition in condition_labels: if condition not in condition_labels_uniqued: condition_labels_uniqued.append(condition) output_csv = 'ballgown_diffexp_' + \ condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.tsv' volcano_plot_file = 'volcano_plot_' + \ condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.png' self.run_ballgown_diff_exp(rscripts_dir, sample_dir_group_file, ballgown_output_dir, output_csv, volcano_plot_file) log("back from run_ballgown_diff_exp, about to load diff exp matrix file") # diff_expr_matrix = self.load_diff_expr_matrix(ballgown_output_dir, # output_csv) # read file before its zipped self._update_output_file_header(os.path.join(ballgown_output_dir, output_csv)) diff_expr_file = dict() diff_expr_file.update({'condition_mapping': {condition_labels_uniqued[0]: condition_labels_uniqued[1]}}) diff_expr_file.update( {'diffexpr_filepath': os.path.join(ballgown_output_dir, output_csv)}) diff_expr_files.append(diff_expr_file) deu_param = { 'destination_ref': params['workspace_name'] + '/' + params['diff_expression_matrix_set_name'], 'diffexpr_data': diff_expr_files, 'tool_used': TOOL_NAME, 'tool_version': TOOL_VERSION, 'genome_ref': expression_set_data.get('genome_id'), } diff_expression_matrix_set_ref = self.deu.save_differential_expression_matrix_set( deu_param) returnVal = {'result_directory': ballgown_output_dir, 'diff_expression_matrix_set_ref': diff_expression_matrix_set_ref['diffExprMatrixSet_ref']} report_output = self._generate_report(params, ballgown_output_dir, diff_expression_matrix_set_ref) returnVal.update(report_output) return returnVal
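# --- Sketch of the sample_dir_group_file handed to the R script -------------
# get_sample_dir_group_file() writes one line per expression object:
# "<expression_dir> <group_index>", where the group index is the position of
# the expression's condition label (per the run_ballgown_diff_exp docstring).
# For a hypothetical WT vs EXP pairwise combination the file would look
# roughly like (paths under scratch are illustrative):
#
#   /kb/module/work/tmp/<uuid>/WT_rep1_expression 0
#   /kb/module/work/tmp/<uuid>/WT_rep2_expression 0
#   /kb/module/work/tmp/<uuid>/EXP_rep1_expression 1
#   /kb/module/work/tmp/<uuid>/EXP_rep2_expression 1
#
# Row order does not matter, but each directory must stay paired with its
# group label, and each condition group needs at least two members for
# Ballgown to run.
# ----------------------------------------------------------------------------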
class MutualInfoUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_flux_mutual_information_analysis_params(self, params):
        """
        _validate_run_flux_mutual_information_analysis_params:
            validates params passed to run_flux_mutual_information_analysis method
        """
        log('start validating run_flux_mutual_information_analysis params')

        # check for required parameters
        for p in ['fbamodel_id', 'compounds', 'media_id', 'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def _get_file_from_ws(self, workspace, obj_name):
        try:
            file_path = self.ws.get_objects([{'name': obj_name,
                                              'workspace': workspace}])[0]
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' +
                             workspace + '/' + obj_name + ')' + str(e))
        return file_path

    def _make_media_files(self, ws_name, base, compounds):
        """
        Build and store media objects for each combination of compound added to the base media.
        :param base: The base media file
        :param compounds: the set of compounds to test
        :return: A list of media ids and a matrix with each media combination defined
        """
        base_media = self._get_file_from_ws(ws_name, base)['data']

        media_ids = [base_media['id']]
        new_media_list = []
        media_matrix = [[""] + compounds]
        # first row of the matrix: the base media with none of the compounds added
        media_matrix.append([base_media['id']] + [0] * len(compounds))
        for n_comp in range(1, len(compounds) + 1):
            for combo in combinations(compounds, n_comp):
                new_media_id = base_media['id'] + '_v%s' % len(media_matrix)
                media_ids.append(new_media_id)
                media_matrix.append([new_media_id] +
                                    [1 if comp in combo else 0 for comp in compounds])
                new_media = deepcopy(base_media)
                new_media['id'] = new_media_id
                new_media['name'] = new_media_id
                for new_comp in combo:
                    new_media['mediacompounds'].append(
                        {'compound_ref': '48/1/1/compounds/id/%s' % new_comp.split('_')[0],
                         'concentration': 1.0,
                         'maxFlux': 1000,
                         'minFlux': -1000})
                new_media_list.append(new_media)

        print("Made %s Media Files" % (len(media_ids) - 1))
        info = self.ws.save_objects({
            'workspace': ws_name,
            "objects": [{
                "type": "KBaseBiochem.Media",
                "data": media,
                "name": media['name']
            } for media in new_media_list]
        })
        print info

        return media_ids, media_matrix

    def _generate_html_report(self, result_directory, mutual_info_dict):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'mutual_information_report.html')

        shutil.copy(os.path.join(result_directory, 'MI_plot.png'),
                    os.path.join(output_directory, 'MI_plot.png'))

        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'

        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Mutual Information App'})

        return html_report

    def _generate_report(self, result_directory, mutual_info_dict, params):
        """
        _generate_report: generate summary report
        """
        log('creating report')

        output_html_files = self._generate_html_report(result_directory,
                                                       mutual_info_dict)

        report_params = {'message': '',
                         'workspace_name': params.get('workspace_name'),
                         'html_links': output_html_files,
                         'direct_html_link_index': 0,
                         'html_window_height': 333,
                         'report_object_name': 'MutualInformation_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'],
                         'report_ref': output['ref']}

        return report_output

    def _generate_mutual_info(self, media_matrix, fba_file):
        df1 = pd.read_csv(fba_file)
        df1.as_matrix()

        # ---- Input validation of Media/FBAs with binary-matrix FBAs ----
        # 1.0 Number of rows in the Media.csv file must equal (number of columns - 1).
        #     If they differ, report an error about a mismatched number of FBAs
        #     between the media and the binary matrix.
        # 1.1 Check that the elements of the Media.csv file contain only binary
        #     values (i.e. 0 and 1); otherwise report an error about invalid input values.
        # 1.2 Check that the compounds in the Media.csv file are consistent with the
        #     number of FBAs; otherwise report an error about invalid input values.
        s_df1 = df1.shape
        s_df2 = media_matrix.shape

        Temp_df2 = np.array(media_matrix.values)
        # Keep only the matrix elements: drop the first column, keep all rows
        Temp_df2 = Temp_df2[0:, 1:]
        Bin_val_check = np.array_equal(Temp_df2, Temp_df2.astype(bool))
        num_compounds = (s_df2[1]) - 1

        if ((s_df1[1] - 1) != s_df2[0]) or (Bin_val_check != True) or (
                int(math.log(s_df2[0], 2)) != num_compounds):
            print('invalid input values')

        # ----- All possible combinations of the chemical compounds -----
        # 2.0 Separating m0 from the rest of the labels
        Temp1_df2 = media_matrix
        cols = Temp1_df2.columns
        for i in range(1, len(cols)):
            Temp1_df2.loc[Temp1_df2[cols[i]] == 1, cols[i]] = cols[i]
        print Temp1_df2

        # 2.1 Creating a dictionary for all FBAs except m0
        print len(Temp1_df2)
        mydict = {}
        for x in range(0, len(Temp1_df2)):
            for i in range(1, s_df2[1]):
                currentid = Temp1_df2.iloc[x, 0]
                currentvalue = Temp1_df2.iloc[x, i]
                mydict.setdefault(currentid, [])
                if currentvalue > 0:
                    mydict[currentid].append(currentvalue)

        # Add the first key as m0
        media_0_name = 'm0'
        mydict[media_0_name] = "['0']"

        # Sort the keys
        mydict = collections.OrderedDict(natsort.natsorted(mydict.items()))
        print mydict

        for k, v in mydict.iteritems():
            print k, v

        # List of compound combinations
        my_combi_list = []
        Compounds_Combi = list(range(1, num_compounds + 1))
        for L in range(0, len(Compounds_Combi) + 1):
            for subset in itertools.combinations(Compounds_Combi, L):
                my_combi_list.append(list(subset))
        print my_combi_list

        # Build a dictionary keyed by compound combination whose values are the
        # corresponding FBA lists in df2
        result_dict = {}
        for element in my_combi_list[1:]:
            for k, v in mydict.iteritems():
                if set(v).issubset(set(map(lambda x: str(x), element))):
                    key = ','.join(map(lambda x: str(x), element))
                    if result_dict.get(key):
                        media_list = result_dict[key]
                        media_list.append(k)
                        media_list = list(set(media_list))
                        result_dict.update({key: media_list})
                    else:
                        result_dict.update({key: [media_0_name, k]})
        print result_dict

        # Build a dictionary keyed by compound combination whose values are that
        # combination's FBA columns from df1
        All_Comp_Combi_dic = {}
        for column, value in result_dict.items():
            All_Comp_Combi_dic.update({column: df1.get(value)})

        # To print an item from the All_Comp_Combi_dic
        df = (pd.DataFrame(All_Comp_Combi_dic.items()))
        # print df[0]
        # print df[1][7]

        MI_dict = {}
        for k in range(0, len(df[0])):
            drop_rows_df = df[1][k].drop_duplicates(keep="first")
            drop_columns_df = drop_rows_df.T.drop_duplicates(keep="first").T

            remove = []
            removed = {}
            cols = df[1][k].columns
            for i in range(len(cols) - 1):
                duplicated = []
                v = df[1][k][cols[i]].values
                for j in range(i + 1, len(cols)):
                    if np.array_equal(v, df[1][k][cols[j]].values):
                        remove.append(cols[j])
                        duplicated.append(cols[j])
                if duplicated and cols[i] not in remove:
                    removed.update({cols[i]: duplicated})

            count = {}
            for key, value in removed.items():
                count.update({key: len(value)})
            # print v
            # print drop_columns_df

            values = count.values()
            # print values
            values = map(lambda x: x + 1, values)
            # print values
            d = {x: values.count(x) for x in values}

            # ------- Mutual Information (MI) calculation -------
            FBAs = len(df[1][k].columns)
            pure_entropy = math.log(FBAs, 2)
            # print pure_entropy

            # If no duplicates exist and the list "values" is empty
            if not values:
                # print("List is empty")
                No_duplicate_FBAs = len(drop_columns_df.columns)
                conditional_entropy = -1 * (No_duplicate_FBAs * (
                    (1 / No_duplicate_FBAs) * ((1 / 1) * math.log(1.0 / 1.0, 2))))
                Mutual_Info = pure_entropy - conditional_entropy
                # print('Mutual Info:', Mutual_Info)

            if values:
                # If duplicates exist and the list "values" is not empty
                conditional_entropy = 0
                for key in d:
                    # print key, d[key]
                    Temp = -1 * d[key] * (key / float(FBAs)) * key * (
                        1.0 / key) * math.log(1.0 / key, 2)
                    conditional_entropy = Temp + conditional_entropy
                    # print "%3f" % Temp
                Mutual_Info = pure_entropy - conditional_entropy

            MI_dict[df[0][k]] = Mutual_Info

        # Sort MI_dict
        MI_dict = sorted(MI_dict.items(), key=lambda x: (-len(x[0]), x[0]))
        MI_dict = OrderedDict(MI_dict)

        print("Plot MI_dict")
        plt.bar(range(len(MI_dict)), MI_dict.values(), align='center',
                alpha=0.5, width=0.7)
        plt.xticks(range(len(MI_dict)), MI_dict.keys(), rotation='vertical')
        plt.xlabel('Compound Combinations')
        plt.ylabel('Mutual Information (in Bits)')
        plt.title("Organism:XYZ")
        fig1 = plt.gcf()
        fig1.savefig(os.path.join(self.scratch, 'MI_plot.png'), dpi=100)

        return MI_dict
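# --- Worked example of the mutual-information bookkeeping above -------------
# _generate_mutual_info() scores each compound combination by how well the
# corresponding FBA flux columns are distinguished:
#
#   pure_entropy        = log2(FBAs)                       # FBAs = number of columns
#   conditional_entropy = sum over k of d[k] * (k / FBAs) * log2(k)
#   Mutual_Info         = pure_entropy - conditional_entropy
#
# where d[k] is the number of groups of k identical (duplicated) FBA columns;
# columns with no duplicates contribute nothing to the conditional term.
#
# Hypothetical illustration: with FBAs = 8 columns, of which two pairs are
# exact duplicates (d = {2: 2}) and the other four columns are unique:
#
#   pure_entropy        = log2(8)             = 3.0 bits
#   conditional_entropy = 2 * (2/8) * log2(2) = 0.5 bits
#   Mutual_Info         = 3.0 - 0.5           = 2.5 bits
# ----------------------------------------------------------------------------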
class FunctionalEnrichmentUtil: def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _validate_run_fe1_params(self, params): """ _validate_run_fe1_params: validates params passed to run_fe1 method """ log('start validating run_fe1 params') # check for required parameters for p in ['feature_set_ref', 'workspace_name']: if p not in params: raise ValueError( '"{}" parameter is required, but missing'.format(p)) def _generate_report(self, enrichment_map, result_directory, workspace_name, feature_id_go_id_list_map, feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids): """ _generate_report: generate summary report """ log('start creating report') output_files = self._generate_output_file_list( result_directory, enrichment_map, feature_id_go_id_list_map, feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids) output_html_files = self._generate_html_report(result_directory, enrichment_map) report_object_name = 'kb_functional_enrichment_1_report_' + str( uuid.uuid4()) report_params = { 'message': '', 'workspace_name': workspace_name, 'file_links': output_files, 'html_links': output_html_files, 'direct_html_link_index': 0, 'html_window_height': 333, 'report_object_name': report_object_name } kbase_report_client = KBaseReport(self.callback_url) output = kbase_report_client.create_extended_report(report_params) report_output = { 'report_name': output['name'], 'report_ref': output['ref'] } return report_output def _generate_supporting_files(self, result_directory, enrichment_map, feature_id_go_id_list_map, feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids): """ _generate_supporting_files: generate varies debug files """ supporting_files = list() feature_id_go_ids_map_file = os.path.join(result_directory, 'feature_id_go_ids_map.txt') go_id_genome_feature_ids_map_file = os.path.join( result_directory, 'go_id_genome_feature_ids_map.txt') go_id_set_feature_ids_map_file = os.path.join( result_directory, 'go_id_feature_set_feature_ids_map.txt') feature_ids_file = os.path.join(result_directory, 'feature_ids.txt') feature_set_ids_file = os.path.join(result_directory, 'feature_set_ids.txt') fisher_variables_file = os.path.join(result_directory, 'fisher_variables.txt') genome_info_file = os.path.join(result_directory, 'genome_info.txt') go_id_parent_ids_map_file = os.path.join(result_directory, 'go_id_parent_ids_map.txt') supporting_files.append(feature_id_go_ids_map_file) supporting_files.append(go_id_genome_feature_ids_map_file) supporting_files.append(feature_ids_file) supporting_files.append(feature_set_ids_file) supporting_files.append(fisher_variables_file) supporting_files.append(genome_info_file) supporting_files.append(go_id_parent_ids_map_file) supporting_files.append(go_id_set_feature_ids_map_file) total_feature_ids = feature_id_go_id_list_map.keys() feature_ids_with_feature = [] for feature_id, go_ids in feature_id_go_id_list_map.iteritems(): if isinstance(go_ids, list): feature_ids_with_feature.append(feature_id) genome_name = self.ws.get_object_info3( {'objects': [{ 'ref': genome_ref }]})['infos'][0][1] with open(go_id_parent_ids_map_file, 'wb') as go_id_parent_ids_map_file: for go_id, parent_ids in go_id_parent_ids_map.iteritems(): go_id_parent_ids_map_file.write('{}: {}\n'.format( go_id, ', '.join(parent_ids))) with open(genome_info_file, 'wb') as genome_info_file: genome_info_file.write('genome_name: 
{}\n'.format(genome_name)) genome_info_file.write('features: {}\n'.format( len(total_feature_ids))) genome_info_file.write('features with term: {}'.format( len(feature_ids_with_feature))) with open(feature_set_ids_file, 'wb') as feature_set_ids_file: feature_set_ids_file.write('\n'.join(feature_set_ids)) with open(feature_id_go_ids_map_file, 'wb') as feature_id_go_ids_map_file: with open(feature_ids_file, 'wb') as feature_ids_file: for feature_id, go_ids in feature_id_go_id_list_map.iteritems( ): feature_ids_file.write('{} {}\n'.format( feature_id, feature_id in feature_set_ids)) if isinstance(go_ids, str): feature_id_go_ids_map_file.write('{} {}\n'.format( feature_id, go_ids)) else: feature_id_go_ids_map_file.write('{} {}\n'.format( feature_id, ', '.join(go_ids))) with open(go_id_genome_feature_ids_map_file, 'wb') as go_id_genome_feature_ids_map_file: with open(go_id_set_feature_ids_map_file, 'wb') as go_id_set_feature_ids_map_file: with open(fisher_variables_file, 'wb') as fisher_variables_file: for go_id, go_info in enrichment_map.iteritems(): mapped_features = go_info.get('mapped_features') fs_mapped_features = list( set(mapped_features).intersection(feature_set_ids)) mapped_features_line = '{}: {}\n'.format( go_id, ', '.join(mapped_features)) go_id_genome_feature_ids_map_file.write( mapped_features_line) set_mapped_features_line = '{}: {}\n'.format( go_id, ', '.join(fs_mapped_features)) go_id_set_feature_ids_map_file.write( set_mapped_features_line) a_value = go_info.get('num_in_subset_feature_set') b_value = len(feature_set_ids) - a_value c_value = len(mapped_features) - a_value d_value = len(feature_ids) - len( feature_set_ids) - c_value p_value = go_info.get('raw_p_value') fisher_variables_file.write( '{} a:{} b:{} c:{} d:{} '.format( go_id, a_value, b_value, c_value, d_value)) fisher_variables_file.write( 'p_value:{}\n'.format(p_value)) result_file = os.path.join(result_directory, 'supporting_files.zip') with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED, allowZip64=True) as zip_file: for supporting_file in supporting_files: zip_file.write(supporting_file, os.path.basename(supporting_file)) return [{ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'GO term functional enrichment supporting files' }] def _generate_output_file_list(self, result_directory, enrichment_map, feature_id_go_id_list_map, feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids): """ _generate_output_file_list: zip result files and generate file_links for report """ log('start packing result files') output_files = list() result_file = os.path.join(result_directory, 'functional_enrichment.csv') with open(result_file, 'wb') as csv_file: writer = csv.writer(csv_file) writer.writerow([ 'term_id', 'term', 'ontology', 'num_in_feature_set', 'num_in_ref_genome', 'raw_p_value', 'adjusted_p_value' ]) for key, value in enrichment_map.iteritems(): writer.writerow([ key, value['go_term'], value['namespace'], value['num_in_subset_feature_set'], value['num_in_ref_genome'], value['raw_p_value'], value['adjusted_p_value'] ]) output_files.append({ 'path': result_file, 'name': os.path.basename(result_file), 'label': os.path.basename(result_file), 'description': 'GO term functional enrichment' }) supporting_files = self._generate_supporting_files( result_directory, enrichment_map, feature_id_go_id_list_map, feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids) output_files += supporting_files return output_files def 
_generate_html_report(self, result_directory, enrichment_map): """ _generate_html_report: generate html summary report """ log('start generating html report') html_report = list() output_directory = os.path.join(self.scratch, str(uuid.uuid4())) self._mkdir_p(output_directory) result_file_path = os.path.join(output_directory, 'report.html') enrichment_table = '' data = csv.DictReader(open( os.path.join(result_directory, 'functional_enrichment.csv')), delimiter=',') sortedlist = sorted( data, key=lambda row: (float(row['adjusted_p_value']), float(row['raw_p_value']), float(row['num_in_ref_genome'])), reverse=False) for row in sortedlist: # if row['num_in_feature_set'] != '0': enrichment_table += '<tr><td>{}</td>'.format(row['term_id']) enrichment_table += '<td>{}</td>'.format(row['term']) enrichment_table += '<td>{}</td>'.format(row['ontology']) enrichment_table += '<td>{}</td>'.format(row['num_in_feature_set']) enrichment_table += '<td>{}</td>'.format(row['num_in_ref_genome']) enrichment_table += '<td>{}</td>'.format(row['raw_p_value']) enrichment_table += '<td>{}</td></tr>'.format( row['adjusted_p_value']) with open(result_file_path, 'w') as result_file: with open( os.path.join(os.path.dirname(__file__), 'report_template.html'), 'r') as report_template_file: report_template = report_template_file.read() report_template = report_template.replace( '<tr>Enrichment_Table</tr>', enrichment_table) result_file.write(report_template) report_shock_id = self.dfu.file_to_shock({ 'file_path': output_directory, 'pack': 'zip' })['shock_id'] html_report.append({ 'shock_id': report_shock_id, 'name': os.path.basename(result_file_path), 'label': os.path.basename(result_file_path), 'description': 'HTML summary report for Functional Enrichment App' }) return html_report def _get_go_maps_from_genome(self, genome_ref): """ _search_genome: search genome data """ log('start parsing GO terms from genome') feature_num = self.gsu.search({'ref': genome_ref})['num_found'] genome_features = self.gsu.search({ 'ref': genome_ref, 'limit': feature_num, 'sort_by': [['feature_id', True]] })['features'] feature_id_go_id_list_map = {} go_id_feature_id_list_map = {} go_id_go_term_map = {} feature_id_feature_info_map = {} for genome_feature in genome_features: feature_id = genome_feature.get('feature_id') feature_func = genome_feature.get('function') feature_type = genome_feature.get('feature_type') ontology_terms = genome_feature.get('ontology_terms') feature_id_feature_info_map.update({ feature_id: { 'function': feature_func, 'feature_type': feature_type } }) go_id_list = [] if ontology_terms: for ontology_id, ontology_term in ontology_terms.iteritems(): if re.match('[gG][oO]\:.*', ontology_id): go_id_go_term_map.update({ontology_id: ontology_term}) go_id_list.append(ontology_id) if go_id_list: feature_id_go_id_list_map.update({feature_id: go_id_list}) for go_id in go_id_list: if go_id in go_id_feature_id_list_map: feature_ids = go_id_feature_id_list_map.get(go_id) feature_ids.append(feature_id) go_id_feature_id_list_map.update({go_id: feature_ids}) else: go_id_feature_id_list_map.update({go_id: [feature_id]}) else: feature_id_go_id_list_map.update({feature_id: 'Unlabeled'}) return (feature_id_go_id_list_map, go_id_feature_id_list_map, go_id_go_term_map, feature_id_feature_info_map) def _process_feature_set(self, feature_set_ref): """ _process_feature_set: process FeatureSet object return: genome_ref: reference Genome object ref feature_set_ids: FeatureSet feature ids """ log('start processing FeatureSet object') 
    def _process_feature_set(self, feature_set_ref):
        """
        _process_feature_set: process FeatureSet object

        return:
        genome_ref: reference Genome object ref
        feature_set_ids: FeatureSet feature ids
        """
        log('start processing FeatureSet object')

        feature_set_data = self.ws.get_objects2(
            {'objects': [{'ref': feature_set_ref}]})['data'][0]['data']
        feature_elements = feature_set_data['elements']
        feature_set_ids = []
        genome_ref_array = []
        for feature_id, genome_refs in feature_elements.iteritems():
            feature_set_ids.append(feature_id)
            genome_ref_array += genome_refs

        if len(set(genome_ref_array)) > 1:
            error_msg = 'FeatureSet has multiple reference Genomes: {}'.format(genome_ref_array)
            raise ValueError(error_msg)

        return feature_set_ids, genome_ref_array[0]

    def _get_immediate_parents(self, ontology_hash, go_id, is_a_relationship,
                               regulates_relationship, part_of_relationship):
        """
        _get_immediate_parents: get immediate parent go_ids for a given go_id
        """
        parent_ids = []
        ontology_info = ontology_hash.get(go_id, {})

        if is_a_relationship:
            is_a_parents = ontology_info.get('is_a')
            if is_a_parents:
                for parent_string in is_a_parents:
                    is_a_parent_id = parent_string.split('!')[0][:-1]
                    parent_ids.append(is_a_parent_id)

        if regulates_relationship:
            relationship = ontology_info.get('relationship')
            if relationship:
                for relationship_string in relationship:
                    if relationship_string.split(' ')[0] == 'regulates':
                        parent_ids.append(relationship_string.split(' ')[1])

        if part_of_relationship:
            relationship = ontology_info.get('relationship')
            if relationship:
                for relationship_string in relationship:
                    if relationship_string.split(' ')[0] == 'part_of':
                        parent_ids.append(relationship_string.split(' ')[1])

        return parent_ids

    def _fetch_all_parents_go_ids(self, ontology_hash, go_id, is_a_relationship,
                                  regulates_relationship, part_of_relationship):
        """
        _fetch_all_parents_go_ids: recursively fetch all parent go_ids
        """
        parent_ids = self._get_immediate_parents(ontology_hash, go_id,
                                                 is_a_relationship,
                                                 regulates_relationship,
                                                 part_of_relationship)
        if parent_ids:
            # copy the list so the loop below does not also iterate over the
            # grandparent ids that get appended while it runs
            grand_parent_ids = list(parent_ids)
            for parent_id in parent_ids:
                grand_parent_ids += self._fetch_all_parents_go_ids(
                    ontology_hash, parent_id, is_a_relationship,
                    regulates_relationship, part_of_relationship)[parent_id]
            return {go_id: list(set(grand_parent_ids))}
        else:
            return {go_id: []}

    def _generate_parent_child_map(self, ontology_hash, go_ids,
                                   is_a_relationship=True,
                                   regulates_relationship=True,
                                   part_of_relationship=False):
        """
        _generate_parent_child_map: fetch parent go_ids for the given go_ids
        """
        log('start fetching parent go_ids')
        start = time.time()

        go_id_parent_ids_map = {}

        for go_id in go_ids:
            fetch_result = self._fetch_all_parents_go_ids(ontology_hash, go_id,
                                                          is_a_relationship,
                                                          regulates_relationship,
                                                          part_of_relationship)
            go_id_parent_ids_map.update(fetch_result)

        end = time.time()
        print('used {:.2f} s'.format(end - start))

        return go_id_parent_ids_map

    def _round(self, number, digits=3):
        """
        round number to given digits
        """
        round_number = format(number, '.{}g'.format(digits))
        return round_number

    def __init__(self, config):
        self.ws_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
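    # Hypothetical config dict for __init__ above; the key names come from the
    # lookups in that method, the values below are invented placeholders.
    #
    #   config = {'workspace-url': 'https://kbase.us/services/ws',
    #             'SDK_CALLBACK_URL': 'http://127.0.0.1:9999',
    #             'KB_AUTH_TOKEN': '<auth token>',
    #             'shock-url': 'https://kbase.us/services/shock-api',
    #             'scratch': '/kb/module/work/tmp'}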
    def run_fe1(self, params):
        """
        run_fe1: Functional Enrichment One

        required params:
        feature_set_ref: FeatureSet object reference
        workspace_name: the name of the workspace it gets saved to

        optional params:
        propagation: includes is_a relationship to all go terms (default is 1)
        filter_ref_features: filter reference genome features with no go terms (default is 0)
        statistical_significance: parameter for statistical significance.
                                  Select one from left_tailed, right_tailed or two_tailed
                                  (default is left_tailed)
        ignore_go_term_not_in_feature_set: ignore GO term analysis if term is not associated
                                           with FeatureSet (default is 1)

        return:
        result_directory: folder path that holds all files generated by run_fe1
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning FunctionalEnrichmentUtil.run_fe1\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_fe1_params(params)
        propagation = params.get('propagation', True)
        filter_ref_features = params.get('filter_ref_features', False)
        statistical_significance = params.get('statistical_significance', 'left_tailed')
        ignore_go_term_not_in_feature_set = params.get('ignore_go_term_not_in_feature_set', True)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        feature_set_ids, genome_ref = self._process_feature_set(params.get('feature_set_ref'))

        (feature_id_go_id_list_map, go_id_feature_id_list_map,
         go_id_go_term_map, feature_id_feature_info_map) = self._get_go_maps_from_genome(genome_ref)

        if filter_ref_features:
            log('start filtering features with no term')
            feature_ids = []
            for feature_id, go_ids in feature_id_go_id_list_map.iteritems():
                if isinstance(go_ids, list):
                    feature_ids.append(feature_id)
        else:
            feature_ids = feature_id_go_id_list_map.keys()

        ontology_hash = dict()
        ontologies = self.ws.get_objects([{'workspace': 'KBaseOntology',
                                           'name': 'gene_ontology'},
                                          {'workspace': 'KBaseOntology',
                                           'name': 'plant_ontology'}])
        ontology_hash.update(ontologies[0]['data']['term_hash'])
        ontology_hash.update(ontologies[1]['data']['term_hash'])

        if propagation:
            go_id_parent_ids_map = self._generate_parent_child_map(ontology_hash,
                                                                   go_id_go_term_map.keys(),
                                                                   regulates_relationship=False)
        else:
            go_id_parent_ids_map = {}
            for go_id in go_id_go_term_map.keys():
                go_id_parent_ids_map.update({go_id: []})

        log('including parents to feature id map')
        for go_id, parent_ids in go_id_parent_ids_map.iteritems():
            mapped_features = go_id_feature_id_list_map.get(go_id)

            for parent_id in parent_ids:
                parent_mapped_features = go_id_feature_id_list_map.get(parent_id)
                if not parent_mapped_features:
                    parent_mapped_features = []
                if mapped_features:
                    parent_mapped_features += mapped_features
                go_id_feature_id_list_map.update({parent_id: list(set(parent_mapped_features))})

        log('start calculating p-values')
        enrichment_map = {}
        go_info_map = {}
        all_raw_p_value = []
        pos = 0
        for go_id, go_term in go_id_go_term_map.iteritems():
            mapped_features = go_id_feature_id_list_map.get(go_id)
            # in feature_set matches go_id
            a = len(set(mapped_features).intersection(feature_set_ids))
            # ignore go term analysis if not associated with FeatureSet
            if ignore_go_term_not_in_feature_set and a == 0:
                continue
            # in feature_set doesn't match go_id
            b = len(feature_set_ids) - a
            # not in feature_set matches go_id
            c = len(mapped_features) - a
            # not in feature_set doesn't match go_id
            d = len(feature_ids) - len(feature_set_ids) - c

            fisher_value = fisher.pvalue(a, b, c, d)
            if statistical_significance == 'left_tailed':
                raw_p_value = self._round(fisher_value.left_tail)
            elif statistical_significance == 'right_tailed':
                raw_p_value = self._round(fisher_value.right_tail)
            elif statistical_significance == 'two_tailed':
                raw_p_value = self._round(fisher_value.two_tail)
            else:
                raise ValueError('Improper statistical_significance value')
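            # The a/b/c/d values computed above form the 2x2 contingency table passed
            # to Fisher's exact test. Worked example with invented numbers: for a
            # feature set of 50 ids in a genome of 4000 (filtered) features, a GO term
            # mapped to 120 features of which 10 are in the feature set gives
            #   a = 10, b = 50 - 10 = 40, c = 120 - 10 = 110, d = 4000 - 50 - 110 = 3840
            # and fisher.pvalue(10, 40, 110, 3840) exposes the left-, right- and
            # two-tailed p-values of that table; only the selected tail is kept.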
            all_raw_p_value.append(raw_p_value)
            go_info_map.update({go_id: {'raw_p_value': raw_p_value,
                                        'num_in_ref_genome': len(mapped_features),
                                        'num_in_subset_feature_set': a,
                                        'pos': pos,
                                        'mapped_features': mapped_features}})
            pos += 1

        # Benjamini-Hochberg FDR correction of the raw p-values via R's stats::p.adjust
        stats = importr('stats')
        adjusted_p_values = stats.p_adjust(FloatVector(all_raw_p_value), method='fdr')

        for go_id, go_info in go_info_map.iteritems():
            if go_id not in ontology_hash:
                continue

            adjusted_p_value = self._round(adjusted_p_values[go_info.get('pos')])
            namespace = ontology_hash[go_id]['namespace']
            enrichment_map.update({go_id: {'raw_p_value': go_info.get('raw_p_value'),
                                           'adjusted_p_value': adjusted_p_value,
                                           'num_in_ref_genome': go_info.get('num_in_ref_genome'),
                                           'num_in_subset_feature_set':
                                               go_info.get('num_in_subset_feature_set'),
                                           'go_term': go_id_go_term_map.get(go_id),
                                           'namespace': namespace.split("_")[1][0].upper(),
                                           'mapped_features': go_info.get('mapped_features')}})

        returnVal = {'result_directory': result_directory}

        report_output = self._generate_report(enrichment_map, result_directory,
                                              params.get('workspace_name'),
                                              feature_id_go_id_list_map,
                                              feature_set_ids, genome_ref,
                                              go_id_parent_ids_map, feature_ids)
        returnVal.update(report_output)

        return returnVal
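    # Minimal usage sketch with hypothetical values ('fe_util' stands for an
    # instance of this class; in practice the generated SDK Impl layer typically
    # makes this call). Parameter names follow the run_fe1 docstring above.
    #
    #   params = {'feature_set_ref': '12345/6/1',
    #             'workspace_name': 'my_workspace',
    #             'propagation': 1,
    #             'filter_ref_features': 0,
    #             'statistical_significance': 'left_tailed',
    #             'ignore_go_term_not_in_feature_set': 1}
    #   result = fe_util.run_fe1(params)
    #   # result holds 'result_directory' plus the report fields returned by
    #   # _generate_report (report_name / report_ref per the docstring above).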