class NarrativeManager:

    KB_CELL = 'kb-cell'
    KB_TYPE = 'type'
    KB_APP_CELL = 'kb_app'
    KB_FUNCTION_CELL = 'function_input'
    KB_OUTPUT_CELL = 'function_output'
    KB_ERROR_CELL = 'kb_error'
    KB_CODE_CELL = 'kb_code'
    KB_STATE = 'widget_state'

    DEBUG = False

    DATA_PALETTES_TYPES = DataPaletteTypes(False)

    def __init__(self, config, ctx, set_api_cache, dps_cache):
        self.narrativeMethodStoreURL = config['narrative-method-store']
        self.set_api_cache = set_api_cache  # DynamicServiceCache type
        self.dps_cache = dps_cache  # DynamicServiceCache type
        self.token = ctx["token"]
        self.user_id = ctx["user_id"]
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.intro_md_file = config['intro-markdown-file']
        # For now, data palettes are enabled only for the internal Continuous Integration environment:
        if config['kbase-endpoint'].startswith("https://ci.kbase.us/"):
            self.DATA_PALETTES_TYPES = DataPaletteTypes(True)

    def list_objects_with_sets(self,
                               ws_id=None,
                               ws_name=None,
                               workspaces=None,
                               types=None,
                               include_metadata=0):
        if not workspaces:
            if (not ws_id) and (not ws_name):
                raise ValueError(
                    "Exactly one of the 'ws_id', 'ws_name', or 'workspaces' " +
                    "parameters must be set")
            workspaces = [self._get_workspace_name_or_id(ws_id, ws_name)]
        return self._list_objects_with_sets(workspaces, types,
                                            include_metadata)

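    # Illustrative sketch (not part of the original code): listing objects together with sets.
    # Assumes `nm` is a configured NarrativeManager; the workspace name and type are placeholders.
    #
    #   listing = nm.list_objects_with_sets(ws_name='my_workspace',
    #                                       types=['KBaseGenomes.Genome'])
    #   for entry in listing['data']:
    #       print(entry['object_info'][1])  # index 1 of the object info tuple is the name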
    def _list_objects_with_sets(self, workspaces, types, include_metadata):
        type_map = None
        if types is not None:
            type_map = {key: True for key in types}

        processed_refs = {}
        data = []
        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: processing sets")
        t1 = time.time()
        set_ret = self.set_api_cache.call_method(
            "list_sets", [{
                'workspaces': workspaces,
                'include_set_item_info': 1,
                'include_raw_data_palettes': 1,
                'include_metadata': include_metadata
            }], self.token)
        sets = set_ret['sets']
        dp_data = set_ret.get('raw_data_palettes')
        dp_refs = set_ret.get('raw_data_palette_refs')
        for set_info in sets:
            # Collect the info tuples of the items contained in this set
            target_set_items = []
            for set_item in set_info['items']:
                target_set_items.append(set_item['info'])
            if self._check_info_type(set_info['info'], type_map):
                data_item = {
                    'object_info': set_info['info'],
                    'set_items': {
                        'set_items_info': target_set_items
                    }
                }
                data.append(data_item)
                processed_refs[set_info['ref']] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t1) + ")")

        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: loading ws_info")
        t2 = time.time()
        ws_info_list = []
        if len(workspaces) == 1:
            ws = workspaces[0]
            ws_id = None
            ws_name = None
            if str(ws).isdigit():
                ws_id = int(ws)
            else:
                ws_name = str(ws)
            ws_info_list.append(
                self.ws.get_workspace_info({
                    "id": ws_id,
                    "workspace": ws_name
                }))
        else:
            ws_map = {key: True for key in workspaces}
            for ws_info in self.ws.list_workspace_info({'perm': 'r'}):
                if ws_info[1] in ws_map or str(ws_info[0]) in ws_map:
                    ws_info_list.append(ws_info)
        if self.DEBUG:
            print("    (time=" + str(time.time() - t2) + ")")

        if self.DEBUG:
            print(
                "NarrativeManager._list_objects_with_sets: loading workspace objects"
            )
        t3 = time.time()
        for info in WorkspaceListObjectsIterator(
                self.ws,
                ws_info_list=ws_info_list,
                list_objects_params={'includeMetadata': include_metadata}):
            item_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            if item_ref not in processed_refs and self._check_info_type(
                    info, type_map):
                data_item = {'object_info': info}
                data.append(data_item)
                processed_refs[item_ref] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t3) + ")")

        if self.DEBUG:
            print(
                "NarrativeManager._list_objects_with_sets: processing DataPalettes"
            )
        t5 = time.time()
        if dp_data is None or dp_refs is None:
            dps = self.dps_cache
            dp_ret = dps.call_method("list_data",
                                     [{
                                         'workspaces': workspaces,
                                         'include_metadata': include_metadata
                                     }], self.token)
            dp_data = dp_ret['data']
            dp_refs = dp_ret['data_palette_refs']
        for item in dp_data:
            ref = item['ref']
            if self._check_info_type(item['info'], type_map):
                data_item = None
                if ref in processed_refs:
                    data_item = processed_refs[ref]
                else:
                    data_item = {'object_info': item['info']}
                    processed_refs[ref] = data_item
                    data.append(data_item)
                dp_info = {}
                if 'dp_ref' in item:
                    dp_info['ref'] = item['dp_ref']
                if 'dp_refs' in item:
                    dp_info['refs'] = item['dp_refs']
                data_item['dp_info'] = dp_info
        if self.DEBUG:
            print("    (time=" + str(time.time() - t5) + ")")
        return {"data": data, 'data_palette_refs': dp_refs}

    def _check_info_type(self, info, type_map):
        if type_map is None:
            return True
        obj_type = info[2].split('-')[0]
        return type_map.get(obj_type, False)

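    # Added note: info[2] is the full object type string of the form 'Module.Type-major.minor'
    # (e.g. 'KBaseGenomes.Genome-1.0'); _check_info_type splits on '-' so the filter matches
    # any version of a requested type.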
    def copy_narrative(self, newName, workspaceRef, workspaceId):
        time_ms = int(round(time.time() * 1000))
        newWsName = self.user_id + ':narrative_' + str(time_ms)
        # add the 'narrative' field to newWsMeta later.
        newWsMeta = {"is_temporary": "false", "narrative_nice_name": newName}

        # start with getting the existing narrative object.
        currentNarrative = self.ws.get_objects([{'ref': workspaceRef}])[0]
        if not workspaceId:
            workspaceId = currentNarrative['info'][6]
        # Prepare the list of objects to exclude when cloning the workspace.
        # 1) currentNarrative object:
        excluded_list = [{'objid': currentNarrative['info'][0]}]
        # 2) objects of types handled via DataPalettes:
        data_palette_type = "DataPalette.DataPalette"
        excluded_types = [data_palette_type]
        excluded_types.extend(self.DATA_PALETTES_TYPES.keys())
        add_to_palette_list = []
        dp_detected = False
        for obj_type in excluded_types:
            list_objects_params = {'type': obj_type}
            if obj_type == data_palette_type:
                list_objects_params['showHidden'] = 1
            for info in WorkspaceListObjectsIterator(
                    self.ws,
                    ws_id=workspaceId,
                    list_objects_params=list_objects_params):
                if obj_type == data_palette_type:
                    dp_detected = True
                else:
                    add_to_palette_list.append({
                        'ref':
                        str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
                    })
                excluded_list.append({'objid': info[0]})
        # clone the workspace EXCEPT for the currentNarrative object and objects of DataPalette types:
        newWsId = self.ws.clone_workspace({
            'wsi': {
                'id': workspaceId
            },
            'workspace': newWsName,
            'meta': newWsMeta,
            'exclude': excluded_list
        })[0]
        try:
            if dp_detected:
                self.dps_cache.call_method(
                    "copy_palette", [{
                        'from_workspace': str(workspaceId),
                        'to_workspace': str(newWsId)
                    }], self.token)
            if len(add_to_palette_list) > 0:
                # There are objects in the source workspace whose types are handled via
                # DataPalettes but which are physically stored in the source workspace
                # rather than referenced from a DataPalette object, so they weren't
                # copied by "dps.copy_palette".
                self.dps_cache.call_method("add_to_palette",
                                           [{
                                               'workspace': str(newWsId),
                                               'new_refs': add_to_palette_list
                                           }], self.token)

            # update the ref inside the narrative object and the new workspace metadata.
            newNarMetadata = currentNarrative['info'][10]
            newNarMetadata['name'] = newName
            newNarMetadata['ws_name'] = newWsName
            newNarMetadata['job_info'] = json.dumps({
                'queue_time': 0,
                'running': 0,
                'completed': 0,
                'run_time': 0,
                'error': 0
            })

            currentNarrative['data']['metadata']['name'] = newName
            currentNarrative['data']['metadata']['ws_name'] = newWsName
            currentNarrative['data']['metadata']['job_ids'] = {
                'apps': [],
                'methods': [],
                'job_usage': {
                    'queue_time': 0,
                    'run_time': 0
                }
            }
            # save the shiny new Narrative so it's at version 1
            newNarInfo = self.ws.save_objects({
                'id': newWsId,
                'objects': [{
                    'type': currentNarrative['info'][2],
                    'data': currentNarrative['data'],
                    'provenance': currentNarrative['provenance'],
                    'name': currentNarrative['info'][1],
                    'meta': newNarMetadata
                }]
            })
            # now, just update the workspace metadata to point
            # to the new narrative object
            newNarId = newNarInfo[0][0]
            self.ws.alter_workspace_metadata({
                'wsi': {
                    'id': newWsId
                },
                'new': {
                    'narrative': str(newNarId)
                }
            })
            return {'newWsId': newWsId, 'newNarId': newNarId}
        except:
            # the copy is broken, so delete it to get it out of the way
            self.ws.delete_workspace({'id': newWsId})
            raise  # re-raise the original exception

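    # Illustrative sketch (not part of the original code): copying a Narrative.
    # The object reference and workspace id below are placeholders.
    #
    #   result = nm.copy_narrative(newName='My Copy', workspaceRef='123/1/5', workspaceId=123)
    #   # result -> {'newWsId': <new workspace id>, 'newNarId': <new narrative object id>}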
    def create_new_narrative(self, app, method, appparam, appData, markdown,
                             copydata, importData, includeIntroCell):
        if app and method:
            raise ValueError(
                "Must provide no more than one of the app or method params")

        if (not importData) and copydata:
            importData = copydata.split(';')

        if (not appData) and appparam:
            appData = []
            for tmp_item in appparam.split(';'):
                tmp_tuple = tmp_item.split(',')
                step_pos = None
                if tmp_tuple[0]:
                    try:
                        step_pos = int(tmp_tuple[0])
                    except ValueError:
                        pass
                appData.append([step_pos, tmp_tuple[1], tmp_tuple[2]])
        cells = None
        if app:
            cells = [{"app": app}]
        elif method:
            cells = [{"method": method}]
        elif markdown:
            cells = [{"markdown": markdown}]
        return self._create_temp_narrative(cells, appData, importData,
                                           includeIntroCell)

    def _get_intro_markdown(self):
        """
        Reads and returns the introductory markdown text.
        """
        # Load introductory markdown text
        with open(self.intro_md_file) as intro_file:
            intro_md = intro_file.read()
        return intro_md

    def _create_temp_narrative(self, cells, parameters, importData,
                               includeIntroCell):
        # Migration to python of JavaScript class from https://github.com/kbase/kbase-ui/blob/4d31151d13de0278765a69b2b09f3bcf0e832409/src/client/modules/plugins/narrativemanager/modules/narrativeManager.js#L414
        narr_id = int(round(time.time() * 1000))
        workspaceName = self.user_id + ':narrative_' + str(narr_id)
        narrativeName = "Narrative." + str(narr_id)

        ws = self.ws
        ws_info = ws.create_workspace({
            'workspace': workspaceName,
            'description': ''
        })
        newWorkspaceInfo = ServiceUtils.workspaceInfoToObject(ws_info)
        narrativeObject, metadataExternal = self._fetchNarrativeObjects(
            workspaceName, cells, parameters, includeIntroCell)
        objectInfo = ws.save_objects({
            'workspace': workspaceName,
            'objects': [{
                'type': 'KBaseNarrative.Narrative',
                'data': narrativeObject,
                'name': narrativeName,
                'meta': metadataExternal,
                'provenance': [{
                    'script': 'NarrativeManager.py',
                    'description': 'Created new Workspace/Narrative bundle.'
                }],
                'hidden': 0
            }]
        })[0]
        objectInfo = ServiceUtils.objectInfoToObject(objectInfo)
        self._completeNewNarrative(newWorkspaceInfo['id'], objectInfo['id'],
                                   importData)
        return {'workspaceInfo': newWorkspaceInfo, 'narrativeInfo': objectInfo}

    def _fetchNarrativeObjects(self, workspaceName, cells, parameters,
                               includeIntroCell):
        if not cells:
            cells = []
        # fetchSpecs
        appSpecIds = []
        methodSpecIds = []
        specMapping = {'apps': {}, 'methods': {}}
        for cell in cells:
            if 'app' in cell:
                appSpecIds.append(cell['app'])
            elif 'method' in cell:
                methodSpecIds.append(cell['method'])
        nms = NarrativeMethodStore(self.narrativeMethodStoreURL,
                                   token=self.token)
        if len(appSpecIds) > 0:
            appSpecs = nms.get_app_spec({'ids': appSpecIds})
            for spec in appSpecs:
                spec_id = spec['info']['id']
                specMapping['apps'][spec_id] = spec
        if len(methodSpecIds) > 0:
            methodSpecs = nms.get_method_spec({'ids': methodSpecIds})
            for spec in methodSpecs:
                spec_id = spec['info']['id']
                specMapping['methods'][spec_id] = spec
        # end of fetchSpecs
        metadata = {
            'job_ids': {
                'methods': [],
                'apps': [],
                'job_usage': {
                    'queue_time': 0,
                    'run_time': 0
                }
            },
            'format': 'ipynb',
            'creator': self.user_id,
            'ws_name': workspaceName,
            'name': 'Untitled',
            'type': 'KBaseNarrative.Narrative',
            'description': '',
            'data_dependencies': []
        }
        cellData = self._gatherCellData(cells, specMapping, parameters,
                                        includeIntroCell)
        narrativeObject = {
            'nbformat_minor': 0,
            'cells': cellData,
            'metadata': metadata,
            'nbformat': 4
        }
        metadataExternal = {}
        for key in metadata:
            value = metadata[key]
            if isinstance(value, basestring):
                metadataExternal[key] = value
            else:
                metadataExternal[key] = json.dumps(value)
        return [narrativeObject, metadataExternal]

    def _gatherCellData(self, cells, specMapping, parameters,
                        includeIntroCell):
        cell_data = []
        if includeIntroCell == 1:
            cell_data.append({
                'cell_type': 'markdown',
                'source': self._get_intro_markdown(),
                'metadata': {}
            })
        for cell_pos, cell in enumerate(cells):
            if 'app' in cell:
                cell_data.append(
                    self._buildAppCell(len(cell_data),
                                       specMapping['apps'][cell['app']],
                                       parameters))
            elif 'method' in cell:
                cell_data.append(
                    self._buildMethodCell(
                        len(cell_data), specMapping['methods'][cell['method']],
                        parameters))
            elif 'markdown' in cell:
                cell_data.append({
                    'cell_type': 'markdown',
                    'source': cell['markdown'],
                    'metadata': {}
                })
            else:
                raise ValueError("cannot add cell #" + str(cell_pos) +
                                 ", unrecognized cell content")
        return cell_data

    def _buildAppCell(self, pos, spec, params):
        cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4())
        cell = {
            'cell_type': 'markdown',
            'source': "<div id='" + cellId + "'></div>" + "\n<script>" +
                      "$('#" + cellId + "').kbaseNarrativeAppCell({'appSpec' : '" +
                      self._safeJSONStringify(spec) + "', 'cellId' : '" + cellId +
                      "'});" + "</script>",
            'metadata': {}
        }
        cellInfo = {}
        widgetState = []
        cellInfo[self.KB_TYPE] = self.KB_APP_CELL
        cellInfo['app'] = spec
        if params:
            steps = {}
            for param in params:
                stepid = 'step_' + str(param[0])
                if stepid not in steps:
                    steps[stepid] = {}
                    steps[stepid]['inputState'] = {}
                steps[stepid]['inputState'][param[1]] = param[2]
            state = {
                'state': {
                    'step': steps
                }
            }
            widgetState.append(state)
        cellInfo[self.KB_STATE] = widgetState
        cell['metadata'][self.KB_CELL] = cellInfo
        return cell

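    # Added note: each entry of the `params` argument above is a [step_pos, param_name, value]
    # triple, as built in create_new_narrative from the semicolon/comma-separated `appparam`
    # string; _buildAppCell groups them into 'step_<pos>' keys of the widget state.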
    def _buildMethodCell(self, pos, spec, params):
        cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4())
        cell = {
            'cell_type': 'markdown',
            'source': "<div id='" + cellId + "'></div>" + "\n<script>" +
                      "$('#" + cellId + "').kbaseNarrativeMethodCell({'method' : '" +
                      self._safeJSONStringify(spec) + "'});" + "</script>",
            'metadata': {}
        }
        cellInfo = {'method': spec, 'widget': spec['widgets']['input']}
        cellInfo[self.KB_TYPE] = self.KB_FUNCTION_CELL
        widgetState = []
        if params:
            wparams = {}
            for param in params:
                wparams[param[1]] = param[2]
            widgetState.append({'state': wparams})
        cellInfo[self.KB_STATE] = widgetState
        cell['metadata'][self.KB_CELL] = cellInfo
        return cell

    def _completeNewNarrative(self, workspaceId, objectId, importData):
        self.ws.alter_workspace_metadata({
            'wsi': {
                'id': workspaceId
            },
            'new': {
                'narrative': str(objectId),
                'is_temporary': 'true'
            }
        })
        # copy_to_narrative:
        if not importData:
            return
        objectsToCopy = [{'ref': x} for x in importData]
        infoList = self.ws.get_object_info_new({
            'objects': objectsToCopy,
            'includeMetadata': 0
        })
        for item in infoList:
            objectInfo = ServiceUtils.objectInfoToObject(item)
            self.copy_object(objectInfo['ref'], workspaceId, None, None,
                             objectInfo)

    def _safeJSONStringify(self, obj):
        return json.dumps(self._safeJSONStringifyPrepare(obj))

    def _safeJSONStringifyPrepare(self, obj):
        if isinstance(obj, basestring):
            return obj.replace("'", "&apos;").replace('"', "&quot;")
        elif isinstance(obj, list):
            for pos in range(len(obj)):
                obj[pos] = self._safeJSONStringifyPrepare(obj[pos])
        elif isinstance(obj, dict):
            obj_keys = list(obj.keys())
            for key in obj_keys:
                obj[key] = self._safeJSONStringifyPrepare(obj[key])
        else:
            pass  # it's boolean/int/float/None
        return obj

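    # Illustrative sketch (not part of the original code): _safeJSONStringify escapes quotes
    # in string values so the JSON can sit inside the single-quoted widget call built in
    # _buildAppCell/_buildMethodCell, e.g.
    #
    #   self._safeJSONStringify({'name': 'say "hi"'})
    #   # -> '{"name": "say &quot;hi&quot;"}'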
    def _get_workspace_name_or_id(self, ws_id, ws_name):
        ret = ws_name
        if not ret:
            ret = str(ws_id)
        return ret

    def copy_object(self, ref, target_ws_id, target_ws_name, target_name,
                    src_info):
        # Objects whose type is handled via DataPalettes are added to the target palette
        # instead of being physically copied.
        if (not target_ws_id) and (not target_ws_name):
            raise ValueError("Neither target workspace ID nor name is defined")
        if not src_info:
            src_info_tuple = self.ws.get_object_info_new({
                'objects': [{'ref': ref}],
                'includeMetadata': 0
            })[0]
            src_info = ServiceUtils.objectInfoToObject(src_info_tuple)
        type_name = src_info['typeModule'] + '.' + src_info['typeName']
        type_config = self.DATA_PALETTES_TYPES.get(type_name)
        if type_config is not None:
            # Copy with DataPaletteService
            if target_name:
                raise ValueError(
                    "'target_name' cannot be defined for DataPalette copy")
            target_ws_name_or_id = self._get_workspace_name_or_id(
                target_ws_id, target_ws_name)
            self.dps_cache.call_method("add_to_palette",
                                       [{
                                           'workspace': target_ws_name_or_id,
                                           'new_refs': [{
                                               'ref': ref
                                           }]
                                       }], self.token)
            return {'info': src_info}
        else:
            if not target_name:
                target_name = src_info['name']
            obj_info_tuple = self.ws.copy_object({
                'from': {
                    'ref': ref
                },
                'to': {
                    'wsid': target_ws_id,
                    'workspace': target_ws_name,
                    'name': target_name
                }
            })
            obj_info = ServiceUtils.objectInfoToObject(obj_info_tuple)
            return {'info': obj_info}

    def list_available_types(self, workspaces):
        data = self.list_objects_with_sets(workspaces=workspaces)['data']
        type_stat = {}
        for item in data:
            info = item['object_info']
            obj_type = info[2].split('-')[0]
            if obj_type in type_stat:
                type_stat[obj_type] += 1
            else:
                type_stat[obj_type] = 1
        return {'type_stat': type_stat}
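    # Illustrative sketch (not part of the original code): summarizing object types in a
    # workspace; the workspace name and the counts shown are placeholders.
    #
    #   stats = nm.list_available_types(['my_workspace'])
    #   # e.g. stats -> {'type_stat': {'KBaseGenomes.Genome': 12, 'KBaseSets.ReadsSet': 3}}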

# Example 2

class BallgownUtil:

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.deu = DifferentialExpressionUtils(self.callback_url, service_ver='dev')
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']
        self.config = config

    def _xor(self, a, b):
        return bool(a) != bool(b)

    def _validate_run_ballgown_app_params(self, params):
        """
        _validate_run_ballgown_app_params:
                validates params passed to run_ballgown_app method
        """

        log('start validating run_ballgown_app params')

        # check for required parameters
        for p in ['expressionset_ref', 'diff_expression_matrix_set_suffix',
                  'workspace_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        run_all_combinations = params.get('run_all_combinations')
        condition_pair_subset = params.get('condition_pair_subset')

        if not self._xor(run_all_combinations, condition_pair_subset):
            error_msg = "Invalid input:\nselect 'Run All Paired Condition Combinations' "
            error_msg += "or provide subset of condition pairs. Don't provide both, or neither."
            raise ValueError(error_msg)

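    # Illustrative sketch (not part of the original code): a minimal params dict that would
    # pass _validate_run_ballgown_app_params. All values below are placeholders.
    #
    #   params = {
    #       'expressionset_ref': '123/4/1',
    #       'diff_expression_matrix_set_suffix': '_DEM_set',
    #       'workspace_name': 'my_workspace',
    #       'run_all_combinations': 1,  # exactly one of this or 'condition_pair_subset'
    #   }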
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _generate_html_report(self, result_directory, params, diff_expression_matrix_set_ref):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        for file in glob.glob(os.path.join(result_directory, '*.tsv')):
            shutil.copy(file, output_directory)

        # volcano_plot exists only if there are two condition groups
        for file in glob.glob(os.path.join(result_directory, '*.png')):
            shutil.copy(file, output_directory)

        diff_expr_set = self.ws.get_objects2({'objects':
                                              [{'ref':
                                                diff_expression_matrix_set_ref[
                                                    'diffExprMatrixSet_ref']}]})['data'][0]
        diff_expr_set_data = diff_expr_set['data']
        diff_expr_set_info = diff_expr_set['info']
        diff_expr_set_name = diff_expr_set_info[1]

        overview_content = ''
        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrixSet'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><td>{} ({})'.format(diff_expr_set_name,
                                                     diff_expression_matrix_set_ref[
                                                         'diffExprMatrixSet_ref'])
        overview_content += '</td></tr></table>'

        overview_content += '<p><br/></p>'

        overview_content += '<br/><table><tr><th>Generated DifferentialExpressionMatrix'
        overview_content += ' Object</th><th></th><th></th><th></th></tr>'
        overview_content += '<tr><th>Differential Expression Matrix Name</th>'
        overview_content += '<th>Condition 1</th>'
        overview_content += '<th>Condition 2</th>'
        overview_content += '</tr>'

        for item in diff_expr_set_data['items']:
            item_diffexprmatrix_object = self.ws.get_objects2({'objects':
                                                               [{'ref': item['ref']}]})[
                'data'][0]
            item_diffexprmatrix_info = item_diffexprmatrix_object['info']
            item_diffexprmatrix_data = item_diffexprmatrix_object['data']
            diffexprmatrix_name = item_diffexprmatrix_info[1]

            overview_content += '<tr><td>{} ({})</td>'.format(diffexprmatrix_name,
                                                              item['ref'])
            condition_mapping = item_diffexprmatrix_data.get('condition_mapping')
            overview_content += '<td>{}</td>'.format(condition_mapping.keys()[0])
            overview_content += '<td>{}</td>'.format(condition_mapping.values()[0])
            overview_content += '</tr>'
        overview_content += '</table>'

        # visualization
        image_content = ''
        for image in glob.glob(output_directory + "/*.png"):
            image = image.replace(output_directory + '/', '')
            caption = image.replace('.png', '')  # directory prefix already stripped above
            image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                             'width="600" height="400"></a><a target="_blank"><br>' \
                             '<p align="center">{}</p></p>'.format(
                                 image, caption)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                report_template = report_template.replace('<p>Image Gallery</p>',
                                                          image_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Ballgown App'})
        return html_report

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'ballgown_result.zip')

        with zipfile.ZipFile(result_file, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.zip') or
                            file.endswith('.png') or
                            file.endswith('.DS_Store')):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({'path': result_file,
                             'name': os.path.basename(result_file),
                             'label': os.path.basename(result_file),
                             'description': 'File(s) generated by Ballgown App'})

        return output_files

    def _generate_report(self, params, result_directory, diff_expression_matrix_set_ref):
        """
        _generate_report: generate summary report
        """
        log('creating report')

        output_files = self._generate_output_file_list(result_directory)

        output_html_files = self._generate_html_report(
            result_directory, params, diff_expression_matrix_set_ref)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'kb_ballgown_report_' + str(uuid.uuid4())}

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def get_sample_dir_group_file(self, mapped_expression_ids, condition_labels):

        ngroups = 0
        group_name_indices = {}
        group_counts = {}

        for group in condition_labels:
            if group not in group_name_indices:
                group_name_indices[group] = ngroups
                ngroups += 1
            if group not in group_counts:
                group_counts[group] = 1
            else:
                group_counts[group] += 1

        # checks for proper ballgown execution:
        if ngroups < 2:
            raise Exception("At least two condition groups are needed for this analysis. ")
        for group in condition_labels:
            if group_counts[group] < 2:
                raise Exception(
                    "Condition group {0} has less than 2 members; ballgown will not run. "
                    "At least two condition groups are needed for this analysis. ".format(group))

        group_file_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(group_file_dir)

        try:
            condition_labels_uniqued = list(set(condition_labels))
            sgf_name = os.path.join(group_file_dir, 'sample_dir_group_file_' +
                                    condition_labels_uniqued[0] + '_' +
                                    condition_labels_uniqued[1])
            sgf = open(sgf_name, "w")
        except Exception:
            raise Exception(
                "Can't open file {0} for writing {1}".format(
                    sgf_name, traceback.format_exc()))

        index = 0  # condition label index
        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects':
                     [{'ref': expression_id}]})['data'][0]
                handle_id = expression_object['data']['file']['hid']
                expression_name = expression_object['info'][1]

                expression_dir = os.path.join(group_file_dir, expression_name)
                self._mkdir_p(expression_dir)

                print('expression_dir: ' + str(expression_dir) + ' ' +
                      str(group_name_indices[condition_labels[index]]))
                sgf.write("{0}  {1}\n".format(expression_dir,
                                              group_name_indices[condition_labels[index]]))

                self.dfu.shock_to_file({'handle_id': handle_id,
                                        'file_path': expression_dir,
                                        'unpack': 'unpack'})

                required_files = [
                    'e2t.ctab',
                    'e_data.ctab',
                    'i2t.ctab',
                    'i_data.ctab',
                    't_data.ctab']
                for file in glob.glob(expression_dir + '/*'):
                    if not os.path.basename(file) in required_files:
                        os.remove(file)

            index += 1

        return sgf_name

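    # Illustrative sketch (not part of the original code): the group file written above holds
    # one line per expression, pairing the unpacked expression directory with its numeric
    # group index, e.g. (paths are placeholders under the scratch directory):
    #
    #   <scratch>/<uuid>/WT_rep1_expression   0
    #   <scratch>/<uuid>/hy5_rep1_expression  1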
    def _cleanup(self, directory=None):
        """
        Clean up after the job.  At the moment this just means removing the working
        directory, but later could mean other things.
        """

        try:
            # ignore_errors=True suppresses removal failures (e.g. a missing directory)
            shutil.rmtree(directory, ignore_errors=True)
        except IOError:
            log("Unable to remove working directory {0}".format(directory))
            raise

    def _setupWorkingDir(self, directory=None):
        """
        Clean up an existing working directory and create a new one
        """
        try:
            if os.path.exists(directory):
                self._cleanup(directory)
            os.mkdir(directory)
        except IOError:
            log("Unable to setup working dir {0}".format(directory))
            raise

    def _check_intron_measurements(self, sample_dir_group_table_file):
        """
        Check that the intron measurement files are non-empty
        :param sample_dir_group_table_file:
        :return:
        """
        log('checking for intron level measurements... ')
        with open(sample_dir_group_table_file, 'r') as table_file:
            table_lines = table_file.readlines()
        for line in table_lines:
            expr_dir = line.split()[0]
            log(expr_dir)
            for ctab_name in ('i2t.ctab', 'i_data.ctab'):
                with open(os.path.join(expr_dir, ctab_name), 'r') as ctab_file:
                    if len(ctab_file.readlines()) <= 1:  # only the header line exists
                        raise Exception(
                            "No intron measurements found! Input expressions are possibly "
                            "from a prokaryote. Ballgown functions only on eukaryotic data."
                            " Consider using DESeq2 or CuffDiff instead of Ballgown.")

    def run_ballgown_diff_exp(self,
                              rscripts_dir,
                              sample_dir_group_table_file,
                              ballgown_output_dir,
                              output_csv,
                              volcano_plot_file
                              ):
        """ Make R call to execute the system

        :param rscripts_dir:
        :param sample_dir_group_table_file:

        :param ballgown_output_dir:
          sample_group_table is a listing of output Stringtie subdirectories,
         (full path specification) paired with group label (0 or 1), ie
            /path/WT_rep1_stringtie    0
            /path/WT_rep2_stringtie    0
            /path/EXP_rep1_stringtie   1
            /path/EXP_rep2_stringtie   1
          (order doesn't matter, but the directory-group correspondance does)

        :param output_csv:
        :param volcano_plot_file:
        :return:
        """
        # check if intron-level expression measurements are present
        self._check_intron_measurements(sample_dir_group_table_file)

        rcmd_list = ['Rscript', os.path.join(rscripts_dir, 'ballgown_fpkmgenematrix.R'),
                     '--sample_dir_group_table', sample_dir_group_table_file,
                     '--output_dir', ballgown_output_dir,
                     '--output_csvfile', output_csv,
                     '--volcano_plot_file', volcano_plot_file
                     ]
        rcmd_str = " ".join(str(x) for x in rcmd_list)
        log("rcmd_string is {0}".format(rcmd_str))
        openedprocess = subprocess.Popen(rcmd_str, shell=True)
        openedprocess.wait()
        # Make sure the openedprocess.returncode is zero (0)
        if openedprocess.returncode != 0:
            log("R script did not return normally, return code - "
                + str(openedprocess.returncode))
            raise Exception("Rscript failure")

    def load_diff_expr_matrix(self, ballgown_output_dir, output_csv):
        """
        Reads csv diff expr matrix file from Ballgown and returns as a
        dictionary of rows with the gene as key.  Each key gives a row of
        length three corresponding to fold_change, pval and qval in string form
        - can include 'NA'
        :param ballgown_output_dir
        :param output_csv:
        :return:
        """

        diff_matrix_file = os.path.join(ballgown_output_dir, output_csv)

        if not os.path.isfile(diff_matrix_file):
            raise Exception("differential expression matrix csvfile {0} doesn't exist!".format(
                diff_matrix_file))

        n = 0
        dm = {}
        with open(diff_matrix_file, "r") as csv_file:
            csv_rows = csv.reader(csv_file, delimiter="\t", quotechar='"')
            for row in csv_rows:
                n = n + 1
                if (n == 1):
                    if (row != ['id', 'fc', 'pval', 'qval']):
                        raise Exception(
                            "did not get expected column heading from {0}".format(
                                diff_matrix_file))
                else:
                    if (len(row) != 4):
                        raise Exception(
                            "did not get 4 elements in row {0} of csv file {1} ".format(
                                n, diff_matrix_file))
                    key = row[0]
                    # put in checks for NA or numeric for row[1] through 4
                    if (key in dm):
                        raise Exception(
                            "duplicate key {0} in row {1} of csv file {2} ".format(
                                key, n, diff_matrix_file))
                    dm[key] = row[1:5]

        return dm

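    # Illustrative sketch (not part of the original code): shape of the dict returned by
    # load_diff_expr_matrix, keyed by gene id with [fold_change, pval, qval] string values
    # (any of which may be 'NA'):
    #
    #   {'gene_1': ['1.27', '0.003', '0.01'],
    #    'gene_2': ['NA', 'NA', 'NA']}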
    def _transform_expression_set_data(self, expression_set_data):
        """
        Bridges KBaseSets.ExpressionSet-2.0 data to the older
        KBaseRNASeq.RNASeqExpressionSet-3.0 shape that this implementation
        depends on, by diving into the nested alignment object refs and
        extracting the required data.
        :param expression_set_data:
        :return: transformed expression_set_data
        """
        transform = dict()
        # get genome id
        expression_ref = expression_set_data['items'][0]['ref']
        wsid, objid, ver = expression_ref.split('/')
        expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        transform['genome_id'] = expression_obj[0]['data']['genome_id']

        # get sampleset_id
        #alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0]
        #wsid, objid, ver = alignment_ref.split('/')
        #alignment_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
        #transform['sampleset_id'] = alignment_obj[0]['data']['sampleset_id']

        # build mapped_expression_ids
        mapped_expression_ids = list()
        for item in expression_set_data['items']:
            expression_ref = item['ref']
            wsid, objid, ver = expression_ref.split('/')
            expression_obj = self.ws.get_objects([{'objid': objid, 'wsid': wsid}])
            alignment_ref = expression_obj[0]['data']['mapped_rnaseq_alignment'].values()[0]
            mapped_expression_ids.append({alignment_ref: expression_ref})
        transform['mapped_expression_ids'] = mapped_expression_ids

        return transform

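    # Illustrative sketch (not part of the original code): shape of the dict returned by
    # _transform_expression_set_data, matching what run_ballgown_app consumes. Refs are
    # placeholders.
    #
    #   {'genome_id': '1/2/3',
    #    'mapped_expression_ids': [
    #        {'4/5/1': '4/6/1'},   # {alignment_ref: expression_ref}
    #        {'4/7/1': '4/8/1'}]}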
    def _build_condition_label_list(self, mapped_expression_ids):
        """
        Extracts the condition labels from each expression in the given mapping
        and builds a list of condition labels
        :param mapped_expression_ids: list of {alignment_ref: expression_ref} mappings
        :return: list of condition labels whose order matches the expression order in
        the expression data
        """
        condition_labels = list()

        for ii in mapped_expression_ids:
            for alignment_id, expression_id in ii.items():
                expression_object = self.ws.get_objects2(
                    {'objects':
                     [{'ref': expression_id}]})['data'][0]
                condition_labels.append(expression_object['data']['condition'])

        return condition_labels

    def _update_output_file_header(self, output_file):
        """
        Modify header of output file (required by DifferentialExpressionUtils)
        :param output_file:
        :return:
        """
        with open(output_file, 'r') as f:
            filedata = f.read()

        modified_output = filedata.replace(
            '"id"\t"fc"\t"pval"\t"qval"',
            'gene_id\tlog2_fold_change\tp_value\tq_value')

        with open(output_file, 'w') as f:
            f.write(modified_output)

    def _check_input_labels(self, condition_pair_subset, available_condition_labels):
        """
        _check_input_labels: check input condition pairs
        """
        checked = True
        # example struct: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
        condition_values = set()
        for condition in condition_pair_subset:
            condition_values.add(condition['condition'])

        if len(condition_values) < 2:
            error_msg = 'At least two unique conditions must be specified. '
            raise ValueError(error_msg)

        for condition in condition_pair_subset:

            label = condition['condition'].strip()
            if label not in available_condition_labels:
                error_msg = 'Condition label "{}" is not a valid condition. '.format(label)
                error_msg += 'Must be one of "{}"'.format(available_condition_labels)
                raise ValueError(error_msg)

        return checked

    def run_ballgown_app(self, params):
        """
        run_ballgown_app: run Ballgown app
        (https://www.bioconductor.org/packages/release/bioc/html/ballgown.html)

        required params:
            expressionset_ref: ExpressionSet object reference
            diff_expression_matrix_set_suffix: suffix appended to the
            KBaseSets.DifferentialExpressionMatrixSet name
            condition_labels: conditions for expression set object
            alpha_cutoff: q value cutoff
            fold_change_cutoff: fold change cutoff
            workspace_name: the name of the workspace it gets saved to

        optional params:
            fold_scale_type: one of ["linear", "log2+1", "log10+1"]

        return:
            result_directory: folder path that holds all files generated by run_ballgown_app
            diff_expression_matrix_set_ref: generated KBaseSets.DifferentialExpressionMatrixSet
            object reference
            report_name: report name generated by KBaseReport
            report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning BallgownUtil.run_ballgown_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_ballgown_app_params(params)

        expressionset_ref = params.get('expressionset_ref')

        expression_set_info = self.ws.get_object_info3({
            "objects": [{"ref": expressionset_ref}]})['infos'][0]
        expression_object_type = expression_set_info[2]

        # set output object name
        differential_expression_suffix = params['diff_expression_matrix_set_suffix']
        expression_name = expression_set_info[1]
        if re.match('.*_[Ee]xpression_[Ss]et$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression_[Ss]et$', differential_expression_suffix, expression_name)
        elif re.match('.*_[Ee]xpression$', expression_name):
            params['diff_expression_matrix_set_name'] = re.sub(
                '_[Ee]xpression$', differential_expression_suffix, expression_name)
        else:
            params['diff_expression_matrix_set_name'] = expression_name + \
                differential_expression_suffix

        log('--->\nexpression object type: \n' +
            '{}'.format(expression_object_type))

        if re.match(r'KBaseRNASeq.RNASeqExpressionSet-\d.\d', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

        elif re.match(r'KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            expression_set_data = self.ws.get_objects2(
                {'objects':
                 [{'ref': expressionset_ref}]})['data'][0]['data']

            expression_set_data = self._transform_expression_set_data(expression_set_data)

        mgroup = MultiGroup(self.ws)
        pairwise_mapped_expression_ids = mgroup.build_pairwise_groups(
            expression_set_data['mapped_expression_ids'])

        ballgown_output_dir = os.path.join(self.scratch, "ballgown_out")
        log("ballgown output dir is {0}".format(ballgown_output_dir))
        self._setupWorkingDir(ballgown_output_dir)

        # get set of all condition labels
        available_condition_labels = \
            self._build_condition_label_list(expression_set_data['mapped_expression_ids'])

        if params.get('run_all_combinations'):
            requested_condition_labels = available_condition_labels
        else:
            # get set of user specified condition labels
            condition_pair_subset = params.get('condition_pair_subset')
            if self._check_input_labels(condition_pair_subset, available_condition_labels):
                requested_condition_labels = list()
                # example: [{u'condition': u'hy5'}, {u'condition': u'WT'}]
                for condition in condition_pair_subset:
                    if condition.get('condition').strip() not in requested_condition_labels:
                        requested_condition_labels.append(condition.get('condition').strip())

        log("User requested pairwise combinations from condition label list : " +
            str(requested_condition_labels))

        diff_expr_files = list()

        for mapped_expression_ids in pairwise_mapped_expression_ids:
            print('processing pairwise combination: ')
            pprint(mapped_expression_ids)
            print('with condition labels: ')
            condition_labels = self._build_condition_label_list(mapped_expression_ids)
            pprint(condition_labels)

            # skip if condition labels in this pairwise combination don't exist in
            # set of user requested condition labels
            skip = False
            for condition in condition_labels:
                if condition not in requested_condition_labels:
                    log("skipping " + str(condition_labels))
                    skip = True
            if skip:
                continue

            sample_dir_group_file = self.get_sample_dir_group_file(mapped_expression_ids,
                                                                   condition_labels)

            log("about to run_ballgown_diff_exp")
            rscripts_dir = '/kb/module/rscripts'

            condition_labels_uniqued = list()
            for condition in condition_labels:
                if condition not in condition_labels_uniqued:
                    condition_labels_uniqued.append(condition)

            output_csv = 'ballgown_diffexp_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.tsv'
            volcano_plot_file = 'volcano_plot_' + \
                condition_labels_uniqued[0] + '_' + condition_labels_uniqued[1] + '.png'

            self.run_ballgown_diff_exp(rscripts_dir,
                                       sample_dir_group_file,
                                       ballgown_output_dir,
                                       output_csv,
                                       volcano_plot_file)

            log("back from run_ballgown_diff_exp, about to load diff exp matrix file")
            # diff_expr_matrix = self.load_diff_expr_matrix(ballgown_output_dir,
            # output_csv)  # read file before its zipped

            self._update_output_file_header(os.path.join(ballgown_output_dir, output_csv))

            diff_expr_file = dict()
            diff_expr_file.update({'condition_mapping':
                                   {condition_labels_uniqued[0]: condition_labels_uniqued[1]}})
            diff_expr_file.update(
                {'diffexpr_filepath': os.path.join(ballgown_output_dir, output_csv)})
            diff_expr_files.append(diff_expr_file)

        deu_param = {
            'destination_ref': params['workspace_name'] + '/' +
            params['diff_expression_matrix_set_name'],
            'diffexpr_data': diff_expr_files,
            'tool_used': TOOL_NAME,
            'tool_version': TOOL_VERSION,
            'genome_ref': expression_set_data.get('genome_id'),
        }

        diff_expression_matrix_set_ref = self.deu.save_differential_expression_matrix_set(
            deu_param)

        returnVal = {'result_directory': ballgown_output_dir,
                     'diff_expression_matrix_set_ref':
                         diff_expression_matrix_set_ref['diffExprMatrixSet_ref']}

        report_output = self._generate_report(params,
                                              ballgown_output_dir, diff_expression_matrix_set_ref)
        returnVal.update(report_output)

        return returnVal

# Example 3

class MutualInfoUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_flux_mutual_information_analysis_params(self, params):
        """
        _validate_run_flux_mutual_information_analysis_params:
                validates params passed to run_flux_mutual_information_analysis method
        """

        log('start validating run_flux_mutual_information_analysis params')

        # check for required parameters
        for p in ['fbamodel_id', 'compounds', 'media_id', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _get_file_from_ws(self, workspace, obj_name):
        try:
            file_path = self.ws.get_objects([{
                'name': obj_name,
                'workspace': workspace
            }])[0]
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' +
                             workspace + '/' + obj_name + ')' + str(e))
        return file_path

    def _make_media_files(self, ws_name, base, compounds):
        """
        Build and store media objects for each combination of compounds added to the base media.
        :param ws_name: the workspace to save the new media objects in
        :param base: the name of the base media object
        :param compounds: the set of compounds to test
        :return: a list of media ids and a matrix with each media combination defined
        """
        base_media = self._get_file_from_ws(ws_name, base)['data']

        media_ids = [base_media['id']]
        new_media_list = []
        media_matrix = [[""] + compounds]
        media_matrix.append([base_media['id']] + [0] * len(compounds))
        for n_comp in range(1, len(compounds) + 1):
            for combo in combinations(compounds, n_comp):
                new_media_id = base_media['id'] + '_v%s' % len(media_matrix)
                media_ids.append(new_media_id)
                media_matrix.append(
                    [new_media_id] +
                    [1 if comp in combo else 0 for comp in compounds])
                new_media = deepcopy(base_media)
                new_media['id'] = new_media_id
                new_media['name'] = new_media_id
                for new_comp in combo:
                    new_media['mediacompounds'].append({
                        'compound_ref': '48/1/1/compounds/id/%s' % new_comp.split('_')[0],
                        'concentration': 1.0,
                        'maxFlux': 1000,
                        'minFlux': -1000
                    })
                new_media_list.append(new_media)

        print("Made %s Media Files" % len(media_ids) - 1)
        info = self.ws.save_objects({
            'workspace':
            ws_name,
            "objects": [{
                "type": "KBaseBiochem.Media",
                "data": media,
                "name": media['name']
            } for media in new_media_list]
        })
        print(info)
        return media_ids, media_matrix

    def _generate_html_report(self, result_directory, mutual_info_dict):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'mutual_information_report.html')

        shutil.copy(os.path.join(result_directory, 'MI_plot.png'),
                    os.path.join(output_directory, 'MI_plot.png'))

        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'
        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Mutual Information App'
        })

        return html_report

    def _generate_report(self, result_directory, mutual_info_dict, params):
        """
        _generate_report: generate summary report
        """

        log('creating report')
        output_html_files = self._generate_html_report(result_directory,
                                                       mutual_info_dict)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name':
            'MutualInformation_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_mutual_info(self, media_matrix, fba_file):
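        # Assumes media_matrix arrives as a pandas DataFrame whose first column
        # holds media ids and whose remaining 0/1 columns mark which compounds
        # each medium contains, and that fba_file is a CSV with one flux column
        # per FBA/medium.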

        df1 = pd.read_csv(fba_file)

        #----Input validation of Media/FBAs with Binary Matrix FBAs------
        # 1.0 Number of rows in Media.csv file = (Number of columns - 1)
        #   1.0. If they are different: throw an ERROR about a mismatched number of FBAs in the media and binary matrix.
        # 1.1 Check whether the elements in Media.csv file contain only binary values (i.e. 0 and 1)
        #   1.1. If the elements are different: throw an ERROR about inappropriate input values
        # 1.2 Check whether the compounds in Media.csv file match the number of FBAs
        #   1.2. If the compounds differ from the number of FBAs: throw an ERROR about inappropriate input values

        s_df1 = df1.shape
        s_df2 = media_matrix.shape

        Temp_df2 = np.array(media_matrix.values)
        # Create matrix with only the elements remove first column and all the rows
        Temp_df2 = Temp_df2[0:, 1:]

        Bin_val_check = np.array_equal(Temp_df2, Temp_df2.astype(bool))
        num_compounds = (s_df2[1]) - 1

        if ((s_df1[1] - 1) != s_df2[0]) or (not Bin_val_check) or (
                int(math.log(s_df2[0], 2)) != num_compounds):
            raise ValueError('invalid input values')

        #-----All possible combinations of the chemical compounds----------------------
        # 2.0 Separating m0 from the rest of the labels

        Temp1_df2 = media_matrix

        cols = Temp1_df2.columns
        for i in range(1, len(cols)):
            Temp1_df2.loc[Temp1_df2[cols[i]] == 1, cols[i]] = cols[i]

        print(Temp1_df2)

        # 2.1 Creating a dictionary for all FBAs except m0
        print(len(Temp1_df2))
        mydict = {}
        for x in range(0, len(Temp1_df2)):
            for i in range(1, s_df2[1]):
                currentvalue = Temp1_df2.iloc[x, i]
                currentid = Temp1_df2.iloc[x, 0]
                mydict.setdefault(currentid, [])
                if currentvalue > 0:
                    mydict[currentid].append(currentvalue)

        # Add the first key as m0
        media_0_name = 'm0'
        mydict[media_0_name] = "['0']"
        #Sort the keys
        mydict = collections.OrderedDict(natsort.natsorted(mydict.items()))
        print(mydict)

        for k, v in mydict.iteritems():
            print(k, v)

        # List of all compound combinations
        my_combi_list = []
        Compounds_Combi = list(range(1, num_compounds + 1))
        for L in range(0, len(Compounds_Combi) + 1):
            for subset in itertools.combinations(Compounds_Combi, L):
                my_combi_list.append(list(subset))
        print(my_combi_list)

        # Create a dictionary where the keys are
        # compound combinations and the values are
        # the corresponding FBA ids in df2
        result_dict = {}
        for element in my_combi_list[1:]:
            for k, v in mydict.iteritems():
                if set(v).issubset(set(map(lambda x: str(x), element))):
                    key = ','.join(map(lambda x: str(x), element))
                    if result_dict.get(key):
                        media_list = result_dict[key]
                        media_list.append(k)
                        media_list = list(set(media_list))
                        result_dict.update({key: media_list})
                    else:
                        result_dict.update({key: [media_0_name, k]})
        print(result_dict)

        # Create a dictionary where the keys are
        # compound combinations and the values are
        # the corresponding FBA columns from df1
        All_Comp_Combi_dic = {}
        for column, value in result_dict.items():
            All_Comp_Combi_dic.update({column: df1.get(value)})

        # Build a DataFrame of (combination, FBA values) pairs from All_Comp_Combi_dic
        df = (pd.DataFrame(All_Comp_Combi_dic.items()))

        #print df[0]
        #print df[1][7]

        MI_dict = {}
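        # For each compound combination below, identical FBA flux columns are
        # grouped together; the sizes of those duplicate groups drive the
        # conditional entropy term of the mutual information.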
        for k in range(0, len(df[0])):
            drop_rows_df = df[1][k].drop_duplicates(keep="first")
            drop_columns_df = drop_rows_df.T.drop_duplicates(keep="first").T
            remove = []
            removed = {}
            cols = df[1][k].columns
            for i in range(len(cols) - 1):
                duplicated = []
                v = df[1][k][cols[i]].values
                for j in range(i + 1, len(cols)):
                    if np.array_equal(v, df[1][k][cols[j]].values):
                        remove.append(cols[j])
                        duplicated.append(cols[j])
                if duplicated and cols[i] not in remove:
                    removed.update({cols[i]: duplicated})
                count = {}
                for key, value in removed.items():
                    count.update({key: len(value)})

                #print v

                # print drop_columns_df
                values = count.values()
                # print values
                values = map(lambda x: x + 1, values)
                # print values
                d = {x: values.count(x) for x in values}

                #-------Mutual Information (MI) calculation-------------
                FBAs = len(df[1][k].columns)
                pure_entropy = math.log(FBAs, 2)
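                # With N equally likely FBAs, H(FBA) = log2(N). Each group of
                # `key` identical columns (occurring d[key] times) contributes
                # d[key] * (key / N) * log2(key) to the conditional entropy, so
                # MI = log2(N) - conditional_entropy; with no duplicate columns
                # the conditional entropy is 0 and MI = log2(N).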
                #print pure_entropy

                # If No duplicates exist and list "value" is empty
                if not values:
                    #print("List is empty")
                    No_duplicate_FBAs = len(drop_columns_df.columns)
                    conditional_entropy = -1 * (No_duplicate_FBAs * (
                        (1 / No_duplicate_FBAs) *
                        ((1 / 1) * math.log(1.0 / 1.0, 2))))
                    Mutual_Info = pure_entropy - conditional_entropy
                    #print('Mutual Info:', Mutual_Info)

                if values:
                    # If duplicates exist and list "value" is not empty
                    conditional_entropy = 0
                    for key in d:
                        #print key, d[key]
                        Temp = -1 * d[key] * (key / float(FBAs)) * key * (
                            1.0 / key) * math.log(1.0 / key, 2)
                        conditional_entropy = Temp + conditional_entropy
                    #print "%3f" %Temp
                    Mutual_Info = pure_entropy - conditional_entropy

                MI_dict[df[0][k]] = Mutual_Info

        #Sorted MI_dict
        MI_dict = sorted(MI_dict.items(), key=lambda x: (-len(x[0]), x[0]))
        MI_dict = OrderedDict(MI_dict)

        print("Plot MI_dict")
        plt.bar(range(len(MI_dict)),
                MI_dict.values(),
                align='center',
                alpha=0.5,
                width=0.7)
        plt.xticks(range(len(MI_dict)), MI_dict.keys(), rotation='vertical')
        plt.xlabel('Compound Combinations')
        plt.ylabel('Mutual Information (in Bits)')
        plt.title("Organism:XYZ")
        fig1 = plt.gcf()
        fig1.savefig(os.path.join(self.scratch, 'MI_plot.png'), dpi=100)

        return MI_dict
Example #4
class FunctionalEnrichmentUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_fe1_params(self, params):
        """
        _validate_run_fe1_params:
                validates params passed to run_fe1 method
        """

        log('start validating run_fe1 params')

        # check for required parameters
        for p in ['feature_set_ref', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _generate_report(self, enrichment_map, result_directory,
                         workspace_name, feature_id_go_id_list_map,
                         feature_set_ids, genome_ref, go_id_parent_ids_map,
                         feature_ids):
        """
        _generate_report: generate summary report
        """

        log('start creating report')

        output_files = self._generate_output_file_list(
            result_directory, enrichment_map, feature_id_go_id_list_map,
            feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids)

        output_html_files = self._generate_html_report(result_directory,
                                                       enrichment_map)

        report_object_name = 'kb_functional_enrichment_1_report_' + str(
            uuid.uuid4())
        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': report_object_name
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_supporting_files(self, result_directory, enrichment_map,
                                   feature_id_go_id_list_map, feature_set_ids,
                                   genome_ref, go_id_parent_ids_map,
                                   feature_ids):
        """
        _generate_supporting_files: generate various debug files
        """
        supporting_files = list()

        feature_id_go_ids_map_file = os.path.join(result_directory,
                                                  'feature_id_go_ids_map.txt')
        go_id_genome_feature_ids_map_file = os.path.join(
            result_directory, 'go_id_genome_feature_ids_map.txt')
        go_id_set_feature_ids_map_file = os.path.join(
            result_directory, 'go_id_feature_set_feature_ids_map.txt')
        feature_ids_file = os.path.join(result_directory, 'feature_ids.txt')
        feature_set_ids_file = os.path.join(result_directory,
                                            'feature_set_ids.txt')
        fisher_variables_file = os.path.join(result_directory,
                                             'fisher_variables.txt')
        genome_info_file = os.path.join(result_directory, 'genome_info.txt')
        go_id_parent_ids_map_file = os.path.join(result_directory,
                                                 'go_id_parent_ids_map.txt')

        supporting_files.append(feature_id_go_ids_map_file)
        supporting_files.append(go_id_genome_feature_ids_map_file)
        supporting_files.append(feature_ids_file)
        supporting_files.append(feature_set_ids_file)
        supporting_files.append(fisher_variables_file)
        supporting_files.append(genome_info_file)
        supporting_files.append(go_id_parent_ids_map_file)
        supporting_files.append(go_id_set_feature_ids_map_file)

        total_feature_ids = feature_id_go_id_list_map.keys()
        feature_ids_with_feature = []
        for feature_id, go_ids in feature_id_go_id_list_map.iteritems():
            if isinstance(go_ids, list):
                feature_ids_with_feature.append(feature_id)
        genome_name = self.ws.get_object_info3(
            {'objects': [{
                'ref': genome_ref
            }]})['infos'][0][1]

        with open(go_id_parent_ids_map_file,
                  'wb') as go_id_parent_ids_map_file:
            for go_id, parent_ids in go_id_parent_ids_map.iteritems():
                go_id_parent_ids_map_file.write('{}: {}\n'.format(
                    go_id, ', '.join(parent_ids)))

        with open(genome_info_file, 'wb') as genome_info_file:
            genome_info_file.write('genome_name: {}\n'.format(genome_name))
            genome_info_file.write('features: {}\n'.format(
                len(total_feature_ids)))
            genome_info_file.write('features with term: {}'.format(
                len(feature_ids_with_feature)))

        with open(feature_set_ids_file, 'wb') as feature_set_ids_file:
            feature_set_ids_file.write('\n'.join(feature_set_ids))

        with open(feature_id_go_ids_map_file,
                  'wb') as feature_id_go_ids_map_file:
            with open(feature_ids_file, 'wb') as feature_ids_file:
                for feature_id, go_ids in feature_id_go_id_list_map.iteritems(
                ):
                    feature_ids_file.write('{} {}\n'.format(
                        feature_id, feature_id in feature_set_ids))
                    if isinstance(go_ids, str):
                        feature_id_go_ids_map_file.write('{} {}\n'.format(
                            feature_id, go_ids))
                    else:
                        feature_id_go_ids_map_file.write('{} {}\n'.format(
                            feature_id, ', '.join(go_ids)))

        with open(go_id_genome_feature_ids_map_file,
                  'wb') as go_id_genome_feature_ids_map_file:
            with open(go_id_set_feature_ids_map_file,
                      'wb') as go_id_set_feature_ids_map_file:
                with open(fisher_variables_file,
                          'wb') as fisher_variables_file:
                    for go_id, go_info in enrichment_map.iteritems():
                        mapped_features = go_info.get('mapped_features')
                        fs_mapped_features = list(
                            set(mapped_features).intersection(feature_set_ids))
                        mapped_features_line = '{}: {}\n'.format(
                            go_id, ', '.join(mapped_features))
                        go_id_genome_feature_ids_map_file.write(
                            mapped_features_line)

                        set_mapped_features_line = '{}: {}\n'.format(
                            go_id, ', '.join(fs_mapped_features))
                        go_id_set_feature_ids_map_file.write(
                            set_mapped_features_line)
                        a_value = go_info.get('num_in_subset_feature_set')
                        b_value = len(feature_set_ids) - a_value
                        c_value = len(mapped_features) - a_value
                        d_value = len(feature_ids) - len(
                            feature_set_ids) - c_value
                        p_value = go_info.get('raw_p_value')
                        fisher_variables_file.write(
                            '{} a:{} b:{} c:{} d:{} '.format(
                                go_id, a_value, b_value, c_value, d_value))
                        fisher_variables_file.write(
                            'p_value:{}\n'.format(p_value))

        result_file = os.path.join(result_directory, 'supporting_files.zip')
        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for supporting_file in supporting_files:
                zip_file.write(supporting_file,
                               os.path.basename(supporting_file))

        return [{
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'GO term functional enrichment supporting files'
        }]

    def _generate_output_file_list(self, result_directory, enrichment_map,
                                   feature_id_go_id_list_map, feature_set_ids,
                                   genome_ref, go_id_parent_ids_map,
                                   feature_ids):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """

        log('start packing result files')
        output_files = list()

        result_file = os.path.join(result_directory,
                                   'functional_enrichment.csv')
        with open(result_file, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow([
                'term_id', 'term', 'ontology', 'num_in_feature_set',
                'num_in_ref_genome', 'raw_p_value', 'adjusted_p_value'
            ])
            for key, value in enrichment_map.iteritems():
                writer.writerow([
                    key, value['go_term'], value['namespace'],
                    value['num_in_subset_feature_set'],
                    value['num_in_ref_genome'], value['raw_p_value'],
                    value['adjusted_p_value']
                ])

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'GO term functional enrichment'
        })

        supporting_files = self._generate_supporting_files(
            result_directory, enrichment_map, feature_id_go_id_list_map,
            feature_set_ids, genome_ref, go_id_parent_ids_map, feature_ids)
        output_files += supporting_files

        return output_files

    def _generate_html_report(self, result_directory, enrichment_map):
        """
        _generate_html_report: generate html summary report
        """

        log('start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        enrichment_table = ''
        data = csv.DictReader(open(
            os.path.join(result_directory, 'functional_enrichment.csv')),
                              delimiter=',')
        sortedlist = sorted(
            data,
            key=lambda row:
            (float(row['adjusted_p_value']), float(row['raw_p_value']),
             float(row['num_in_ref_genome'])),
            reverse=False)

        for row in sortedlist:
            # if row['num_in_feature_set'] != '0':
            enrichment_table += '<tr><td>{}</td>'.format(row['term_id'])
            enrichment_table += '<td>{}</td>'.format(row['term'])
            enrichment_table += '<td>{}</td>'.format(row['ontology'])
            enrichment_table += '<td>{}</td>'.format(row['num_in_feature_set'])
            enrichment_table += '<td>{}</td>'.format(row['num_in_ref_genome'])
            enrichment_table += '<td>{}</td>'.format(row['raw_p_value'])
            enrichment_table += '<td>{}</td></tr>'.format(
                row['adjusted_p_value'])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<tr>Enrichment_Table</tr>', enrichment_table)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Functional Enrichment App'
        })
        return html_report

    def _get_go_maps_from_genome(self, genome_ref):
        """
        _get_go_maps_from_genome: parse GO term mappings from genome features
        """

        log('start parsing GO terms from genome')

        feature_num = self.gsu.search({'ref': genome_ref})['num_found']

        genome_features = self.gsu.search({
            'ref': genome_ref,
            'limit': feature_num,
            'sort_by': [['feature_id', True]]
        })['features']

        feature_id_go_id_list_map = {}
        go_id_feature_id_list_map = {}
        go_id_go_term_map = {}
        feature_id_feature_info_map = {}

        for genome_feature in genome_features:
            feature_id = genome_feature.get('feature_id')
            feature_func = genome_feature.get('function')
            feature_type = genome_feature.get('feature_type')
            ontology_terms = genome_feature.get('ontology_terms')

            feature_id_feature_info_map.update({
                feature_id: {
                    'function': feature_func,
                    'feature_type': feature_type
                }
            })

            go_id_list = []
            if ontology_terms:
                for ontology_id, ontology_term in ontology_terms.iteritems():
                    if re.match(r'[gG][oO]:.*', ontology_id):
                        go_id_go_term_map.update({ontology_id: ontology_term})
                        go_id_list.append(ontology_id)

            if go_id_list:
                feature_id_go_id_list_map.update({feature_id: go_id_list})

                for go_id in go_id_list:
                    if go_id in go_id_feature_id_list_map:
                        feature_ids = go_id_feature_id_list_map.get(go_id)
                        feature_ids.append(feature_id)
                        go_id_feature_id_list_map.update({go_id: feature_ids})
                    else:
                        go_id_feature_id_list_map.update({go_id: [feature_id]})
            else:
                feature_id_go_id_list_map.update({feature_id: 'Unlabeled'})

        return (feature_id_go_id_list_map, go_id_feature_id_list_map,
                go_id_go_term_map, feature_id_feature_info_map)

    def _process_feature_set(self, feature_set_ref):
        """
        _process_feature_set: process FeatureSet object

        return:
        genome_ref: reference Genome object ref
        feature_set_ids: FeatureSet feature ids
        """

        log('start processing FeatureSet object')

        feature_set_data = self.ws.get_objects2(
            {'objects': [{
                'ref': feature_set_ref
            }]})['data'][0]['data']
        feature_elements = feature_set_data['elements']
        feature_set_ids = []
        genome_ref_array = []
        for feature_id, genome_refs in feature_elements.iteritems():
            feature_set_ids.append(feature_id)
            genome_ref_array += genome_refs

        if len(set(genome_ref_array)) > 1:
            error_msg = 'FeatureSet has multiple reference Genomes: {}'.format(
                genome_ref_array)
            raise ValueError(error_msg)

        return feature_set_ids, genome_ref_array[0]

    def _get_immediate_parents(self, ontology_hash, go_id, is_a_relationship,
                               regulates_relationship, part_of_relationship):
        """
        _get_immediate_parents: get the immediate parent go_ids for a given go_id
        """
        parent_ids = []
        ontology_info = ontology_hash.get(go_id, {})

        if is_a_relationship:
            is_a_parents = ontology_info.get('is_a')
            if is_a_parents:
                for parent_string in is_a_parents:
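                    # is_a entries appear to look like 'GO:0008150 ! biological_process';
                    # keep the ID portion and drop the trailing space before the '!'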
                    is_a_parent_id = parent_string.split('!')[0][:-1]
                    parent_ids.append(is_a_parent_id)

        if regulates_relationship:
            relationship = ontology_info.get('relationship')
            if relationship:
                for relationship_string in relationship:
                    if relationship_string.split(' ')[0] == 'regulates':
                        parent_ids.append(relationship_string.split(' ')[1])

        if part_of_relationship:
            relationship = ontology_info.get('relationship')
            if relationship:
                for relationship_string in relationship:
                    if relationship_string.split(' ')[0] == 'part_of':
                        parent_ids.append(relationship_string.split(' ')[1])

        return parent_ids

    def _fetch_all_parents_go_ids(self, ontology_hash, go_id,
                                  is_a_relationship, regulates_relationship,
                                  part_of_relationship):
        """
        _fetch_all_parents_go_ids: recursively fetch all parent go_ids
        """

        parent_ids = self._get_immediate_parents(ontology_hash, go_id,
                                                 is_a_relationship,
                                                 regulates_relationship,
                                                 part_of_relationship)
        if parent_ids:
            # copy so the loop below does not also iterate over the ids it appends
            grand_parent_ids = list(parent_ids)
            for parent_id in parent_ids:
                grand_parent_ids += self._fetch_all_parents_go_ids(
                    ontology_hash, parent_id, is_a_relationship,
                    regulates_relationship, part_of_relationship)[parent_id]
            return {go_id: list(set(grand_parent_ids))}
        else:
            return {go_id: []}

    def _generate_parent_child_map(self,
                                   ontology_hash,
                                   go_ids,
                                   is_a_relationship=True,
                                   regulates_relationship=True,
                                   part_of_relationship=False):
        """
        _generate_parent_child_map: fetch parent go_ids for the given go_ids
        """

        log('start fetching parent go_ids')
        start = time.time()

        go_id_parent_ids_map = {}

        for go_id in go_ids:
            fetch_result = self._fetch_all_parents_go_ids(
                ontology_hash, go_id, is_a_relationship,
                regulates_relationship, part_of_relationship)

            go_id_parent_ids_map.update(fetch_result)

        end = time.time()
        print('used {:.2f} s'.format(end - start))

        return go_id_parent_ids_map

    def _round(self, number, digits=3):
        """
        format the number to the given number of significant digits (returned as a string)
        """

        round_number = format(number, '.{}g'.format(digits))

        return round_number

    def __init__(self, config):
        self.ws_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.gsu = GenomeSearchUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

    def run_fe1(self, params):
        """
        run_fe1: Functional Enrichment One

        required params:
        feature_set_ref: FeatureSet object reference
        workspace_name: the name of the workspace it gets saved to

        optional params:
        propagation: includes is_a relationship to all go terms (default is 1)
        filter_ref_features: filter reference genome features with no go terms (default is 0)
        statistical_significance: parameter for statistical significance.
                                  Select one from left_tailed, right_tailed or two_tailed
                                  (default is left_tailed)
        ignore_go_term_not_in_feature_set: ignore Go term analysis if term is not associated with
                                           FeatureSet (default is 1)

        return:
        result_directory: folder path that holds all files generated by run_fe1
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """
        log('--->\nrunning FunctionalEnrichmentUtil.run_fe1\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))
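        # A minimal params sketch (example values are hypothetical):
        # {
        #     'feature_set_ref': '1234/5/6',
        #     'workspace_name': 'my_workspace',
        #     'statistical_significance': 'left_tailed'
        # }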

        self._validate_run_fe1_params(params)
        propagation = params.get('propagation', True)
        filter_ref_features = params.get('filter_ref_features', False)
        statistical_significance = params.get('statistical_significance',
                                              'left_tailed')
        ignore_go_term_not_in_feature_set = params.get(
            'ignore_go_term_not_in_feature_set', True)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        feature_set_ids, genome_ref = self._process_feature_set(
            params.get('feature_set_ref'))

        (feature_id_go_id_list_map, go_id_feature_id_list_map,
         go_id_go_term_map, feature_id_feature_info_map
         ) = self._get_go_maps_from_genome(genome_ref)

        if filter_ref_features:
            log('start filtering features with no GO term')
            feature_ids = []
            for feature_id, go_ids in feature_id_go_id_list_map.iteritems():
                if isinstance(go_ids, list):
                    feature_ids.append(feature_id)
        else:
            feature_ids = feature_id_go_id_list_map.keys()

        ontology_hash = dict()
        ontologies = self.ws.get_objects([{
            'workspace': 'KBaseOntology',
            'name': 'gene_ontology'
        }, {
            'workspace': 'KBaseOntology',
            'name': 'plant_ontology'
        }])
        ontology_hash.update(ontologies[0]['data']['term_hash'])
        ontology_hash.update(ontologies[1]['data']['term_hash'])

        if propagation:
            go_id_parent_ids_map = self._generate_parent_child_map(
                ontology_hash,
                go_id_go_term_map.keys(),
                regulates_relationship=False)
        else:
            go_id_parent_ids_map = {}
            for go_id in go_id_go_term_map.keys():
                go_id_parent_ids_map.update({go_id: []})

        log('including parents to feature id map')
        for go_id, parent_ids in go_id_parent_ids_map.iteritems():
            mapped_features = go_id_feature_id_list_map.get(go_id)

            for parent_id in parent_ids:
                parent_mapped_features = go_id_feature_id_list_map.get(
                    parent_id)

                if not parent_mapped_features:
                    parent_mapped_features = []

                if mapped_features:
                    parent_mapped_features += mapped_features

                go_id_feature_id_list_map.update(
                    {parent_id: list(set(parent_mapped_features))})

        log('start calculating p-values')
        enrichment_map = {}
        go_info_map = {}
        all_raw_p_value = []
        pos = 0
        for go_id, go_term in go_id_go_term_map.iteritems():
            mapped_features = go_id_feature_id_list_map.get(go_id)
            # in feature_set matches go_id
            a = len(set(mapped_features).intersection(feature_set_ids))
            # ignore go term analysis if not associated with FeatureSet
            if ignore_go_term_not_in_feature_set and a == 0:
                continue
            # in feature_set doesn't match go_id
            b = len(feature_set_ids) - a
            # not in feature_set matches go_id
            c = len(mapped_features) - a
            # not in feature_set doesn't match go_id
            d = len(feature_ids) - len(feature_set_ids) - c
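            # 2x2 contingency table behind fisher.pvalue(a, b, c, d):
            #                       has go_id    lacks go_id
            #   in FeatureSet           a             b
            #   rest of genome          c             d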

            fisher_value = fisher.pvalue(a, b, c, d)
            if statistical_significance == 'left_tailed':
                raw_p_value = self._round(fisher_value.left_tail)
            elif statistical_significance == 'right_tailed':
                raw_p_value = self._round(fisher_value.right_tail)
            elif statistical_significance == 'two_tailed':
                raw_p_value = self._round(fisher_value.two_tail)
            else:
                raise ValueError('Improper statistical_significance value')

            all_raw_p_value.append(raw_p_value)
            go_info_map.update({
                go_id: {
                    'raw_p_value': raw_p_value,
                    'num_in_ref_genome': len(mapped_features),
                    'num_in_subset_feature_set': a,
                    'pos': pos,
                    'mapped_features': mapped_features
                }
            })
            pos += 1

        stats = importr('stats')
        adjusted_p_values = stats.p_adjust(FloatVector(all_raw_p_value),
                                           method='fdr')
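        # method='fdr' is R p.adjust's alias for the Benjamini-Hochberg correction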

        for go_id, go_info in go_info_map.iteritems():
            if go_id not in ontology_hash:
                continue

            adjusted_p_value = self._round(
                adjusted_p_values[go_info.get('pos')])
            namespace = ontology_hash[go_id]['namespace']
            enrichment_map.update({
                go_id: {
                    'raw_p_value':
                    go_info.get('raw_p_value'),
                    'adjusted_p_value':
                    adjusted_p_value,
                    'num_in_ref_genome':
                    go_info.get('num_in_ref_genome'),
                    'num_in_subset_feature_set':
                    go_info.get('num_in_subset_feature_set'),
                    'go_term':
                    go_id_go_term_map.get(go_id),
                    'namespace':
                    namespace.split("_")[1][0].upper(),
                    'mapped_features':
                    go_info.get('mapped_features')
                }
            })

        returnVal = {'result_directory': result_directory}
        report_output = self._generate_report(enrichment_map, result_directory,
                                              params.get('workspace_name'),
                                              feature_id_go_id_list_map,
                                              feature_set_ids, genome_ref,
                                              go_id_parent_ids_map,
                                              feature_ids)

        returnVal.update(report_output)

        return returnVal