def test_to_signed_graph():
    """Check the signed graph produced by ``IndraNet.to_signed_graph``.

    The eight a->b / b->c statements are assembled into an IndraNet and
    flattened with the default sign dict. The result must have three nodes
    and four signed edges, each edge carrying the expected statement types
    plus truthy, float-typed ``belief`` and ``weight`` attributes.
    """
    ia = IndraNetAssembler([ab1, ab2, ab3, ab4, bc1, bc2, bc3, bc4])
    df = ia.make_df()
    net = IndraNet.from_df(df)
    signed_graph = net.to_signed_graph(sign_dict=default_sign_dict,
                                       weight_mapping=_weight_mapping)
    assert len(signed_graph.nodes) == 3
    assert len(signed_graph.edges) == 4

    def _stmt_types(source, target, sign):
        # Collect the statement types stored on one signed edge
        return {stmt['stmt_type']
                for stmt in signed_graph[source][target][sign]['statements']}

    assert _stmt_types('a', 'b', 0) == {'Activation', 'IncreaseAmount'}
    assert _stmt_types('a', 'b', 1) == {'Inhibition'}
    assert _stmt_types('b', 'c', 0) == {'Activation', 'IncreaseAmount'}
    assert _stmt_types('b', 'c', 1) == {'Inhibition', 'DecreaseAmount'}

    # np.longdouble is the type np.longfloat used to alias; the alias itself
    # was removed in NumPy 2.0, so refer to the canonical name.
    float_types = (float, np.longdouble)
    for attr in ('belief', 'weight'):
        # The attribute must be present and non-zero on every edge ...
        assert all(signed_graph.edges[e].get(attr, False)
                   for e in signed_graph.edges)
        # ... and must be a (possibly extended precision) float
        assert all(
            isinstance(signed_graph.edges[e][attr], float_types)
            for e in signed_graph.edges)
def test_to_digraph():
    """Check the flattened digraph produced by ``IndraNet.to_digraph``.

    The multi-edge IndraNet built from the eight a->b / b->c statements must
    collapse into a two-edge digraph whose edges carry the merged statements
    and truthy, float-typed ``belief`` and ``weight`` attributes. The result
    must also be isomorphic to the graph built directly from the dataframe.
    """
    ia = IndraNetAssembler([ab1, ab2, ab3, ab4, bc1, bc2, bc3, bc4])
    df = ia.make_df()
    net = IndraNet.from_df(df)
    assert len(net.nodes) == 3
    assert len(net.edges) == 8

    digraph = net.to_digraph(weight_mapping=_weight_mapping)
    assert len(digraph.nodes) == 3
    assert len(digraph.edges) == 2
    assert {
        stmt['stmt_type'] for stmt in digraph['a']['b']['statements']
    } == {'Activation', 'Phosphorylation', 'Inhibition', 'IncreaseAmount'}

    # np.longdouble is the type np.longfloat used to alias; the alias itself
    # was removed in NumPy 2.0, so refer to the canonical name.
    float_types = (float, np.longdouble)
    for attr in ('belief', 'weight'):
        # The attribute must be present and non-zero on every edge ...
        assert all(digraph.edges[e].get(attr, False) for e in digraph.edges)
        # ... and must be a (possibly extended precision) float
        assert all(
            isinstance(digraph.edges[e][attr], float_types)
            for e in digraph.edges)

    digraph_from_df = IndraNet.digraph_from_df(df)
    assert nx.is_isomorphic(digraph, digraph_from_df)
# Example #3
# 0
def test_from_df():
    """Check nodes, edges and attributes of an IndraNet built from a df."""
    assembler = IndraNetAssembler([st1, st2, st3, st4, st5, st6, st7])
    graph = IndraNet.from_df(assembler.make_df())

    # Overall size of the assembled network
    assert len(graph.nodes) == 6
    assert len(graph.edges) == 9

    # A statement with a single agent must not contribute a node
    assert 'e' not in graph.nodes

    # Complex with more than 3 agents should not be added
    assert ('f', 'g', 0) in graph.edges
    assert ('h', 'i', 0) not in graph.edges

    # Node attributes
    node_a = graph.nodes['a']
    assert graph.nodes['a']['ns'] == 'HGNC', graph.nodes['a']['ns']
    assert node_a['id'] == '1'

    # Edge attributes
    edge_ac = graph['a']['c'][0]
    assert edge_ac['stmt_type'] == 'Inhibition'
    assert edge_ac['belief'] == 0.76
    assert edge_ac['evidence_count'] == 3
    assert graph['b']['d'][0]['evidence_count'] == 0
# Example #4
# 0
def sif_dump_df_to_digraph(df: Union[pd.DataFrame, str],
                           date: str,
                           mesh_id_dict: Optional[Dict] = None,
                           graph_type: GraphTypes = 'digraph',
                           include_entity_hierarchies: bool = True,
                           sign_dict: Optional[Dict[str, int]] = None,
                           stmt_types: Optional[List[str]] = None,
                           z_sc_path: Optional[Union[str, pd.DataFrame]] = None,
                           verbosity: int = 0) \
        -> Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]:
    """Return a NetworkX graph from a pandas dataframe of a db dump

    Parameters
    ----------
    df : Union[str, pd.DataFrame]
        A dataframe, either as a file path to a file (.pkl or .csv) or a
        pandas DataFrame object.
    date : str
        A date string specifying when the data was dumped from the database.
        If empty, today's date (YYYY-MM-DD) is used instead.
    mesh_id_dict : dict
        A dict object mapping statement hashes to all mesh ids sharing a
        common PMID
    graph_type : str
        Return type for the returned graph (case insensitive). Currently
        supports:
            - 'digraph': DiGraph (Default)
            - 'multidigraph': MultiDiGraph
            - 'signed': Tuple[MultiDiGraph, DiGraph]
            - 'signed-expanded': Tuple[MultiDiGraph, DiGraph]
            - 'digraph-signed-types':  DiGraph
    include_entity_hierarchies : bool
        If True, add edges between nodes if they are related ontologically
        with stmt type 'fplx': e.g. BRCA1 is in the BRCA family, so an edge
        is added between the nodes BRCA and BRCA1. Default: True. Note that
        this option only is available for the options directed/unsigned graph
        and multidigraph.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        A list of statement types to expand out to other signs
    z_sc_path:
        If provided, must be or be path to a square dataframe with HGNC symbols
        as names on the axes and floats as entries. File paths must end in
        'h5' or 'pkl'.
    verbosity: int
        Output various messages if > 0. For all messages, set to 4.

    Returns
    -------
    Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]
        The type is determined by the graph_type argument. For the signed
        graph types the tuple is (signed edge graph, signed node graph).

    Raises
    ------
    ValueError
        If graph_type is not one of the supported options, or if z_sc_path
        is neither a data frame nor a path ending in 'h5' or 'pkl'.
    """
    graph_options = ('digraph', 'multidigraph', 'signed', 'signed-expanded',
                     'digraph-signed-types')
    if graph_type.lower() not in graph_options:
        raise ValueError(f'Graph type {graph_type} not supported. Can only '
                         f'chose between {graph_options}')
    sign_dict = sign_dict if sign_dict else default_sign_dict

    graph_type = graph_type.lower()
    date = date if date else datetime.now().strftime('%Y-%m-%d')

    if isinstance(df, str):
        sif_df = file_opener(df)
    else:
        sif_df = df

    # Resolve the optional z-score data frame
    if z_sc_path is None:
        z_sc_df = None
    elif isinstance(z_sc_path, str):
        if z_sc_path.endswith('h5'):
            z_sc_loader = pd.read_hdf
        elif z_sc_path.endswith('pkl'):
            z_sc_loader = file_opener
        else:
            raise ValueError(f'Unrecognized file: {z_sc_path}')
        logger.info(f'Loading z-scores from {z_sc_path}')
        z_sc_df = z_sc_loader(z_sc_path)
    elif isinstance(z_sc_path, pd.DataFrame):
        z_sc_df = z_sc_path
    else:
        raise ValueError('Only file paths and data frames allowed as '
                         'arguments to z_sc_path')

    # If signed types: filter out rows that are of unsigned types
    if graph_type == 'digraph-signed-types':
        sif_df = sif_df[sif_df.stmt_type.isin(sign_dict.keys())]

    sif_df = sif_dump_df_merger(sif_df,
                                graph_type,
                                sign_dict,
                                stmt_types,
                                mesh_id_dict,
                                verbosity=verbosity)

    # Map ns:id to node name
    logger.info('Creating dictionary mapping (ns,id) to node name')
    ns_id_name_tups = set(zip(
        sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
            set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    # Map hashes to edge for non-signed graphs
    if graph_type in {'multidigraph', 'digraph', 'digraph-signed-types'}:
        logger.info('Creating dictionary mapping hashes to edges for '
                    'unsigned graph')
        hash_edge_dict = {
            h: (a, b)
            for a, b, h in zip(sif_df.agA_name, sif_df.agB_name,
                               sif_df.stmt_hash)
        }

    # Create graph from df
    if graph_type == 'multidigraph':
        indranet_graph = IndraNet.from_df(sif_df)
    elif graph_type in ('digraph', 'digraph-signed-types'):
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)
    elif graph_type in ('signed', 'signed-expanded'):
        signed_edge_graph: MultiDiGraph = IndraNet.signed_from_df(
            df=sif_df,
            flattening_method='complementary_belief',
            weight_mapping=_weight_mapping)
        signed_node_graph: DiGraph = signed_edges_to_signed_nodes(
            graph=signed_edge_graph, copy_edge_data=True)
        signed_edge_graph.graph['date'] = date
        signed_node_graph.graph['date'] = date
        signed_edge_graph.graph['node_by_ns_id'] = ns_id_to_nodename
        signed_node_graph.graph['node_by_ns_id'] = ns_id_to_nodename

        # Get hash to signed edge mapping. For 'signed' each hash maps to a
        # single edge; for 'signed-expanded' a hash maps to a set of edges.
        logger.info('Creating dictionary mapping hashes to edges for '
                    'signed graph')
        single_edge = graph_type == 'signed'
        signed_edge_graph.graph['edge_by_hash'] = \
            _edges_by_stmt_hash(signed_edge_graph, single_edge)
        signed_node_graph.graph['edge_by_hash'] = \
            _edges_by_stmt_hash(signed_node_graph, single_edge)

        if z_sc_df is not None:
            # Set z-score attributes
            add_corr_to_edges(graph=signed_edge_graph, z_corr=z_sc_df)
            add_corr_to_edges(graph=signed_node_graph, z_corr=z_sc_df)

        # Signed graphs never get hierarchy edges, so return right away
        return signed_edge_graph, signed_node_graph
    else:
        # Defensive only: graph_type was already validated above
        raise ValueError(f'Unrecognized graph type {graph_type}. Must be one '
                         f'of: {", ".join(graph_options)}')

    if z_sc_df is not None:
        # Set z-score attributes
        add_corr_to_edges(graph=indranet_graph, z_corr=z_sc_df)

    # Add hierarchy relations to graph (not applicable for signed graphs)
    if include_entity_hierarchies and graph_type in ('multidigraph',
                                                     'digraph'):
        from depmap_analysis.network_functions.famplex_functions import \
            get_all_entities
        logger.info('Fetching entity hierarchy relationships')
        full_entity_list = get_all_entities()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {uri: _id for (ns, _id, uri) in full_entity_list}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to be added to data frame')
        entities = 0
        non_corr_weight = None
        if z_sc_df is not None:
            # Get non-corr weight: reuse the corr_weight of any existing edge
            # whose z-score is zero for the new hierarchy edges
            for edge in indranet_graph.edges:
                if indranet_graph.edges[edge]['z_score'] == 0:
                    non_corr_weight = indranet_graph.edges[edge]['corr_weight']
                    break
            assert non_corr_weight is not None
            z_sc_attrs = {'z_score': 0, 'corr_weight': non_corr_weight}
        else:
            z_sc_attrs = {}

        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node

            # Add famplex edge
            for pns, pid in bio_ontology.get_parents(ns, _id):
                puri = get_identifiers_url(pns, pid)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode

                # Skip child/parent pairs that were already added
                if (node, pnode, puri) in added_pairs:
                    continue
                entities += 1
                added_pairs.add((node, pnode, puri))  # A, B, uri of B

                # Statement-level data of the hierarchy relation.
                # NOTE(review): 'z_score' and 'corr_weight' are hard coded
                # here regardless of z_sc_attrs - confirm this is intended.
                stmt_data = {
                    'stmt_type': 'fplx',
                    'evidence_count': 1,
                    'source_counts': {
                        'fplx': 1
                    },
                    'stmt_hash': puri,
                    'belief': 1.0,
                    'weight': MIN_WEIGHT,
                    'curated': True,
                    'english': f'{pns}:{pid} is an ontological parent '
                    f'of {ns}:{_id}',
                    'z_score': 0,
                    'corr_weight': 1
                }

                # Add non-existing nodes
                if node not in indranet_graph.nodes:
                    indranet_graph.add_node(node, ns=ns, id=_id)
                if pnode not in indranet_graph.nodes:
                    indranet_graph.add_node(pnode, ns=pns, id=pid)

                # Add edges
                if indranet_graph.is_multigraph():
                    # MultiDiGraph: one edge per statement, with the agent
                    # names stored as edge attributes as well
                    indranet_graph.add_edge(node, pnode,
                                            agA_name=node,
                                            agB_name=pnode,
                                            **stmt_data)
                elif indranet_graph.has_edge(node, pnode):
                    # DiGraph, existing edge: only extend its statements
                    indranet_graph.edges[(node, pnode)]['statements'].append(
                        stmt_data)
                else:
                    # DiGraph, new edge
                    indranet_graph.add_edge(node,
                                            pnode,
                                            belief=1.0,
                                            weight=1.0,
                                            statements=[stmt_data],
                                            **z_sc_attrs)

        logger.info('Loaded %d entity relations into dataframe', entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    indranet_graph.graph['edge_by_hash'] = hash_edge_dict
    indranet_graph.graph['date'] = date
    return indranet_graph


def _edges_by_stmt_hash(graph, single_edge):
    """Map the statement hashes stored on the edges of graph to its edges

    Parameters
    ----------
    graph : DiGraph or MultiDiGraph
        A graph whose edges carry a 'statements' list of dicts that each
        have a 'stmt_hash' entry.
    single_edge : bool
        If True, map each hash to one edge (the last one seen). Otherwise
        map each hash to the set of all edges it occurs on.

    Returns
    -------
    dict
        A plain dict if single_edge is True, else a defaultdict(set).
    """
    edge_by_hash = {} if single_edge else defaultdict(set)
    for edge in graph.edges:
        for stmt_data in graph.edges[edge]['statements']:
            if single_edge:
                edge_by_hash[stmt_data['stmt_hash']] = edge
            else:
                edge_by_hash[stmt_data['stmt_hash']].add(edge)
    return edge_by_hash
def sif_dump_df_to_nx_digraph(df,
                              strat_ev_dict,
                              belief_dict,
                              multi=False,
                              include_entity_hierarchies=True,
                              verbosity=0):
    """Return a NetworkX digraph from a pandas dataframe of a db dump

    The dataframe is extended with a belief score, the stratified source
    counts, a 'curated' flag and an edge weight per row before the graph is
    created. A DataFrame passed in as df is not modified.

    Parameters
    ----------
    df : str|pd.DataFrame
        A dataframe, either as a file path to a pickle or a pandas
        DataFrame object. A column named 'hash' is treated as 'stmt_hash'.
    strat_ev_dict : str|dict
        A dict (or the file path to a pickled dict) keyed by statement
        hashes containing the stratified evidence count per statement
    belief_dict : str|dict
        A dict (or the file path to a pickled dict) that is keyed by
        statement hashes corresponding to the statement hashes loaded in df
    multi : bool
        Default: False; Return an nx.MultiDiGraph if True, otherwise
        return an nx.DiGraph
    include_entity_hierarchies : bool
        Default: True. If True, add 'fplx' edges from entities to their
        ontological parents.
    verbosity: int
        Output various messages if > 0. For all messages, set to 4

    Returns
    -------
    indranet_graph : nx.DiGraph or nx.MultiDiGraph
        By default an nx.DiGraph is returned. By setting multi=True,
        an nx.MultiDiGraph is returned instead."""
    readers = {'medscan', 'rlimsp', 'trips', 'reach', 'sparser', 'isi'}

    def _curated_func(ev_dict):
        """Return True if any source in ev_dict is not a known reader.

        Rows without source counts come out of the left merge as NaN (a
        float); they are treated as not curated instead of raising.
        """
        if not ev_dict or isinstance(ev_dict, float):
            return False
        return not all(s.lower() in readers for s in ev_dict)

    def _weight_from_belief(belief):
        """Map belief score 'belief' to weight. If the calculation goes below
        precision, return longdouble precision instead to avoid making the
        weight zero."""
        # np.longdouble is the type np.longfloat used to alias; the alias
        # itself was removed in NumPy 2.0.
        return np.max([NP_PRECISION, -np.log(belief, dtype=np.longdouble)])

    def _weight_mapping(G):
        """Mapping function for adding the weight of the flattened edges

        Parameters
        ----------
        G : IndraNet
            Incoming graph

        Returns
        -------
        G : IndraNet
        """
        for edge in G.edges:
            G.edges[edge]['weight'] = \
                _weight_from_belief(G.edges[edge]['belief'])
        return G

    # NOTE(review): pickle_open presumably unpickles the given file - only
    # pass paths to trusted files.
    if isinstance(df, str):
        sif_df = pickle_open(df)
    else:
        sif_df = df

    if 'hash' in sif_df.columns:
        # Rename on a copy so a DataFrame owned by the caller is not mutated
        sif_df = sif_df.rename(columns={'hash': 'stmt_hash'})

    if isinstance(belief_dict, str):
        belief_dict = pickle_open(belief_dict)

    if isinstance(strat_ev_dict, str):
        sed = pickle_open(strat_ev_dict)
    else:
        sed = strat_ev_dict

    # Extend df with these columns:
    #   belief score from provided dict
    #   stratified evidence count by source
    # Extend df with famplex rows
    # 'stmt_hash' must exist as column in the input dataframe for merge to work
    # Preserve all rows in sif_df, so do left join:
    # sif_df.merge(other, how='left', on='stmt_hash')
    belief_df = pd.DataFrame(data={
        'stmt_hash': list(belief_dict.keys()),
        'belief': list(belief_dict.values())
    })
    sif_df = sif_df.merge(right=belief_df, how='left', on='stmt_hash')
    # Check for missing hashes
    missing_belief = sif_df['belief'].isna()
    if missing_belief.sum() > 0:
        logger.warning('%d rows with missing belief score found',
                       missing_belief.sum())
        if verbosity > 1:
            logger.info('Missing hashes in belief dict: %s',
                        list(sif_df['stmt_hash'][missing_belief]))
        logger.info('Setting missing belief scores to 1/evidence count')

    source_counts_df = pd.DataFrame(data={
        'stmt_hash': list(sed.keys()),
        'source_counts': list(sed.values())
    })
    sif_df = sif_df.merge(right=source_counts_df, how='left', on='stmt_hash')
    # Check for missing hashes
    missing_evidence = sif_df['source_counts'].isna()
    if missing_evidence.sum() > 0:
        logger.warning('%d rows with missing evidence found',
                       missing_evidence.sum())
        if verbosity > 1:
            logger.info('Missing hashes in stratified evidence dict: %s',
                        list(sif_df['stmt_hash'][missing_evidence]))

    # Map ns:id to node name
    logger.info('Creating dictionary with mapping from (ns,id) to node name')
    ns_id_name_tups = set(zip(
        sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
            set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    logger.info('Setting "curated" flag')
    # Map to boolean 'curated' for reader/non-reader
    sif_df['curated'] = sif_df['source_counts'].apply(func=_curated_func)

    logger.info('Setting edge weights')
    # Add weight: -log(belief) or 1/evidence count if no belief. Each
    # mapping is only applied to the rows it is assigned to.
    has_belief = sif_df['belief'].notna()
    has_no_belief = sif_df['belief'].isna()
    sif_df['weight'] = 0
    if has_belief.sum() > 0:
        sif_df.loc[has_belief, 'weight'] = \
            sif_df.loc[has_belief, 'belief'].apply(func=_weight_from_belief)
    if has_no_belief.sum() > 0:
        sif_df.loc[has_no_belief, 'weight'] = \
            sif_df.loc[has_no_belief, 'evidence_count'].apply(
                func=lambda ec: 1 / np.longdouble(ec))

    # Create graph from df
    if multi:
        indranet_graph = IndraNet.from_df(sif_df)
    else:
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)

    # Add hierarchy relations to graph
    if include_entity_hierarchies:
        logger.info('Fetching entity hierarchy relationsships')
        full_entity_list = fplx_fcns.get_all_entities()
        ehm = hm.hierarchies['entity']
        ehm.initialize()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to be added to data frame')
        entities = 0
        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node
            node_by_uri[uri] = node

            # Add famplex edge
            for puri in ehm.get_parents(uri):
                pns, pid = ehm.ns_id_from_uri(puri)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode
                node_by_uri[puri] = pnode

                # Skip child/parent pairs that were already added
                if (node, pnode, puri) in added_pairs:
                    continue
                entities += 1
                added_pairs.add((node, pnode, puri))  # A, B, uri of B

                # Statement-level data of the hierarchy relation
                stmt_data = {
                    'stmt_type': 'fplx',
                    'evidence_count': 1,
                    'source_counts': {
                        'fplx': 1
                    },
                    'stmt_hash': puri,
                    'belief': 1.0,
                    'weight': NP_PRECISION,
                    'curated': True
                }

                # Add non-existing nodes
                if node not in indranet_graph.nodes:
                    indranet_graph.add_node(node, ns=ns, id=_id)
                if pnode not in indranet_graph.nodes:
                    indranet_graph.add_node(pnode, ns=pns, id=pid)

                # Add edges
                if indranet_graph.is_multigraph():
                    # MultiDiGraph: one edge per statement, with the agent
                    # names stored as edge attributes as well
                    indranet_graph.add_edge(node, pnode,
                                            agA_name=node,
                                            agB_name=pnode,
                                            **stmt_data)
                elif indranet_graph.has_edge(node, pnode):
                    # DiGraph, existing edge: only extend its statements
                    indranet_graph.edges[(node, pnode)]['statements'].append(
                        stmt_data)
                else:
                    # DiGraph, new edge
                    indranet_graph.add_edge(node,
                                            pnode,
                                            belief=1.0,
                                            weight=1.0,
                                            statements=[stmt_data])

        logger.info('Loaded %d entity relations into dataframe', entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    return indranet_graph