Example #1
    def load_reactome(
        self,
    ) -> Tuple[Dict[str, List[str]], Dict[str, List[str]], Dict[str, str]]:
        """Load and return the reactome data used in script

        The loaded data is expected to be a tuple or list of dicts. The
        first dict is expected to contain mappings from UP IDs of genes to
        Reactome pathway IDs. The second dict is expected to contain the
        reverse mapping (i.e. Reactome IDs to UP IDs). The third dict is
        expected to contain mappings from the Reactome IDs to their
        descriptions.

        Returns
        -------
        Tuple[Dict[str, List[str]], Dict[str, List[str]], Dict[str, str]]
        """
        if self.reactome_filepath is not None:
            reactome = file_opener(self.reactome_filepath)
            assert isinstance(reactome, (tuple, list)), (
                f"{self.reactome_filepath} does not seem to contain tuple "
                f"of (upid - pathway mapping, pathway - upid mapping, "
                f"pathway id - pathway description).")
        else:
            raise FileNotFoundError(
                "No reactome file location seems to be present in script settings."
            )

        return reactome
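A minimal sketch of the expected tuple structure and how it can be used once loaded. The dicts below are illustrative stand-ins, not real data; the IDs are the ones exercised in the tests further down.

up2path = {'A0A075B6P5': ['R-HSA-2871837'], 'A5LHX3': ['R-HSA-2871837']}
path2up = {'R-HSA-2871837': ['A0A075B6P5', 'A5LHX3']}
path2descr = {'R-HSA-2871837': 'FCERI mediated NF-kB activation'}
reactome = (up2path, path2up, path2descr)

# Pathways shared by two UniProt IDs, translated to their descriptions
shared = set(up2path['A0A075B6P5']) & set(up2path['A5LHX3'])
print([path2descr[p] for p in shared])  # ['FCERI mediated NF-kB activation']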
Example #2
def _loop_explainers(expl_path: str):
    # Store explainer data by their graph type
    expl_by_type = {'pybel': [], 'signed': [], 'unsigned': []}
    for explainer_file in tqdm(get_dir_iter(expl_path, '.pkl')):
        expl: DepMapExplainer = file_opener(explainer_file)
        expl_data = _get_expl_data(expl)
        expl_by_type[expl.script_settings['graph_type']].append(expl_data)

    return expl_by_type
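An illustrative call (the directory path is hypothetical); the returned dict maps each of the three graph types to a list of per-explainer summaries produced by _get_expl_data.

expl_by_type = _loop_explainers('/data/depmap/explainer_pickles')
print(len(expl_by_type['unsigned']), 'unsigned explainers summarized')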
Example #3
def _get_raw(raw: PathObj):
    if isinstance(raw, str):
        logger.info(f'Reading raw DepMap data from {raw}')
        # Read the raw data file into a DataFrame
        raw_df = io.file_opener(raw, index_col=0)
    elif isinstance(raw, pd.DataFrame):
        raw_df = raw
    else:
        raw_df = None
    return raw_df
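Illustrative calls covering the three branches (the file name is a placeholder for a raw DepMap CSV):

df_from_path = _get_raw('Achilles_gene_effect.csv')    # read via io.file_opener
df_passthrough = _get_raw(pd.DataFrame({'A': [1.0]}))  # returned unchanged
nothing = _get_raw(42)                                 # neither str nor DataFrame -> None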
Example #4
    def load_graph(self) -> Union[nx.DiGraph, nx.MultiDiGraph]:
        """Load and return the graph used in script

        Returns
        -------
        Union[nx.DiGraph, nx.MultiDiGraph]
        """
        graph = file_opener(self.graph_filepath)
        assert isinstance(graph, (nx.DiGraph, nx.MultiDiGraph))

        return graph
Example #5
def drugs_to_corr_matrix(raw_file: str, info_file: str):
    """Preprocess and create a correlation matrix from raw drug data

    Parameters
    ----------
    raw_file : str
        Path to DepMap PRISM drug repurposing data file. Should match
        primary-screen-replicate-collapsed-logfold-change.csv
    info_file : str
        Path to DepMap PRISM drug repurposing info file. Should match
        primary-screen-replicate-collapsed-treatment-info.csv
    """
    def _get_drug_name(drug_id):
        drug_rec = info_df.loc[drug_id]
        return drug_rec['name']

    raw_df: pd.DataFrame = io.file_opener(raw_file, index_col=0)
    info_df: pd.DataFrame = io.file_opener(info_file, index_col=0)
    col_names = [_get_drug_name(did) for did in raw_df.columns]
    raw_df.columns = col_names

    return raw_depmap_to_corr(raw_df)
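A hypothetical invocation; the file names follow the DepMap PRISM release naming referenced in the docstring.

drug_corr = drugs_to_corr_matrix(
    'primary-screen-replicate-collapsed-logfold-change.csv',
    'primary-screen-replicate-collapsed-treatment-info.csv')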
Example #6
def test_reactome_expl():
    up2path, _, pathid2pathname = file_opener(
        's3://depmap-analysis/misc_files/reactome_pathways.pkl')
    reactome_dict = {'uniprot_mapping': up2path,
                     'pathid_name_mapping': pathid2pathname}

    react_func: Callable = expl_functions[react_funcname]
    up1 = 'A0A075B6P5'
    up2 = 'A5LHX3'
    res = {'R-HSA-2871837'}
    descr = ['FCERI mediated NF-kB activation']
    assert res == set(up2path[up1]) & set(up2path[up2])

    hgnc_id1 = reverse_uniprot[up1]
    hgnc_name1 = hgnc_names[hgnc_id1]
    hgnc_id2 = reverse_uniprot[up2]
    hgnc_name2 = hgnc_names[hgnc_id2]
    func_args = (hgnc_name1, hgnc_name2, 0.0, nx.DiGraph(), 'unsigned',
                 reactome_dict)
    s, o, explained, data = react_func(*func_args)
    assert explained
    assert data == descr, str(data or 'None returned')
Example #7
    parser.add_argument('--z-corr',
                        type=file_path('h5'),
                        required=True,
                        help='The path to the stored correlation matrix as '
                        'a pandas DataFrame')
    parser.add_argument('--reactome',
                        type=file_path('pkl'),
                        required=True,
                        help='The reactome pickle')
    args = parser.parse_args()
    z_sc_file = Path(args.z_corr)
    reactome_file = Path(args.reactome)
    sd_ranges = [('rnd', None), (2, 3), (3, 4), (4, 5), (5, None)]

    # Only need first dict
    reactome_mapping = file_opener(reactome_file)[0]

    # Load corr matrix
    z_sc_full = pd.read_hdf(z_sc_file)
    assert isinstance(z_sc_full, pd.DataFrame)

    all_stats = {
        'range': [],
        'checked': [],
        'has_pathways': [],
        'has_pathways_norm': []
    }
    data_frames = {}

    for ll, ul in sd_ranges:
        # Filter matrix
Example #8
    indexer = count(0)
    processed_explainers = []
    for explainer_file in input_iter:
        logger.info(
            f'> > > > '
            f'Processing {explainer_file} '
            f'{datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")} (UTC)'
            f' < < < <'
        )
        explainer_out = _joinpath(
            output_dir, str(explainer_file).split('/')[-1].split('.')[0]
        )
        logger.info(f'Saving output to {explainer_out}')
        if not dry:
            # Load pickle
            explainer = file_opener(explainer_file)
            try:
                assert isinstance(explainer, DepMapExplainer)
            except AssertionError:
                logger.warning(f'File {explainer_file} is not '
                               f'DepMapExplainer, skipping...')
                continue
            # Backwards compatibility: check if s3_location attribute
            # exists, otherwise set it and then re-upload. If attribute
            # exists but is not set, set it and re-upload
            try:
                if not explainer.s3_location:
                    _save(fpath=explainer_file, expl_inst=explainer)
            except AttributeError:
                _save(fpath=explainer_file, expl_inst=explainer)
Example #9
    def get_corr_stats_axb(
        self,
        z_corr: Optional[Union[str, pd.DataFrame]] = None,
        max_proc: Optional[int] = None,
        max_so_pairs_size: int = 10000,
        mp_pairs: bool = True,
        run_linear: bool = False,
    ) -> Results:
        """Get statistics of the correlations from different explanation types

        Note: the provided options have no effect if the data is loaded
        from cache.

        Parameters
        ----------
        z_corr : Optional[Union[pd.DataFrame, str]]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If a str is provided, it
            is interpreted as a local file path to the z-score matrix.
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing
            in get_corr_stats_mp. Default: multiprocessing.cpu_count()
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10 000. If the
            number of pairs to check is smaller than 10 000, no sampling is
            done.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.

        Returns
        -------
        Results
            A BaseModel containing correlation data for different explanations
        """
        if not self.corr_stats_axb:
            s3 = get_s3_client(unsigned=False)
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                if S3Path.from_string(corr_stats_loc).exists(s3):
                    logger.info(f"Found corr stats data at {corr_stats_loc}")
                    corr_stats_json = file_opener(corr_stats_loc)
                    self.corr_stats_axb = Results(**corr_stats_json)
                else:
                    logger.info(f"No corr stats data at found at "
                                f"{corr_stats_loc}")
            except ValueError as ve:
                # Raised when s3 location is not set
                logger.warning(ve)

            # If not found on s3 or ValueError was raised
            if not self.corr_stats_axb:
                logger.info("Generating corr stats data")
                # Load correlation matrix
                if z_corr is None:
                    z_corr = self.load_z_corr()
                if isinstance(z_corr, str):
                    z_corr = self.load_z_corr(local_file_path=z_corr)
                # Load reactome if present
                try:
                    reactome = self.load_reactome()
                except FileNotFoundError:
                    logger.info("No reactome file used in script")
                    reactome = None
                self.corr_stats_axb: Results = axb_stats(
                    self.expl_df,
                    self.stats_df,
                    z_corr=z_corr,
                    reactome=reactome,
                    eval_str=False,
                    max_proc=max_proc,
                    max_corr_pairs=max_so_pairs_size,
                    do_mp_pairs=mp_pairs,
                    run_linear=run_linear,
                )
                try:
                    corr_stats_loc = self.get_s3_corr_stats_path()
                    logger.info(f"Uploading corr stats to S3 at "
                                f"{corr_stats_loc}")
                    s3p_loc = S3Path.from_string(corr_stats_loc)
                    s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                    logger.info("Finished uploading corr stats to S3")
                except ValueError:
                    logger.warning("Unable to upload corr stats to S3")
        else:
            logger.info("Data already present in corr_stats_axb")
        return self.corr_stats_axb
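A hypothetical call, assuming `explainer` is a loaded DepMapExplainer; the z-score matrix can be passed either as a DataFrame or as a local file path, and is ignored if cached results are found on S3.

results = explainer.get_corr_stats_axb(
    z_corr='/path/to/z_corr.h5',  # placeholder path
    max_so_pairs_size=10000,
    mp_pairs=True,
    run_linear=False)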
Example #10
def main(indra_net: str,
         z_score: str,
         outname: str,
         graph_type: str,
         sd_range: Tuple[float, Union[None, float]],
         random: bool = False,
         raw_data: Optional[List[str]] = None,
         raw_corr: Optional[List[str]] = None,
         expl_funcs: Optional[List[str]] = None,
         pb_node_mapping: Optional[Dict[str, Set]] = None,
         n_chunks: Optional[int] = 256,
         is_a_part_of: Optional[List[str]] = None,
         immediate_only: Optional[bool] = False,
         return_unexplained: Optional[bool] = False,
         reactome_path: Optional[str] = None,
         subset_list: Optional[List[Union[str, int]]] = None,
         apriori_explained: Optional[Union[bool, str]] = False,
         allowed_ns: Optional[List[str]] = None,
         allowed_sources: Optional[List[str]] = None,
         info: Optional[Dict[Hashable, Any]] = None,
         indra_date: Optional[str] = None,
         depmap_date: Optional[str] = None,
         sample_size: Optional[int] = None,
         shuffle: Optional[bool] = False,
         overwrite: Optional[bool] = False,
         normalize_names: Optional[bool] = False,
         argparse_dict: Optional[Dict[str, Union[str, float, int,
                                                 List[str]]]] = None):
    """Set up correlation matching of depmap data with an indranet graph

    Parameters
    ----------
    indra_net : str
        A file path to the graph representation of the indra network. Each
        edge in the loaded graph should have an attribute named 'statements'
        containing a list of sources supporting that edge. For signed
        searches, the graph is expected to be an nx.MultiDiGraph with edges
        keyed by (gene, gene, sign) tuples.
    outname : str
        A file path (can be an S3 url) to where to store the final pickle
        file containing the DepmapExplainer
    graph_type : str
        The graph type of the graph used for the explanations. Can be one of
        'unsigned', 'signed', 'pybel'.
    sd_range : Tuple[float, Union[float, None]]
        A tuple of the lower and optionally the upper bound of the z-score
        range to use when getting correlations
    random : bool
        Whether to do random sampling or not. If True, take a random sample
        instead of restricting the correlations to the given SD range.
    z_score : Union[pd.DataFrame, str]
        The path to the correlation DataFrame. If either raw data or raw
        corr are used, this filepath will be used to save the resulting
        DataFrame instead.
    raw_data : Optional[List[str]]
        File paths to CRISPR raw data and RNAi raw data from the DepMap Portal
    raw_corr : Optional[List[str]]
        File paths to raw correlation data (before z-score conversion)
        containing hdf compressed correlation data. These files contain the
        result of running `raw_df.corr()`.
    expl_funcs : Optional[List[str]]
        Provide a list of explanation functions to apply. Default: All
        functions are applied. Currently available functions:
        - 'expl_ab': Explain pair by checking for an edge between a and b
        - 'expl_ba': Explain pair by checking for an edge between b and a
        - 'expl_axb': Explain pair by looking for intermediate nodes
          connecting a to b
        - 'expl_bxa': Explain pair by looking for intermediate nodes
          connecting b to a
        - 'get_sr': Explain pair by finding common upstream nodes
        - 'get_st': Explain pair by finding common downstream nodes
        - 'get_sd': Explain pair by finding common downstream nodes two
          edges from s and o
        - 'find_cp': Explain pair by looking for ontological parents
        - 'apriori_explained': Map entities to a-priori explanations
        - 'common_reactome_paths': Explain pair by matching common reactome
          pathways
    pb_node_mapping : Optional[Union[Dict, Set[Any]]]
        If graph type is "pybel", use this argument to provide a mapping
        from HGNC symbols to pybel nodes in the pybel model
    n_chunks : Optional[int]
        How many chunks to split the data into in the multiprocessing part
        of the script
    is_a_part_of : Optional[Iterable]
        A set of identifiers to look for when applying the common parent
        explanation between a pair of correlating nodes.
    immediate_only : Optional[bool]
        Only look for immediate parents. This option might limit the number
        of results that are returned. Default: False.
    return_unexplained : Optional[bool]
        If True: return explanation data even if there is no set
        intersection of nodes up- or downstream of A, B for shared
        regulators and shared targets. Default: False.
    reactome_path : Optional[str]
        File path to reactome data.
    subset_list : Optional[List[Union[str, int]]]
        Provide a list of entities that defines a subset of the entities in
        the correlation data frame that will be picked as 'a' when the pairs
        (a, b) are generated
    apriori_explained : Optional[str]
        A mapping from entity names to a string containing a short
        explanation of why the entity is explained. To use the default
        MitoCarta 3.0 file, run the following code:
        >>> from depmap_analysis.scripts.depmap_script2 import mito_file
        >>> from depmap_analysis.preprocessing import get_mitocarta_info
        >>> apriori_mapping = get_mitocarta_info(mito_file)
        then pass `apriori_mapping` as `apriori_explained` when calling this
        function:
        >>> main(apriori_explained=apriori_mapping, ...)
    allowed_ns : Optional[List[str]]
        A list of allowed name spaces for explanations involving
        intermediary nodes. Default: Any namespace.
    allowed_sources : Optional[List[str]]
        The allowed sources for edges. This will not affect subsequent edges
        in explanations involving 2 or more edges. Default: all sources are
        allowed.
    info : Optional[Dict[Hashable, Any]]
        An optional dict in which to save meta data about this run
    indra_date : Optional[str]
        The date of the sif dump used to create the graph
    depmap_date : Optional[str]
        The date (usually a quarter, e.g. 19Q4) the depmap data was published
        on depmap.org
    sample_size : Optional[int]
        The approximate number of correlation pairs to keep after
        down-sampling the correlation matrix
    shuffle : Optional[bool]
        If True, shuffle the correlation matrix. This is good to do in case
        the input data have some sort of structure that could lead to large
        discrepancies between compute times for the different processes.
        Default: False.
    overwrite : Optional[bool]
        If True, overwrite any output files. Default: False.
    normalize_names : Optional[bool]
        If True, try to normalize the names in the correlation matrix that
        are not found in the provided graph. Default: False.
    argparse_dict : Optional[Dict[str, Union[str, float, int, List[str]]]]
        Provide the argparse options from running this file as a script
    """
    global indranet, hgnc_node_mapping, output_list
    indranet = file_opener(indra_net)
    assert isinstance(indranet, nx.DiGraph)

    assert expl_funcs is None or isinstance(expl_funcs, (list, tuple, set))

    # 1. Check options
    sd_l, sd_u = sd_range if sd_range and len(sd_range) == 2 else \
        ((sd_range[0], None) if sd_range and len(sd_range) == 1 else
         (None, None))

    if not random and not sd_l and not sd_u:
        raise ValueError('Must specify at least a lower bound for the SD '
                         'range or flag run for random explanation')

    if graph_type == 'pybel' and not pb_node_mapping:
        raise ValueError('Must provide PyBEL node mapping with option '
                         'pb_node_mapping if graph type is "pybel"')

    if apriori_explained:
        if apriori_explained is True or mito_file_name in apriori_explained:
            # Run default
            apriori_explained = get_mitocarta_info(mito_file)
        else:
            # Hope it's a csv/tsv
            try:
                expl_df = pd.read_csv(apriori_explained)
                apriori_explained = {
                    e: d
                    for e, d in zip(expl_df.name, expl_df.description)
                }
            except Exception as err:
                raise ValueError('A-priori explained entities must be in a '
                                 'file that can be parsed as CSV/TSV with '
                                 'column names "name" for entity name and '
                                 '"description" for explanation why the '
                                 'entity is explained.') \
                    from err

        logger.info(f'Using explained set with '
                    f'{len(apriori_explained)} entities')

    outname = outname if outname.endswith('.pkl') else \
        outname + '.pkl'
    if not overwrite:
        if outname.startswith('s3://'):
            s3 = get_s3_client(unsigned=False)
            if S3Path.from_string(outname).exists(s3):
                raise FileExistsError(f'File {str(outname)} already exists!')
        elif Path(outname).is_file():
            raise FileExistsError(f'File {str(outname)} already exists!')

    if z_score is not None and Path(z_score).is_file():
        z_corr = pd.read_hdf(z_score)
    else:
        z_sc_options = {
            'crispr_raw': raw_data[0],
            'rnai_raw': raw_data[1],
            'crispr_corr': raw_corr[0],
            'rnai_corr': raw_corr[1],
            'z_corr_path': z_score
        }
        z_corr = run_corr_merge(**z_sc_options)

    if reactome_path:
        up2path, _, pathid2pathname = file_opener(reactome_path)
        reactome_dict = {
            'uniprot_mapping': up2path,
            'pathid_name_mapping': pathid2pathname
        }
    else:
        reactome_dict = None

    # Get mapping of correlation names to pybel nodes
    if graph_type == 'pybel':
        if isinstance(pb_node_mapping, dict):
            hgnc_node_mapping = pb_node_mapping
        elif isinstance(pb_node_mapping, str) and \
                Path(pb_node_mapping).is_file():
            hgnc_node_mapping = file_opener(pb_node_mapping)
        else:
            raise ValueError('Could not load pybel node mapping')

    # 2. Filter to SD range OR run random sampling
    if random:
        logger.info('Doing random sampling through df.sample')
        z_corr = z_corr.sample(142, axis=0)
        z_corr = z_corr.filter(list(z_corr.index), axis=1)
        # Remove correlation values to not confuse with real data
        z_corr.loc[:, :] = 0
    else:
        if sd_l and sd_u:
            logger.info(f'Filtering correlations to {sd_l} - {sd_u} SD')
            z_corr = z_corr[((z_corr > sd_l) & (z_corr < sd_u)) |
                            ((z_corr < -sd_l) & (z_corr > -sd_u))]
        elif isinstance(sd_l, (int, float)) and sd_l and not sd_u:
            logger.info(f'Filtering correlations to {sd_l}+ SD')
            z_corr = z_corr[(z_corr > sd_l) | (z_corr < -sd_l)]

    sd_range = (sd_l, sd_u) if sd_u else (sd_l, None)

    # Pick a sample
    if sample_size is not None and not random:
        logger.info(f'Reducing correlation matrix to a random approximately '
                    f'{sample_size} correlation pairs.')
        z_corr = down_sample_df(z_corr, sample_size)

    # Shuffle corr matrix without removing items
    elif shuffle and not random:
        logger.info('Shuffling correlation matrix...')
        z_corr = z_corr.sample(frac=1, axis=0)
        z_corr = z_corr.filter(list(z_corr.index), axis=1)

    if normalize_names:
        logger.info('Normalizing correlation matrix column names')
        z_corr = normalize_corr_names(z_corr, indranet)
    else:
        logger.info('Leaving correlation matrix column names as is')

    # 4. Add meta data
    info_dict = {}
    if info:
        info_dict['info'] = info

    # Set the script_settings
    script_settings = {
        'raw_data': raw_data,
        'raw_corr': raw_corr,
        'z_score': z_score,
        'random': random,
        'indranet': indra_net,
        'shuffle': shuffle,
        'sample_size': sample_size,
        'n_chunks': n_chunks,
        'outname': outname,
        'apriori_explained': (apriori_explained
                              if isinstance(apriori_explained, str)
                              else 'no info'),
        'graph_type': graph_type,
        'pybel_node_mapping': (pb_node_mapping
                               if isinstance(pb_node_mapping, str)
                               else 'no info'),
        'argparse_info': argparse_dict
    }

    # Create output list in global scope
    output_list = []
    explanations = match_correlations(corr_z=z_corr,
                                      sd_range=sd_range,
                                      script_settings=script_settings,
                                      graph_filepath=indra_net,
                                      z_corr_filepath=z_score,
                                      apriori_explained=apriori_explained,
                                      graph_type=graph_type,
                                      allowed_ns=allowed_ns,
                                      allowed_sources=allowed_sources,
                                      is_a_part_of=is_a_part_of,
                                      expl_funcs=expl_funcs,
                                      reactome_filepath=reactome_path,
                                      indra_date=indra_date,
                                      info=info_dict,
                                      depmap_date=depmap_date,
                                      n_chunks=n_chunks,
                                      immediate_only=immediate_only,
                                      return_unexplained=return_unexplained,
                                      reactome_dict=reactome_dict,
                                      subset_list=subset_list)
    if outname.startswith('s3://'):
        try:
            logger.info(f'Uploading results to s3: {outname}')
            s3 = get_s3_client(unsigned=False)
            s3outpath = S3Path.from_string(outname)
            explanations.s3_location = s3outpath.to_string()
            s3outpath.upload(s3=s3, body=pickle.dumps(explanations))
            logger.info('Finished uploading results to s3')
        except Exception:
            new_path = Path(outname.replace('s3://', ''))
            logger.warning(f'Something went wrong in s3 upload, trying to '
                           f'save locally instead to {new_path}')
            new_path.parent.mkdir(parents=True, exist_ok=True)
            dump_it_to_pickle(fname=new_path.absolute().resolve().as_posix(),
                              pyobj=explanations,
                              overwrite=overwrite)

    else:
        # mkdir in case it doesn't exist
        outpath = Path(outname)
        logger.info(f'Dumping results to {outpath}')
        outpath.parent.mkdir(parents=True, exist_ok=True)
        dump_it_to_pickle(fname=outpath.absolute().resolve().as_posix(),
                          pyobj=explanations,
                          overwrite=overwrite)
    logger.info('Script finished')
    explanations.summarize()
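A hypothetical invocation of main (all paths are placeholders), restricting correlations to 3+ standard deviations on an unsigned graph:

main(indra_net='indranet_dir_graph.pkl',
     z_score='depmap_z_corr.h5',
     outname='expl_3sd_unsigned.pkl',
     graph_type='unsigned',
     sd_range=(3, None))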
Example #11
def run_corr_merge(crispr_raw: Optional[Union[str, pd.DataFrame]] = None,
                   rnai_raw: Optional[Union[str, pd.DataFrame]] = None,
                   crispr_corr: Optional[Union[str, pd.DataFrame]] = None,
                   rnai_corr: Optional[Union[str, pd.DataFrame]] = None,
                   output_dir: str = 'correlation_output',
                   remove_self_corr: bool = False,
                   random_sampl: int = 0,
                   save_corr_files: bool = False,
                   z_corr_path: Optional[str] = None):
    """Return a merged correlation matrix from DepMap data

    Start with either the raw DepMap files or pre-calculated
    correlation matrices

    Parameters
    ----------
    crispr_raw : str|pd.DataFrame
        Path to the raw crispr data. This file is typically named
        'Achilles_gene_effect.csv' at the DepMap portal.
    rnai_raw : str|pd.DataFrame
        Path to the raw RNAi data. This file is typically named
        'D2_combined_gene_dep_scores.csv'
    crispr_corr : str|pd.DataFrame
        Path to the pre-calculated crispr data matrix. This data structure
        is the result from running `crispr_raw_df.corr()`.
    rnai_corr : str|pd.DataFrame
        Path to the pre-calculated rnai data matrix. This data structure
        is the result from running `rnai_raw_df.corr()`.
    output_dir : str
        If used, write the correlation matrices to this directory.
        Otherwise they will be written to the same directory as the raw
        input data.
    remove_self_corr : bool
        If True, remove self correlations from the resulting DataFrame.
        Default: False
    random_sampl : int
        If specified, provides the size of the final correlation matrix
        where the genes are picked at random from the intersection of genes
        from both the RNAI and CRISPR data sets.
    save_corr_files : bool
        If True, save the intermediate correlation data frames for both
        crispr and rnai. Default: False.
    z_corr_path : Optional[str]
        If provided, save the final correlation dataframe here

    Returns
    -------
    pd.DataFrame
        A data frame containing the combined z-score matrix with NaN's
        removed.
    """
    if crispr_raw is None and crispr_corr is None:
        raise ValueError('Need to provide one of crispr_raw or crispr_corr')
    if rnai_raw is None and rnai_corr is None:
        raise ValueError('Need to provide one of rnai_raw or rnai_corr')

    # First check for correlation matrix, then get it if it doesn't exist
    if crispr_corr:
        if isinstance(crispr_corr, str):
            logger.info(f'Reading crispr correlations from file {crispr_corr}')
            crispr_corr_df = pd.read_hdf(crispr_corr)
        else:
            crispr_corr_df = crispr_corr
    else:
        # Create new one, write to input file's directory
        if isinstance(crispr_raw, str):
            logger.info(f'Reading raw DepMap data from {crispr_raw}')
            crispr_raw_df = io.file_opener(crispr_raw, index_col=0)
        else:
            crispr_raw_df = crispr_raw
        crispr_corr_df = raw_depmap_to_corr(crispr_raw_df, split_names=True,
                                            dropna=False)

        if save_corr_files:
            crispr_fpath = Path(output_dir).joinpath(
                '_crispr_all_correlations.h5')
            logger.info(f'Saving crispr correlation matrix to {crispr_fpath}')
            if not crispr_fpath.parent.is_dir():
                crispr_fpath.parent.mkdir(parents=True, exist_ok=True)
            crispr_corr_df.to_hdf(crispr_fpath.absolute(), 'corr')

    if rnai_corr:
        if isinstance(rnai_corr, str):
            logger.info(f'Reading rnai correlations from file {rnai_corr}')
            rnai_corr_df = pd.read_hdf(rnai_corr)
        else:
            rnai_corr_df = rnai_corr
    else:
        # Create new one, write to input file's directory
        if isinstance(rnai_raw, str):
            logger.info(f'Reading raw DepMap data from {rnai_raw}')
            rnai_raw_df = io.file_opener(rnai_raw, index_col=0)
        else:
            rnai_raw_df = rnai_raw

        # Check if we need to transpose the df
        if len(set(crispr_corr_df.columns.values) &
               set([n.split()[0] for n in rnai_raw_df.columns])) == 0:
            logger.info('Transposing RNAi raw data dataframe...')
            rnai_raw_df = rnai_raw_df.T

        rnai_corr_df = raw_depmap_to_corr(rnai_raw_df, split_names=True,
                                          dropna=False)

        if save_corr_files:
            rnai_fpath = Path(output_dir).joinpath('_rnai_all_correlations.h5')
            if not rnai_fpath.parent.is_dir():
                rnai_fpath.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f'Saving rnai correlation matrix to {rnai_fpath}')
            rnai_corr_df.to_hdf(rnai_fpath.absolute().as_posix(), 'corr')

    # Merge the correlation matrices
    z_cm = merge_corr_df(crispr_corr_df, rnai_corr_df,
                         remove_self_corr)

    if random_sampl and random_sampl < len(z_cm.columns):
        # Get n random rows
        z_cm = z_cm.sample(n=random_sampl)

        # Make square
        z_cm = z_cm[list(z_cm.index.values)]

    assert z_cm.notna().sum().sum() > 0, 'Correlation matrix is empty'

    if z_corr_path:
        zc_path = Path(z_corr_path)
        zc_path.parent.mkdir(parents=True, exist_ok=True)
        z_cm.to_hdf(zc_path, 'corr')

    return z_cm
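A hypothetical call starting from pre-computed correlation matrices rather than raw data:

z_cm = run_corr_merge(crispr_corr='_crispr_all_correlations.h5',
                      rnai_corr='_rnai_all_correlations.h5',
                      remove_self_corr=True,
                      z_corr_path='combined_z_score.h5')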
Example #12
        type=int,
        default=1,
        help="Set the number of chunks to split the data into to run "
        "multiprocessing. If set to e.g. 4, 4 workers will be started "
        "to run async with multiprocessing.Pool.apply_async. If set to "
        "1 (default), no multiprocessing will be used.",
    )

    args = parser.parse_args()

    # defaults not up to user:
    #   - shuffle the stuff
    #   - explanation functions

    # Load graph
    graph = file_opener(args.graph)

    # Load corr
    logger.info(f"Loading z-score dataframe {args.z_score}")
    z_corr = pd.read_hdf(args.z_score)
    logger.info("Done loading dataframe")

    # Set kwargs
    kwargs = dict(
        indra_net=graph,
        z_score=z_corr,
        graph_type=args.graph_type,
        expl_funcs=[
            "apriori_explained",
            "common_reactome_paths",
            "find_cp",
Example #13
def test_depmap_script():
    up2path, _, pathid2pathname = file_opener(
        's3://depmap-analysis/misc_files/reactome_pathways.pkl')
    reactome_dict = {
        'uniprot_mapping': up2path,
        'pathid_name_mapping': pathid2pathname
    }
    df = get_df()
    idg = get_dg()

    up1 = 'A0A075B6P5'
    up2 = 'A5LHX3'
    hgnc_id1 = reverse_uniprot[up1]
    hgnc_name1 = hgnc_names[hgnc_id1]
    hgnc_id2 = reverse_uniprot[up2]
    hgnc_name2 = hgnc_names[hgnc_id2]

    idg.add_node(hgnc_name1, ns='HGNC', id=hgnc_id1)
    idg.add_node(hgnc_name2, ns='HGNC', id=hgnc_id2)
    not_in_graph = 'not_in_graph'

    # Make correlation matrix with all combinations from the df pairs
    all_names = list(set(df.agA_name.values) | set(df.agB_name.values)) + \
        [not_in_graph, hgnc_name1, hgnc_name2]
    all_names.sort()
    corr_m = _gen_sym_df(len(all_names))
    corr_m.columns = all_names
    corr_m.index = all_names

    func_names = [
        'expl_ab', 'expl_ba', 'expl_axb', 'expl_bxa', 'get_sr', 'get_st',
        react_funcname
    ]

    func_map = {
        funcname_to_colname[fname]: expl_functions[fname]
        for fname in func_names
    }
    bool_columns = ('not_in_graph', 'explained') + tuple(func_map.keys())
    stats_columns = id_columns + bool_columns

    _type = 'unsigned'

    corr_pairs = corr_matrix_to_generator(corr_m)
    stats_dict, expl_dict = _match_correlation_body(
        corr_iter=corr_pairs,
        expl_types=func_map,
        stats_columns=stats_columns,
        expl_cols=expl_columns,
        bool_columns=bool_columns,
        _type=_type,
        return_unexplained=False,
        reactome_dict=reactome_dict,
        local_indranet=idg,
        apriori_explained=None)

    assert set(stats_columns) == set(stats_dict.keys())
    assert set(expl_columns) == set(expl_dict.keys())

    expl_df = pd.DataFrame(expl_dict)
    stats_df = pd.DataFrame(stats_dict)

    # Test content
    # Any connection with not_in_graph should have not_in_graph set to True
    assert all(b for b in
               stats_df[(stats_df.agA == not_in_graph) |
                        (stats_df.agB == not_in_graph)].not_in_graph), \
        str([b for b in stats_df[(stats_df.agA == not_in_graph) |
                                 (stats_df.agB == not_in_graph)].not_in_graph])

    assert all(
        np.isnan(b)
        for b in stats_df[(stats_df.agA == not_in_graph)
                          | (stats_df.agB == not_in_graph)].explained)

    expected = {
        'not_in_graph': False,
        'explained': True,
        ab_colname: False,
        ba_colname: False,
        axb_colname: False,  # Not True, as pairs go alphabetically
        bxa_colname: True,  # True from testing Y2,Z2
        sr_colname: False,
        st_colname: False,
        react_colname: False
    }
    p = 'Y2_Z2'
    res = stats_df[list(bool_columns)][stats_df.pair == p].to_dict(
        orient='records')[0]
    for k, b in res.items():
        assert b == expected[k]

    assert expl_df[(expl_df.pair == p) & (
        expl_df.expl_type == bxa_colname)].expl_data.values[0] == ['X2']
    assert len(expl_df[(expl_df.pair == p)
                       & (expl_df.expl_type == sr_colname)]) == 0
    assert len(expl_df[(expl_df.pair == p)
                       & (expl_df.expl_type == st_colname)]) == 0

    expected = {
        'not_in_graph': False,
        'explained': True,
        ab_colname: False,
        ba_colname: False,
        axb_colname: False,
        bxa_colname: False,
        sr_colname: True,
        st_colname: True,
        react_colname: False
    }
    p = 'X1_X2'
    res: Dict = stats_df[list(bool_columns)][stats_df.pair == p].to_dict(
        orient='records')[0]
    for k, b in res.items():
        assert b == expected[k]

    assert expl_df[(expl_df.pair == p) & (
        expl_df.expl_type == sr_colname)].expl_data.values[0][2] == ['Z2']
    assert expl_df[(expl_df.pair == p) & (
        expl_df.expl_type == st_colname)].expl_data.values[0][2] == ['Z1']

    # Check that reactome is explained, and not counted as among the explained
    len_react = len(stats_df[stats_df[react_colname] == True])
    assert len_react == 1, len_react
    len_react = len(stats_df[(stats_df[react_colname] == True)
                             & (stats_df.explained == False)])
    assert len_react == 1, len_react

    # Test getting interesting df
    interesting_df = get_non_reactome_axb_expl_df(graph=idg,
                                                  stats_df=stats_df,
                                                  expl_df=expl_df,
                                                  z_corr=corr_m)
    assert len(interesting_df) == 5
    assert set(interesting_df.pair) == {'X1_X2', 'Y1_Z2', 'Y2_Z2', 'Z1_Z2'}
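_gen_sym_df is a test helper defined elsewhere in the test module; a plausible stand-in (an assumption, not the project's implementation) that produces the symmetric correlation matrix the test relies on:

import numpy as np
import pandas as pd

def _gen_sym_df_sketch(size: int) -> pd.DataFrame:
    # Symmetric matrix of pseudo z-scores with a zero diagonal
    m = np.random.normal(scale=3.0, size=(size, size))
    sym = (m + m.T) / 2
    np.fill_diagonal(sym, 0.0)
    return pd.DataFrame(sym)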
Example #14
                           status='pending',
                           fname=meta_name,
                           location=meta_loc,
                           result_location=result_loc)
    logger.info(f'Updating {qh} to pending')
    bgt.add_task(upload_json, job_status)
    bgt.add_task(handle_query, search_query, job_status)
    return job_status


# Change to 'online' after everything is loaded
# asyncio.sleep(5)  # Simulate loading something <- not allowed, can't await
# outside async function
if ROLE == 'UNSIGNED':
    logger.info('Assuming role as unsigned worker')
    indra_graph = file_opener(FILES['dir_graph']) if FILES['dir_graph'] else\
        None
    if isinstance(indra_graph, (DiGraph, MultiGraph)):
        STATUS.graph_stats['unsigned_edges'] = len(indra_graph.edges)
        STATUS.graph_stats['unsigned_nodes'] = len(indra_graph.nodes)
        network_search_api = IndraNetwork(indra_dir_graph=indra_graph)
        network_search_api.verbose = 1
elif ROLE == 'SIGNED':
    logger.info('Assuming role as signed worker')
    indra_seg = file_opener(
        FILES['sign_edge_graph']) if FILES['sign_edge_graph'] else None
    if isinstance(indra_seg, (DiGraph, MultiGraph)):
        STATUS.graph_stats['signed_edge_edges'] = len(indra_seg.edges)
        STATUS.graph_stats['signed_edge_nodes'] = len(indra_seg.nodes)
    indra_sng = file_opener(FILES['sign_node_graph']) if indra_seg and \
        FILES['sign_node_graph'] else None
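ROLE and FILES are assumed to come from the service configuration loaded earlier in the module; an illustrative (hypothetical) shape of what the code above expects:

ROLE = 'UNSIGNED'  # or 'SIGNED'
FILES = {
    'dir_graph': 'graphs/indranet_dir_graph.pkl',  # placeholder paths
    'sign_edge_graph': 'graphs/indranet_sign_edge_graph.pkl',
    'sign_node_graph': 'graphs/indranet_sign_node_graph.pkl',
}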
Example #15
def sif_dump_df_merger(df: pd.DataFrame,
                       graph_type: str,
                       sign_dict: Optional[Dict[str, int]] = None,
                       stmt_types: Optional[List[str]] = None,
                       mesh_id_dict: Optional[Dict[str, str]] = None,
                       set_weights: bool = True,
                       verbosity: int = 0):
    """Merge the sif dump df with the provided dictionaries

    Parameters
    ----------
    df : str|pd.DataFrame
        A dataframe, either as a file path to a pickle or csv, or a pandas
        DataFrame object.
    graph_type : str
        If 'signed-expanded' or 'digraph-signed-types', do extra filtering
        or alteration to the DataFrame to produce an expanded signed graph
        or a reduced digraph with only the signed types
    sign_dict : Optional[Dict[str, int]]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : Optional[List[str]]
        Provide a list of statement types to be used if expanding the signed
        graph to include statements of these types
    mesh_id_dict : dict
        A dict object mapping statement hashes to all mesh ids sharing a
        common PMID
    set_weights : bool
        If True, set the edge weights. Default: True.
    verbosity : int
        Output various extra messages if > 1.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame with new columns from the merge
    """
    if isinstance(df, str):
        merged_df = file_opener(df)
    else:
        merged_df = df

    if 'hash' in merged_df.columns:
        merged_df.rename(columns={'hash': 'stmt_hash'}, inplace=True)

    # Extend df with these columns:
    #   english string from mock statements
    #   mesh_id mapped by dict (if provided)
    #   z-score values (if provided)
    # Extend df with famplex rows
    # 'stmt_hash' must exist as column in the input dataframe for merge to work
    # Preserve all rows in merged_df, so do left join:
    # merged_df.merge(other, how='left', on='stmt_hash')

    if graph_type == 'signed-expanded' and sign_dict and stmt_types:
        merged_df = expand_signed(merged_df, sign_dict, stmt_types)
    elif graph_type == 'signed-expanded' and not (sign_dict and stmt_types):
        raise ValueError('Must provide statement types using variable '
                         '`stmt_types` to run signed_expanded graph')

    if mesh_id_dict is not None:
        hashes = []
        mesh_ids = []
        for k, v in mesh_id_dict.items():
            hashes.append(int(k))
            mesh_ids.append(v)

        merged_df = merged_df.merge(right=pd.DataFrame(data={
            'stmt_hash': hashes,
            'mesh_ids': mesh_ids
        }),
                                    how='left',
                                    on='stmt_hash')

    # Check for missing hashes
    if merged_df['source_counts'].isna().sum() > 0:
        logger.warning('%d rows with missing evidence found' %
                       merged_df['source_counts'].isna().sum())
        if verbosity > 1:
            logger.info('Missing hashes in stratified evidence dict: %s' %
                        list(merged_df['stmt_hash'][
                            merged_df['source_counts'].isna() == True]))

    logger.info('Setting "curated" flag')
    # Map to boolean 'curated' for reader/non-reader
    merged_df['curated'] = merged_df['source_counts'].apply(func=_curated_func)

    # Make english statement
    merged_df['english'] = merged_df.apply(_english_from_row, axis=1)

    if set_weights:
        logger.info('Setting edge weights')
        # Add weight: -log(belief) or 1/evidence count if no belief
        has_belief = (merged_df['belief'].isna() == False)
        has_no_belief = (merged_df['belief'].isna() == True)
        merged_df['weight'] = 0
        if has_belief.sum() > 0:
            merged_df.loc[has_belief, 'weight'] = merged_df['belief'].apply(
                func=_weight_from_belief)
        if has_no_belief.sum() > 0:
            merged_df.loc[has_no_belief, 'weight'] = \
                merged_df['evidence_count'].apply(
                    func=lambda ec: 1/np.longdouble(ec))
    else:
        logger.info('Skipping setting belief weight')

    return merged_df
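Based on the in-code comment ('-log(belief) or 1/evidence count if no belief'), a plausible sketch of the belief-to-weight helper; the real _weight_from_belief is defined elsewhere in the module and may clamp differently:

import numpy as np

def _weight_from_belief_sketch(belief: float) -> float:
    # Guard against log(0) by clamping to the smallest positive float
    return float(-np.log(max(belief, np.finfo(np.float64).tiny)))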
Example #16
    #                      'or provide node mapping with option '
    #                      'if graph type is pybel')
    # # Only model provided: create mapping
    # if arg_dict.get('pybel_model') and \
    #         not arg_dict.get('pybel_node_mapping'):
    #     mapping = pybel_node_name_mapping(
    #         node_names=hgnc_names, node_ns='HGNC',
    #         pb_model=file_opener(arg_dict['pybel_model'])
    #     )
    #     arg_dict['pb_node_mapping'] = mapping
    # # Mapping is provided: load the mapping
    # elif arg_dict.get('pybel_node_mapping'):
    #     if arg_dict['pybel_node_mapping'].endswith('.pkl'):
    #         arg_dict['pb_node_mapping'] = \
    #             file_opener(arg_dict['pybel_node_mapping'])
    #     elif arg_dict['pybel_node_mapping'].endswith('.json'):
    #         arg_dict['pb_node_mapping'] = \
    #             file_opener(arg_dict['pybel_node_mapping'])
    #     else:
    #         raise ValueError('Unknown file type %s' %
    #                          arg_dict['pybel_node_mapping'].split('.')[-1])

    if args.subset_list:
        df: pd.DataFrame = file_opener(args.subset_list)
        arg_dict['subset_list'] = list(df.name.values)

    main_keys = inspect.signature(main).parameters.keys()
    kwargs = {k: v for k, v in arg_dict.items() if k in main_keys}

    main(**kwargs)
Example #17
def sif_dump_df_to_digraph(df: Union[pd.DataFrame, str],
                           date: str,
                           mesh_id_dict: Optional[Dict] = None,
                           graph_type: GraphTypes = 'digraph',
                           include_entity_hierarchies: bool = True,
                           sign_dict: Optional[Dict[str, int]] = None,
                           stmt_types: Optional[List[str]] = None,
                           z_sc_path: Optional[Union[str, pd.DataFrame]] = None,
                           verbosity: int = 0) \
        -> Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]:
    """Return a NetworkX digraph from a pandas dataframe of a db dump

    Parameters
    ----------
    df : Union[str, pd.DataFrame]
        A dataframe, either as a file path to a file (.pkl or .csv) or a
        pandas DataFrame object.
    date : str
        A date string specifying when the data was dumped from the database.
    mesh_id_dict : dict
        A dict object mapping statement hashes to all mesh ids sharing a 
        common PMID
    graph_type : str
        Return type for the returned graph. Currently supports:
            - 'digraph': DiGraph (Default)
            - 'multidigraph': MultiDiGraph
            - 'signed': Tuple[MultiDiGraph, DiGraph]
            - 'signed-expanded': Tuple[MultiDiGraph, DiGraph]
            - 'digraph-signed-types':  DiGraph
    include_entity_hierarchies : bool
        If True, add edges between nodes if they are related ontologically
        with stmt type 'fplx': e.g. BRCA1 is in the BRCA family, so an edge
        is added between the nodes BRCA and BRCA1. Default: True. Note that
        this option is only available for the digraph and multidigraph
        graph types.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. By default only Activation and IncreaseAmount are added as
        positive edges and Inhibition and DecreaseAmount are added as
        negative edges, but a user can pass any other Statement types in a
        dictionary.
    stmt_types : List[str]
        A list of statement types to expand out to other signs
    z_sc_path : Optional[Union[str, pd.DataFrame]]
        If provided, either a square dataframe or a file path to one, with
        HGNC symbols as names on the axes and floats as entries
    verbosity : int
        Output various messages if > 0. For all messages, set to 4.

    Returns
    -------
    Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]
        The type is determined by the graph_type argument
    """
    graph_options = ('digraph', 'multidigraph', 'signed', 'signed-expanded',
                     'digraph-signed-types')
    if graph_type.lower() not in graph_options:
        raise ValueError(f'Graph type {graph_type} not supported. Can only '
                         f'choose among {graph_options}')
    sign_dict = sign_dict if sign_dict else default_sign_dict

    graph_type = graph_type.lower()
    date = date if date else datetime.now().strftime('%Y-%m-%d')

    if isinstance(df, str):
        sif_df = file_opener(df)
    else:
        sif_df = df

    if z_sc_path is not None:
        if isinstance(z_sc_path, str):
            if z_sc_path.endswith('h5'):
                logger.info(f'Loading z-scores from {z_sc_path}')
                z_sc_df = pd.read_hdf(z_sc_path)
            elif z_sc_path.endswith('pkl'):
                logger.info(f'Loading z-scores from {z_sc_path}')
                z_sc_df: pd.DataFrame = file_opener(z_sc_path)
            else:
                raise ValueError(f'Unrecognized file: {z_sc_path}')
        elif isinstance(z_sc_path, pd.DataFrame):
            z_sc_df = z_sc_path
        else:
            raise ValueError('Only file paths and data frames allowed as '
                             'arguments to z_sc_path')
    else:
        z_sc_df = None

    # If signed types: filter out rows of unsigned types
    if graph_type == 'digraph-signed-types':
        sif_df = sif_df[sif_df.stmt_type.isin(sign_dict.keys())]

    sif_df = sif_dump_df_merger(sif_df,
                                graph_type,
                                sign_dict,
                                stmt_types,
                                mesh_id_dict,
                                verbosity=verbosity)

    # Map ns:id to node name
    logger.info('Creating dictionary mapping (ns,id) to node name')
    ns_id_name_tups = set(zip(
        sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
            set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    # Map hashes to edge for non-signed graphs
    if graph_type in {'multidigraph', 'digraph', 'digraph-signed-types'}:
        logger.info('Creating dictionary mapping hashes to edges for '
                    'unsigned graph')
        hash_edge_dict = {
            h: (a, b)
            for a, b, h in zip(sif_df.agA_name, sif_df.agB_name,
                               sif_df.stmt_hash)
        }

    # Create graph from df
    if graph_type == 'multidigraph':
        indranet_graph = IndraNet.from_df(sif_df)
    elif graph_type in ('digraph', 'digraph-signed-types'):
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)
    elif graph_type in ('signed', 'signed-expanded'):
        signed_edge_graph: MultiDiGraph = IndraNet.signed_from_df(
            df=sif_df,
            flattening_method='complementary_belief',
            weight_mapping=_weight_mapping)
        signed_node_graph: DiGraph = signed_edges_to_signed_nodes(
            graph=signed_edge_graph, copy_edge_data=True)
        signed_edge_graph.graph['date'] = date
        signed_node_graph.graph['date'] = date
        signed_edge_graph.graph['node_by_ns_id'] = ns_id_to_nodename
        signed_node_graph.graph['node_by_ns_id'] = ns_id_to_nodename

        # Get hash to signed edge mapping
        logger.info('Creating dictionary mapping hashes to edges for '
                    'signed graphs')
        seg_hash_edge_dict = {} if graph_type == 'signed' else defaultdict(set)
        for edge in signed_edge_graph.edges:
            for es in signed_edge_graph.edges[edge]['statements']:
                if graph_type == 'signed':
                    seg_hash_edge_dict[es['stmt_hash']] = edge
                else:
                    seg_hash_edge_dict[es['stmt_hash']].add(edge)
        signed_edge_graph.graph['edge_by_hash'] = seg_hash_edge_dict

        sng_hash_edge_dict = {} if graph_type == 'signed' else defaultdict(set)
        for edge in signed_node_graph.edges:
            for es in signed_node_graph.edges[edge]['statements']:
                if graph_type == 'signed':
                    sng_hash_edge_dict[es['stmt_hash']] = edge
                else:
                    sng_hash_edge_dict[es['stmt_hash']].add(edge)
        signed_node_graph.graph['edge_by_hash'] = sng_hash_edge_dict
        if z_sc_df is not None:
            # Set z-score attributes
            add_corr_to_edges(graph=signed_edge_graph, z_corr=z_sc_df)
            add_corr_to_edges(graph=signed_node_graph, z_corr=z_sc_df)

        return signed_edge_graph, signed_node_graph
    else:
        raise ValueError(f'Unrecognized graph type {graph_type}. Must be one '
                         f'of: {", ".join(graph_options)}')

    if z_sc_df is not None:
        # Set z-score attributes
        add_corr_to_edges(graph=indranet_graph, z_corr=z_sc_df)

    # Add hierarchy relations to graph (not applicable for signed graphs)
    if include_entity_hierarchies and graph_type in ('multidigraph',
                                                     'digraph'):
        from depmap_analysis.network_functions.famplex_functions import \
            get_all_entities
        logger.info('Fetching entity hierarchy relationships')
        full_entity_list = get_all_entities()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {uri: _id for (ns, _id, uri) in full_entity_list}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to be added to data frame')
        entities = 0
        non_corr_weight = None
        if z_sc_df is not None:
            # Get non-corr weight
            for edge in indranet_graph.edges:
                if indranet_graph.edges[edge]['z_score'] == 0:
                    non_corr_weight = indranet_graph.edges[edge]['corr_weight']
                    break
            assert non_corr_weight is not None
            z_sc_attrs = {'z_score': 0, 'corr_weight': non_corr_weight}
        else:
            z_sc_attrs = {}

        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node

            # Add famplex edge
            for pns, pid in bio_ontology.get_parents(ns, _id):
                puri = get_identifiers_url(pns, pid)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode
                # Check if edge already exists
                if (node, pnode, puri) not in added_pairs:
                    entities += 1
                    # Belief and evidence are conditional
                    added_pairs.add((node, pnode, puri))  # A, B, uri of B
                    ed = {
                        'agA_name': node,
                        'agA_ns': ns,
                        'agA_id': _id,
                        'agB_name': pnode,
                        'agB_ns': pns,
                        'agB_id': pid,
                        'stmt_type': 'fplx',
                        'evidence_count': 1,
                        'source_counts': {
                            'fplx': 1
                        },
                        'stmt_hash': puri,
                        'belief': 1.0,
                        'weight': MIN_WEIGHT,
                        'curated': True,
                        'english': f'{pns}:{pid} is an ontological parent '
                        f'of {ns}:{_id}',
                        'z_score': 0,
                        'corr_weight': 1
                    }
                    # Add non-existing nodes
                    if ed['agA_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agA_name'],
                                                ns=ed['agA_ns'],
                                                id=ed['agA_id'])
                    if ed['agB_name'] not in indranet_graph.nodes:
                        indranet_graph.add_node(ed['agB_name'],
                                                ns=ed['agB_ns'],
                                                id=ed['agB_id'])
                    # Add edges
                    ed.pop('agA_id')
                    ed.pop('agA_ns')
                    ed.pop('agB_id')
                    ed.pop('agB_ns')
                    if indranet_graph.is_multigraph():
                        # MultiDiGraph
                        indranet_graph.add_edge(ed['agA_name'], ed['agB_name'],
                                                **ed)
                    else:
                        # DiGraph
                        u = ed.pop('agA_name')
                        v = ed.pop('agB_name')

                        # Check edge
                        if indranet_graph.has_edge(u, v):
                            indranet_graph.edges[(u,
                                                  v)]['statements'].append(ed)
                        else:
                            indranet_graph.add_edge(u,
                                                    v,
                                                    belief=1.0,
                                                    weight=1.0,
                                                    statements=[ed],
                                                    **z_sc_attrs)

        logger.info('Added %d entity relations to the graph' % entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    indranet_graph.graph['edge_by_hash'] = hash_edge_dict
    indranet_graph.graph['date'] = date
    return indranet_graph
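Hypothetical calls (the sif dump path and date are placeholders); the unsigned call returns a single DiGraph, the signed call returns the (MultiDiGraph, DiGraph) pair:

dir_g = sif_dump_df_to_digraph('sif_dump.pkl', date='2021-01-01',
                               graph_type='digraph')
seg, sng = sif_dump_df_to_digraph('sif_dump.pkl', date='2021-01-01',
                                  graph_type='signed')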
Example #18
    # Get average Jaccard index per drug
    jaccard_ranking = []
    for name, jvs in jaccard_index.items():
        li, lu, ljr = list(zip(*jvs))
        jaccard_ranking.append(
            (name, sum(ljr) / len(ljr), sum(li) / len(li), sum(lu) / len(lu)))
    jaccard_ranking.sort(key=lambda t: t[1], reverse=True)
    df = pd.DataFrame(
        data=jaccard_ranking,
        columns=['drug', 'jaccard_index', 'n_intersection', 'n_union'])

    return global_ranking, df
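The per-drug tuples unpacked above appear to be (intersection size, union size, Jaccard index); for reference, the Jaccard index of two sets is |A ∩ B| / |A ∪ B|:

def jaccard_index(a: set, b: set) -> float:
    # 0.0 by convention when both sets are empty
    union = a | b
    return len(a & b) / len(union) if union else 0.0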


if __name__ == '__main__':
    drug_file = sys.argv[1]
    try:
        sample_size = sys.argv[2]
    except IndexError:
        sample_size = None
    drug_expl = file_opener(drug_file)
    assert isinstance(drug_expl, DepMapExplainer)
    overall_ranking, jaccard_df_per_drug = \
        get_rankings_per_drug(drug_expl.expl_df)
    jaccard_df_per_pair = get_jaccard_rankings_per_pair(
        drug_expl.expl_df, drug_expl.stats_df)

    logger.info('Done with script, results are in variables '
                '`overall_ranking`, `jaccard_df_per_drug` and '
                '`jaccard_df_per_pair`')