def load_reactome(
    self,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]], Dict[str, str]]:
    """Load the reactome data referenced by this object

    The file at ``self.reactome_filepath`` is opened with ``file_opener``
    and is expected to hold a tuple or list of three dicts, in this order:

    1. mappings from UP IDs of genes to Reactome pathway IDs
    2. the reverse mappings, i.e. Reactome pathway IDs to UP IDs
    3. mappings from Reactome pathway IDs to their descriptions

    Only the type of the outer container is verified here, not the dicts.

    Returns
    -------
    Tuple[Dict[str, List[str]], Dict[str, List[str]], Dict[str, str]]

    Raises
    ------
    FileNotFoundError
        If ``self.reactome_filepath`` is None
    AssertionError
        If the loaded object is neither a tuple nor a list
    """
    # Guard: without a file location there is nothing to load
    if self.reactome_filepath is None:
        raise FileNotFoundError(
            "No reactome file location seems to be present in script settings."
        )

    loaded = file_opener(self.reactome_filepath)
    is_sequence = isinstance(loaded, (tuple, list))
    assert is_sequence, (
        f"{self.reactome_filepath} does not seem to contain tuple "
        f"of (upid - pathway mapping, pathway - upid mapping, "
        f"pathway id - pathway description).")
    return loaded
def _loop_explainers(expl_path: str):
    """Gather explainer data from the pickles in a directory, by graph type

    Every '.pkl' file yielded by ``get_dir_iter(expl_path, '.pkl')`` is
    opened with ``file_opener``, passed through ``_get_expl_data`` and the
    result is appended to the list matching the explainer's
    ``script_settings['graph_type']``.

    Parameters
    ----------
    expl_path : str
        The location handed to ``get_dir_iter`` to find the explainer files

    Returns
    -------
    dict
        A dict with the keys 'pybel', 'signed' and 'unsigned', each mapped
        to a list of the data extracted from the explainers of that type
    """
    # One bucket per supported graph type
    buckets = {gtype: [] for gtype in ('pybel', 'signed', 'unsigned')}

    pickle_files = get_dir_iter(expl_path, '.pkl')
    for pickle_file in tqdm(pickle_files):
        explainer: DepMapExplainer = file_opener(pickle_file)
        extracted = _get_expl_data(explainer)
        graph_type = explainer.script_settings['graph_type']
        buckets[graph_type].append(extracted)

    return buckets
def _get_raw(raw: PathObj): if isinstance(raw, str): logger.info(f'Reading raw DepMap data from {raw}') # a) raw_df = io.file_opener(raw, index_col=0) elif isinstance(raw, pd.DataFrame): raw_df = raw else: raw_df = None return raw_df
def load_graph(self) -> Union[nx.DiGraph, nx.MultiDiGraph]:
    """Load the graph stored at ``self.graph_filepath``

    The file is opened with ``file_opener`` and the loaded object is
    verified to be a networkx DiGraph or MultiDiGraph.

    Returns
    -------
    Union[nx.DiGraph, nx.MultiDiGraph]

    Raises
    ------
    AssertionError
        If the loaded object is not one of the accepted graph classes
    """
    accepted_classes = (nx.DiGraph, nx.MultiDiGraph)
    loaded = file_opener(self.graph_filepath)
    assert isinstance(loaded, accepted_classes)
    return loaded
def drugs_to_corr_matrix(raw_file: str, info_file: str):
    """Build a correlation matrix from raw drug data

    Both files are read with ``io.file_opener`` using their first column as
    index. The column labels of the raw data are then looked up in the
    index of the info data and replaced with the value in its 'name'
    column before the data is handed to ``raw_depmap_to_corr``.

    Parameters
    ----------
    raw_file : str
        Path to DepMap PRISM drug repurposing data file. Should match
        primary-screen-replicate-collapsed-logfold-change.csv
    info_file : str
        Path to DepMap PRISM drug repurposing info file. Should match
        primary-screen-replicate-collapsed-treatment-info.csv

    Returns
    -------
    The output of ``raw_depmap_to_corr`` for the renamed raw data
    """
    raw_df: pd.DataFrame = io.file_opener(raw_file, index_col=0)
    info_df: pd.DataFrame = io.file_opener(info_file, index_col=0)

    # Translate each drug ID column label to the corresponding drug name
    drug_names = []
    for drug_id in raw_df.columns:
        drug_names.append(info_df.loc[drug_id]['name'])
    raw_df.columns = drug_names

    return raw_depmap_to_corr(raw_df)
def test_reactome_expl():
    """Check that two genes sharing a reactome pathway get explained by it

    Loads the reactome pickle from S3, verifies that the two UP IDs used
    share exactly the expected pathway and then runs the reactome
    explanation function on the corresponding HGNC names.
    """
    reactome_data = file_opener(
        's3://depmap-analysis/misc_files/reactome_pathways.pkl')
    up2path, _, pathid2pathname = reactome_data
    reactome_dict = {
        'uniprot_mapping': up2path,
        'pathid_name_mapping': pathid2pathname,
    }
    react_func: Callable = expl_functions[react_funcname]

    # The two UP IDs under test and what they are expected to share
    first_up, second_up = 'A0A075B6P5', 'A5LHX3'
    shared_pathways = {'R-HSA-2871837'}
    shared_descriptions = ['FCERI mediated NF-kB activation']
    common = set(up2path[first_up]) & set(up2path[second_up])
    assert shared_pathways == common

    # Translate the UP IDs to HGNC names: UP ID -> HGNC ID -> HGNC name
    first_name = hgnc_names[reverse_uniprot[first_up]]
    second_name = hgnc_names[reverse_uniprot[second_up]]

    s, o, explained, data = react_func(
        first_name, second_name, 0.0, nx.DiGraph(), 'unsigned',
        reactome_dict
    )
    assert explained
    assert data == shared_descriptions, str(data or 'None returned')
parser.add_argument('--z-corr', type=file_path('h5'), required=True, help='The path to the stored correlation matrix as ' 'a pandas DataFrame') parser.add_argument('--reactome', type=file_path('pkl'), required=True, help='The reactome pickle') args = parser.parse_args() z_sc_file = Path(args.z_corr) reactome_file = Path(args.reactome) sd_ranges = [('rnd', None), (2, 3), (3, 4), (4, 5), (5, None)] # Only need first dict reactome_mapping = file_opener(reactome_file)[0] # Load corr matrix z_sc_full = pd.read_hdf(z_sc_file) assert isinstance(z_sc_full, pd.DataFrame) all_stats = { 'range': [], 'checked': [], 'has_pathways': [], 'has_pathways_norm': [] } data_frames = {} for ll, ul in sd_ranges: # Filter matrix
indexer = count(0) processed_explainers = [] for explainer_file in input_iter: logger.info( f'> > > > ' f'Processing {explainer_file} ' f'{datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")} (UTC)' f' < < < <' ) explainer_out = _joinpath( output_dir, str(explainer_file).split('/')[-1].split('.')[0] ) logger.info(f'Saving output to {explainer_out}') if not dry: # Load pickle explainer = file_opener(explainer_file) try: assert isinstance(explainer, DepMapExplainer) except AssertionError: logger.warning(f'File {explainer_file} is not ' f'DepMapExplainer, skipping...') continue # Backwards compatibility: check if s3_location attribute # exists, otherwise set it and then re-upload. If attribute # exists but is not set, set it and re-upload try: if not explainer.s3_location: _save(fpath=explainer_file, expl_inst=explainer) except AttributeError: _save(fpath=explainer_file, expl_inst=explainer)
def get_corr_stats_axb(
    self,
    z_corr: Optional[Union[str, pd.DataFrame]] = None,
    max_proc: Optional[int] = None,
    max_so_pairs_size: int = 10000,
    mp_pairs: bool = True,
    run_linear: bool = False,
) -> Results:
    """Get statistics of the correlations from different explanation types

    The statistics are looked for in three places, in order: the
    ``corr_stats_axb`` attribute, the S3 location given by
    ``self.get_s3_corr_stats_path()`` and, if neither has them, they are
    generated with ``axb_stats`` and then uploaded to that S3 location.

    Note: the provided options have no effect if the data is loaded from
    cache.

    Parameters
    ----------
    z_corr : Optional[Union[pd.DataFrame, str]]
        A pd.DataFrame containing the correlation z scores used to create
        the statistics in this object, or a str that is passed on as
        `local_file_path` to ``self.load_z_corr``. If not provided,
        ``self.load_z_corr()`` is called without arguments.
    max_proc : int > 0
        The maximum number of processes to run in the multiprocessing in
        get_corr_stats_mp. Default: multiprocessing.cpu_count()
    max_so_pairs_size : int
        The maximum number of correlation pairs to process. If the number
        of eligible pairs is larger than this number, a random sample of
        max_so_pairs_size is used. Default: 10 000. If the number of pairs
        to check is smaller than 10 000, no sampling is done.
    mp_pairs : bool
        If True, get the pairs to process using multiprocessing if larger
        than 10 000. Default: True.
    run_linear : bool
        If True, gather the data without multiprocessing. This option is
        good when debugging or if the environment for some reason does not
        support multiprocessing. Default: False.

    Returns
    -------
    Results
        A BaseModel containing correlation data for different explanations
    """
    # Nothing to do if the stats are already held by this object
    if self.corr_stats_axb:
        logger.info("Data already present in corr_stats_axb")
        return self.corr_stats_axb

    s3 = get_s3_client(unsigned=False)

    # Look for previously generated stats on S3
    try:
        corr_stats_loc = self.get_s3_corr_stats_path()
        found_on_s3 = S3Path.from_string(corr_stats_loc).exists(s3)
        if found_on_s3:
            logger.info(f"Found corr stats data at {corr_stats_loc}")
            corr_stats_json = file_opener(corr_stats_loc)
            self.corr_stats_axb = Results(**corr_stats_json)
        else:
            logger.info(f"No corr stats data at found at "
                        f"{corr_stats_loc}")
    except ValueError as ve:
        # Raised when s3 location is not set
        logger.warning(ve)

    # The S3 lookup was enough
    if self.corr_stats_axb:
        return self.corr_stats_axb

    # Not found on s3 or ValueError was raised: generate from scratch
    logger.info("Generating corr stats data")

    # Resolve the correlation matrix
    if z_corr is None:
        z_corr = self.load_z_corr()
    if isinstance(z_corr, str):
        z_corr = self.load_z_corr(local_file_path=z_corr)

    # Reactome data is optional
    try:
        reactome = self.load_reactome()
    except FileNotFoundError:
        logger.info("No reactome file used in script")
        reactome = None

    stats_kwargs = dict(
        z_corr=z_corr,
        reactome=reactome,
        eval_str=False,
        max_proc=max_proc,
        max_corr_pairs=max_so_pairs_size,
        do_mp_pairs=mp_pairs,
        run_linear=run_linear,
    )
    self.corr_stats_axb = axb_stats(self.expl_df, self.stats_df,
                                    **stats_kwargs)

    # Cache the freshly generated stats on S3 for later use
    try:
        corr_stats_loc = self.get_s3_corr_stats_path()
        logger.info(f"Uploading corr stats to S3 at "
                    f"{corr_stats_loc}")
        upload_target = S3Path.from_string(corr_stats_loc)
        upload_target.put(s3=s3, body=self.corr_stats_axb.json())
        logger.info("Finished uploading corr stats to S3")
    except ValueError:
        logger.warning("Unable to upload corr stats to S3")

    return self.corr_stats_axb
def main(indra_net: str,
         z_score: str,
         outname: str,
         graph_type: str,
         sd_range: Tuple[float, Union[None, float]],
         random: bool = False,
         raw_data: Optional[List[str]] = None,
         raw_corr: Optional[List[str]] = None,
         expl_funcs: Optional[List[str]] = None,
         pb_node_mapping: Optional[Dict[str, Set]] = None,
         n_chunks: Optional[int] = 256,
         is_a_part_of: Optional[List[str]] = None,
         immediate_only: Optional[bool] = False,
         return_unexplained: Optional[bool] = False,
         reactome_path: Optional[str] = None,
         subset_list: Optional[List[Union[str, int]]] = None,
         apriori_explained: Optional[Union[bool, str, Dict[str, str]]] = False,
         allowed_ns: Optional[List[str]] = None,
         allowed_sources: Optional[List[str]] = None,
         info: Optional[Dict[Hashable, Any]] = None,
         indra_date: Optional[str] = None,
         depmap_date: Optional[str] = None,
         sample_size: Optional[int] = None,
         shuffle: Optional[bool] = False,
         overwrite: Optional[bool] = False,
         normalize_names: Optional[bool] = False,
         argparse_dict: Optional[Dict[str, Union[str, float, int,
                                                 List[str]]]] = None):
    """Set up correlation matching of depmap data with an indranet graph

    The resulting explainer object is pickled to `outname`, either on S3 or
    locally, and its ``summarize`` method is called. Nothing is returned.

    Parameters
    ----------
    indra_net : str
        The location of the graph representation of the indra network,
        loaded with ``file_opener``. The loaded object must be an
        nx.DiGraph (or a subclass of it). Each edge should have an
        attribute named 'statements' containing a list of sources
        supporting that edge. If signed search, indranet is expected to be
        an nx.MultiDiGraph with edges keyed by (gene, gene, sign) tuples.
    outname : str
        A file path (can be an S3 url) to where to store the final pickle
        file containing the DepmapExplainer. '.pkl' is appended if missing.
    graph_type : str
        The graph type of the graph used for the explanations. Can be one
        of 'unsigned', 'signed', 'pybel'.
    sd_range : Tuple[float, Union[float, None]]
        A tuple of the lower and optionally the upper bound of the z-score
        range to use when getting correlations
    random : bool
        Whether to do a random sampling or not. If True do a random sample
        instead of cutting the correlations of to the given SD range.
    z_score : str
        The path to the correlation DataFrame. If the path does not point
        to an existing file, the matrix is created with ``run_corr_merge``
        from the raw data and/or the raw correlations, and this filepath
        is used to save the resulting DataFrame instead.
    raw_data : Optional[List[str]]
        File paths to CRISPR raw data and RNAi raw data (in that order)
        from the DepMap Portal
    raw_corr : Optional[List[str]]
        File paths to raw correlation data (before z-score conversion)
        containing hdf compressed correlation data, CRISPR first and RNAi
        second. These files contain the result of running `raw_df.corr()`.
    expl_funcs : Optional[List[str]]
        Provide a list of explanation functions to apply. Default: All
        functions are applied. Currently available functions:
        - 'expl_ab': Explain pair by checking for an edge between a and b
        - 'expl_ba': Explain pair by checking for an edge between b and a
        - 'expl_axb': Explain pair by looking for intermediate nodes
          connecting a to b
        - 'expl_bxa': Explain pair by looking for intermediate nodes
          connecting b to a
        - 'get_sr': Explain pair by finding common upstream nodes
        - 'get_st': Explain pair by finding common downstream nodes
        - 'get_sd': Explain pair by finding common downstream nodes two
          edges from s and o
        - 'find_cp': Explain pair by looking for ontological parents
        - 'apriori_explained': Map entities to a-priori explanations
        - 'common_reactome_paths': Explain pair by matching common reactome
          pathways
    pb_node_mapping : Optional[Union[Dict, str]]
        If graph type is "pybel", use this argument to provide a mapping
        from HGNC symbols to pybel nodes in the pybel model, either as a
        dict or as the path to a file holding the mapping
    n_chunks : Optional[int]
        How many chunks to split the data into in the multiprocessing part
        of the script
    is_a_part_of : Optional[Iterable]
        A set of identifiers to look for when applying the common parent
        explanation between a pair of correlating nodes.
    immediate_only : Optional[bool]
        Only look for immediate parents. This option might limit the number
        of results that are returned. Default: False.
    return_unexplained : Optional[bool]
        If True: return explanation data even if there is no set
        intersection of nodes up- or downstream of A, B for shared
        regulators and shared targets. Default: False.
    reactome_path : Optional[str]
        File path to reactome data.
    subset_list : Optional[List[Union[str, int]]]
        Provide a list if entities that defines a subset of the entities in
        the correlation data frame that will be picked as 'a' when the
        pairs (a, b) are generated
    apriori_explained : Optional[Union[bool, str, Dict[str, str]]]
        A mapping from entity names to a string containing a short
        explanation of why the entity is explained. Can be given as:
        - True, or a string containing the name of the default MitoCarta
          file: the default MitoCarta 3.0 file is loaded
        - a path to a CSV/TSV file with the columns "name" and
          "description"
        - an already loaded mapping (dict), which is used as is
        To use the default MitoCarta 3.0 file, run the following code:
        >>> from depmap_analysis.scripts.depmap_script2 import mito_file
        >>> from depmap_analysis.preprocessing import get_mitocarta_info
        >>> apriori_mapping = get_mitocarta_info(mito_file)
        then pass `apriori_mapping` as `apriori_explained` when calling
        this function:
        >>> main(apriori_explained=apriori_mapping, ...)
    allowed_ns : Optional[List[str]]
        A list of allowed name spaces for explanations involving
        intermediary nodes. Default: Any namespace.
    allowed_sources : Optional[List[str]]
        The allowed sources for edges. This will not affect subsequent
        edges in explanations involving 2 or more edges. Default: all
        sources are allowed.
    info : Optional[Dict[Hashable, Any]]
        An optional dict in which to save meta data about this run
    indra_date : Optional[str]
        The date of the sif dump used to create the graph
    depmap_date : Optional[str]
        The date (usually a quarter e.g. 19Q4) the depmap data was
        published on depmap.org
    sample_size : Optional[int]
        Number of correlation pairs to approximately get out of the
        correlation matrix after down sampling it
    shuffle : Optional[bool]
        If True, shuffle the correlation matrix. This is good to do in
        case the input data have some sort of structure that could lead to
        large discrepancies between compute times for the different
        processes. Default: False.
    overwrite : Optional[bool]
        If True, overwrite any output files. Default: False.
    normalize_names : Optional[bool]
        If True, try to normalize the names in the correlation matrix that
        are not found in the provided graph. Default: False.
    argparse_dict : Optional[Dict[str, Union[str, float, int, List[str]]]]
        Provide the argparse options from running this file as a script

    Raises
    ------
    ValueError
        If neither an SD range nor random sampling is requested, if the
        pybel node mapping is missing or can't be loaded for graph type
        'pybel', or if the a-priori explained file can't be parsed
    FileExistsError
        If `outname` already exists and `overwrite` is False
    """
    global indranet, hgnc_node_mapping, output_list
    indranet = file_opener(indra_net)
    assert isinstance(indranet, nx.DiGraph)

    assert expl_funcs is None or isinstance(expl_funcs, (list, tuple, set))

    # 1 Check options
    sd_l, sd_u = sd_range if sd_range and len(sd_range) == 2 else \
        ((sd_range[0], None) if sd_range and len(sd_range) == 1
         else (None, None))

    if not random and not sd_l and not sd_u:
        raise ValueError('Must specify at least a lower bound for the SD '
                         'range or flag run for random explanation')

    if graph_type == 'pybel' and not pb_node_mapping:
        raise ValueError('Must provide PyBEL node mapping with option '
                         'pb_node_mapping if graph type is "pybel"')

    # Remember the file location (if any) now: the argument is replaced by
    # the loaded mapping below and the path would otherwise be lost by the
    # time the script settings are assembled
    apriori_source = apriori_explained \
        if isinstance(apriori_explained, str) else 'no info'

    if apriori_explained:
        # An already loaded mapping is used as is; anything else is loaded
        if not isinstance(apriori_explained, dict):
            if apriori_explained is True or \
                    mito_file_name in apriori_explained:
                # Run default
                apriori_explained = get_mitocarta_info(mito_file)
            else:
                # Hope it's a csv/tsv
                try:
                    expl_df = pd.read_csv(apriori_explained)
                    apriori_explained = {
                        e: d for e, d in zip(expl_df.name,
                                             expl_df.description)
                    }
                except Exception as err:
                    raise ValueError('A-priori explained entities must be in a '
                                     'file that can be parsed as CSV/TSV with '
                                     'column names "name" for entity name and '
                                     '"description" for explanation why the '
                                     'entity is explained.') \
                        from err

        logger.info(f'Using explained set with '
                    f'{len(apriori_explained)} entities')

    outname = outname if outname.endswith('.pkl') else \
        outname + '.pkl'
    if not overwrite:
        if outname.startswith('s3://'):
            s3 = get_s3_client(unsigned=False)
            if S3Path.from_string(outname).exists(s3):
                raise FileExistsError(f'File {str(outname)} already exists!')
        elif Path(outname).is_file():
            raise FileExistsError(f'File {str(outname)} already exists!')

    if z_score is not None and Path(z_score).is_file():
        z_corr = pd.read_hdf(z_score)
    else:
        # Either list may be left out: run_corr_merge only needs one of
        # raw data/raw correlation per data set and raises a ValueError
        # itself if both are missing
        crispr_raw, rnai_raw = \
            (raw_data[0], raw_data[1]) if raw_data else (None, None)
        crispr_corr, rnai_corr = \
            (raw_corr[0], raw_corr[1]) if raw_corr else (None, None)
        z_sc_options = {
            'crispr_raw': crispr_raw,
            'rnai_raw': rnai_raw,
            'crispr_corr': crispr_corr,
            'rnai_corr': rnai_corr,
            'z_corr_path': z_score
        }
        z_corr = run_corr_merge(**z_sc_options)

    if reactome_path:
        up2path, _, pathid2pathname = file_opener(reactome_path)
        reactome_dict = {
            'uniprot_mapping': up2path,
            'pathid_name_mapping': pathid2pathname
        }
    else:
        reactome_dict = None

    # Get mapping of correlation names to pybel nodes
    if graph_type == 'pybel':
        if isinstance(pb_node_mapping, dict):
            hgnc_node_mapping = pb_node_mapping
        elif isinstance(pb_node_mapping, str) and \
                Path(pb_node_mapping).is_file():
            hgnc_node_mapping = file_opener(pb_node_mapping)
        else:
            raise ValueError('Could not load pybel node mapping')

    # 2. Filter to SD range OR run random sampling
    if random:
        logger.info('Doing random sampling through df.sample')
        z_corr = z_corr.sample(142, axis=0)
        # Keep the matrix square: only keep the columns of the sampled rows
        z_corr = z_corr.filter(list(z_corr.index), axis=1)
        # Remove correlation values to not confuse with real data
        z_corr.loc[:, :] = 0
    else:
        if sd_l and sd_u:
            logger.info(f'Filtering correlations to {sd_l} - {sd_u} SD')
            z_corr = z_corr[((z_corr > sd_l) & (z_corr < sd_u)) |
                            ((z_corr < -sd_l) & (z_corr > -sd_u))]
        elif isinstance(sd_l, (int, float)) and sd_l and not sd_u:
            logger.info(f'Filtering correlations to {sd_l}+ SD')
            z_corr = z_corr[(z_corr > sd_l) | (z_corr < -sd_l)]

    sd_range = (sd_l, sd_u) if sd_u else (sd_l, None)

    # Pick a sample
    if sample_size is not None and not random:
        logger.info(f'Reducing correlation matrix to a random approximately '
                    f'{sample_size} correlation pairs.')
        z_corr = down_sample_df(z_corr, sample_size)

    # Shuffle corr matrix without removing items
    elif shuffle and not random:
        logger.info('Shuffling correlation matrix...')
        z_corr = z_corr.sample(frac=1, axis=0)
        z_corr = z_corr.filter(list(z_corr.index), axis=1)

    if normalize_names:
        logger.info('Normalizing correlation matrix column names')
        z_corr = normalize_corr_names(z_corr, indranet)
    else:
        logger.info('Leaving correlation matrix column names as is')

    # 4. Add meta data
    info_dict = {}
    if info:
        info_dict['info'] = info

    # Set the script_settings
    script_settings = {
        'raw_data': raw_data,
        'raw_corr': raw_corr,
        'z_score': z_score,
        'random': random,
        'indranet': indra_net,
        'shuffle': shuffle,
        'sample_size': sample_size,
        'n_chunks': n_chunks,
        'outname': outname,
        'apriori_explained': apriori_source,
        'graph_type': graph_type,
        'pybel_node_mapping': pb_node_mapping
        if isinstance(pb_node_mapping, str) else 'no info',
        'argparse_info': argparse_dict
    }

    # Create output list in global scope
    output_list = []
    explanations = match_correlations(corr_z=z_corr,
                                      sd_range=sd_range,
                                      script_settings=script_settings,
                                      graph_filepath=indra_net,
                                      z_corr_filepath=z_score,
                                      apriori_explained=apriori_explained,
                                      graph_type=graph_type,
                                      allowed_ns=allowed_ns,
                                      allowed_sources=allowed_sources,
                                      is_a_part_of=is_a_part_of,
                                      expl_funcs=expl_funcs,
                                      reactome_filepath=reactome_path,
                                      indra_date=indra_date,
                                      info=info_dict,
                                      depmap_date=depmap_date,
                                      n_chunks=n_chunks,
                                      immediate_only=immediate_only,
                                      return_unexplained=return_unexplained,
                                      reactome_dict=reactome_dict,
                                      subset_list=subset_list)

    if outname.startswith('s3://'):
        try:
            logger.info(f'Uploading results to s3: {outname}')
            s3 = get_s3_client(unsigned=False)
            s3outpath = S3Path.from_string(outname)
            explanations.s3_location = s3outpath.to_string()
            s3outpath.upload(s3=s3, body=pickle.dumps(explanations))
            logger.info('Finished uploading results to s3')
        except Exception:
            # Best effort: don't lose the results just because the upload
            # failed, store them under the matching local path instead
            new_path = Path(outname.replace('s3://', ''))
            logger.warning(f'Something went wrong in s3 upload, trying to '
                           f'save locally instead to {new_path}',
                           exc_info=True)
            new_path.parent.mkdir(parents=True, exist_ok=True)
            dump_it_to_pickle(fname=new_path.absolute().resolve().as_posix(),
                              pyobj=explanations, overwrite=overwrite)
    else:
        # mkdir in case it doesn't exist
        outpath = Path(outname)
        logger.info(f'Dumping results to {outpath}')
        outpath.parent.mkdir(parents=True, exist_ok=True)
        dump_it_to_pickle(fname=outpath.absolute().resolve().as_posix(),
                          pyobj=explanations, overwrite=overwrite)

    logger.info('Script finished')
    explanations.summarize()
def run_corr_merge(crispr_raw: Optional[Union[str, pd.DataFrame]] = None,
                   rnai_raw: Optional[Union[str, pd.DataFrame]] = None,
                   crispr_corr: Optional[Union[str, pd.DataFrame]] = None,
                   rnai_corr: Optional[Union[str, pd.DataFrame]] = None,
                   output_dir: str = 'correlation_output',
                   remove_self_corr: bool = False,
                   random_sampl: int = 0,
                   save_corr_files: bool = False,
                   z_corr_path: Optional[str] = None):
    """Return a merged correlation matrix from DepMap data

    Start with with either the raw DepMap files or pre-calculated
    correlation matrices. For each of the two data sets, the
    pre-calculated correlation matrix is used if it is provided, otherwise
    the correlation matrix is calculated from the raw data.

    Parameters
    ----------
    crispr_raw : str|pd.DataFrame
        Path to the raw crispr data, or the already loaded data. This file
        is typically named 'Achilles_gene_effect.csv' at the DepMap portal.
    rnai_raw : str|pd.DataFrame
        Path to the raw RNAi data, or the already loaded data. This file
        is typically named 'D2_combined_gene_dep_scores.csv'
    crispr_corr : str|pd.DataFrame
        Path to the pre-calculated crispr data matrix (read with
        `pd.read_hdf`), or the already loaded matrix. This data structure
        is the result from running `crispr_raw_df.corr()`.
    rnai_corr : str|pd.DataFrame
        Path to the pre-calculated rnai data matrix (read with
        `pd.read_hdf`), or the already loaded matrix. This data structure
        is the result from running `rnai_raw_df.corr()`.
    output_dir : str
        The directory the intermediate correlation matrices are written to
        if `save_corr_files` is True. Default: 'correlation_output'.
    remove_self_corr : bool
        If True, remove self correlations from the resulting DataFrame.
        Default: False
    random_sampl : int
        If specified, provides the size of the final correlation matrix
        where the genes are picked at random from the intersection of
        genes from both the RNAI and CRISPR data sets.
    save_corr_files : bool
        If True, save the intermediate correlation data frames for both
        crispr and rnai. Only applies to matrices calculated here from raw
        data. Default: False.
    z_corr_path : Optional[str]
        If provided, save the final correlation dataframe here

    Returns
    -------
    pd.DataFrame
        A data frame containing the combined z-score matrix with NaN's
        removed.

    Raises
    ------
    ValueError
        If neither raw data nor a correlation matrix is provided for
        either of the two data sets
    AssertionError
        If the merged matrix has no non-NaN values
    """
    def _provided(source) -> bool:
        # A DataFrame has no unambiguous truth value (bool(df) raises a
        # ValueError), so data frames are recognized by type instead
        return isinstance(source, pd.DataFrame) or bool(source)

    if not _provided(crispr_raw) and not _provided(crispr_corr):
        raise ValueError('Need to provide one of crispr_raw or crispr_corr')
    if not _provided(rnai_raw) and not _provided(rnai_corr):
        raise ValueError('Need to provide one of rnai_raw or rnai_corr')

    # First check for correlation matrix, then get it if it doesn't exist
    if _provided(crispr_corr):
        if isinstance(crispr_corr, str):
            logger.info(f'Reading crispr correlations from file {crispr_corr}')
            crispr_corr_df = pd.read_hdf(crispr_corr)
        else:
            crispr_corr_df = crispr_corr
    else:
        # Create new one from the raw data
        if isinstance(crispr_raw, str):
            logger.info(f'Reading raw DepMap data from {crispr_raw}')
            crispr_raw_df = io.file_opener(crispr_raw, index_col=0)
        else:
            crispr_raw_df = crispr_raw
        crispr_corr_df = raw_depmap_to_corr(crispr_raw_df, split_names=True,
                                            dropna=False)

        if save_corr_files:
            crispr_fpath = Path(output_dir).joinpath(
                '_crispr_all_correlations.h5')
            logger.info(f'Saving crispr correlation matrix to {crispr_fpath}')
            crispr_fpath.parent.mkdir(parents=True, exist_ok=True)
            crispr_corr_df.to_hdf(crispr_fpath.absolute().as_posix(),
                                  key='corr')

    if _provided(rnai_corr):
        if isinstance(rnai_corr, str):
            logger.info(f'Reading rnai correlations from file {rnai_corr}')
            rnai_corr_df = pd.read_hdf(rnai_corr)
        else:
            rnai_corr_df = rnai_corr
    else:
        # Create new one from the raw data
        if isinstance(rnai_raw, str):
            logger.info(f'Reading raw DepMap data from {rnai_raw}')
            rnai_raw_df = io.file_opener(rnai_raw, index_col=0)
        else:
            rnai_raw_df = rnai_raw

        # Check if we need to transpose the df: if no column name (up to
        # the first whitespace) matches a column of the crispr matrix, the
        # names are assumed to be in the index instead
        rnai_names = {n.split()[0] for n in rnai_raw_df.columns}
        if rnai_names.isdisjoint(crispr_corr_df.columns.values):
            logger.info('Transposing RNAi raw data dataframe...')
            rnai_raw_df = rnai_raw_df.T

        rnai_corr_df = raw_depmap_to_corr(rnai_raw_df, split_names=True,
                                          dropna=False)

        if save_corr_files:
            rnai_fpath = Path(output_dir).joinpath('_rnai_all_correlations.h5')
            rnai_fpath.parent.mkdir(parents=True, exist_ok=True)
            logger.info(f'Saving rnai correlation matrix to {rnai_fpath}')
            rnai_corr_df.to_hdf(rnai_fpath.absolute().as_posix(), key='corr')

    # Merge the correlation matrices
    z_cm = merge_corr_df(crispr_corr_df, rnai_corr_df, remove_self_corr)

    if random_sampl and random_sampl < len(z_cm.columns):
        # Get n random rows
        z_cm = z_cm.sample(n=random_sampl)

        # Make square
        z_cm = z_cm[list(z_cm.index.values)]

    assert z_cm.notna().sum().sum() > 0, 'Correlation matrix is empty'

    if z_corr_path:
        zc_path = Path(z_corr_path)
        zc_path.parent.mkdir(parents=True, exist_ok=True)
        # to_hdf requires a key; the same key as for the intermediate
        # matrices is used. With a single object in the file it can still
        # be read back with pd.read_hdf without specifying a key.
        z_cm.to_hdf(zc_path.as_posix(), key='corr')

    return z_cm
type=int, default=1, help="Set the number of chunks to split the data into to run " "multiprocessing. If set to e.g. 4, 4 workers will be started " "to run async with multiprocessing.Pool.apply_async. If set to " "1 (default), no multiprocessing will be used.", ) args = parser.parse_args() # defaults not up to user: # - shuffle the stuff # - explanation functions # Load graph graph = file_opener(args.graph) # Load corr logger.info(f"Loading z-score dataframe {args.z_score}") z_corr = pd.read_hdf(args.z_score) logger.info("Done loading dataframe") # Set kwargs kwargs = dict( indra_net=graph, z_score=z_corr, graph_type=args.graph_type, expl_funcs=[ "apriori_explained", "common_reactome_paths", "find_cp",
def test_depmap_script():
    """Run the correlation matching body on a small graph and check output

    A symmetric correlation matrix is generated over all names in the test
    data frame plus one name that is not in the graph and two genes that
    share a reactome pathway. The stats and explanations produced by
    ``_match_correlation_body`` are then checked for a few known pairs.
    """
    reactome_data = file_opener(
        's3://depmap-analysis/misc_files/reactome_pathways.pkl')
    up2path, _, pathid2pathname = reactome_data
    reactome_dict = {
        'uniprot_mapping': up2path,
        'pathid_name_mapping': pathid2pathname
    }
    df = get_df()
    idg = get_dg()

    # Add the two genes sharing a reactome pathway as nodes to the graph
    up1, up2 = 'A0A075B6P5', 'A5LHX3'
    hgnc_id1 = reverse_uniprot[up1]
    hgnc_name1 = hgnc_names[hgnc_id1]
    hgnc_id2 = reverse_uniprot[up2]
    hgnc_name2 = hgnc_names[hgnc_id2]
    idg.add_node(hgnc_name1, ns='HGNC', id=hgnc_id1)
    idg.add_node(hgnc_name2, ns='HGNC', id=hgnc_id2)

    not_in_graph = 'not_in_graph'

    # Make correlation matrix with all combinations from the df pairs
    df_names = set(df.agA_name.values) | set(df.agB_name.values)
    all_names = sorted(
        list(df_names) + [not_in_graph, hgnc_name1, hgnc_name2]
    )
    corr_m = _gen_sym_df(len(all_names))
    corr_m.columns = all_names
    corr_m.index = all_names

    func_names = ['expl_ab', 'expl_ba', 'expl_axb', 'expl_bxa', 'get_sr',
                  'get_st', react_funcname]
    func_map = {}
    for fname in func_names:
        func_map[funcname_to_colname[fname]] = expl_functions[fname]
    bool_columns = ('not_in_graph', 'explained') + tuple(func_map.keys())
    stats_columns = id_columns + bool_columns
    _type = 'unsigned'

    corr_pairs = corr_matrix_to_generator(corr_m)
    stats_dict, expl_dict = _match_correlation_body(
        corr_iter=corr_pairs,
        expl_types=func_map,
        stats_columns=stats_columns,
        expl_cols=expl_columns,
        bool_columns=bool_columns,
        _type=_type,
        return_unexplained=False,
        reactome_dict=reactome_dict,
        local_indranet=idg,
        apriori_explained=None)

    assert set(stats_columns) == set(stats_dict.keys())
    assert set(expl_columns) == set(expl_dict.keys())

    expl_df = pd.DataFrame(expl_dict)
    stats_df = pd.DataFrame(stats_dict)

    def _bool_record(pair_name):
        # The boolean columns of the first stats row of the given pair
        pair_rows = stats_df[list(bool_columns)][stats_df.pair == pair_name]
        return pair_rows.to_dict(orient='records')[0]

    def _expl_rows(pair_name, expl_type):
        # The explanation rows of the given pair and explanation type
        return expl_df[(expl_df.pair == pair_name) &
                       (expl_df.expl_type == expl_type)]

    # Test content
    # Any pair involving not_in_graph should be flagged as not in graph and
    # have NaN in the 'explained' column
    involves_missing = (stats_df.agA == not_in_graph) | \
        (stats_df.agB == not_in_graph)
    missing_rows = stats_df[involves_missing]
    assert all(missing_rows.not_in_graph), \
        str(list(missing_rows.not_in_graph))
    assert all(np.isnan(b) for b in missing_rows.explained)

    # Pair explained only through an intermediate node b->x->a
    expected = {'not_in_graph': False,
                'explained': True,
                ab_colname: False,
                ba_colname: False,
                axb_colname: False,  # Not True, as pairs go alphabetically
                bxa_colname: True,  # True from testing Y2,Z2
                sr_colname: False,
                st_colname: False,
                react_colname: False}
    p = 'Y2_Z2'
    res = _bool_record(p)
    for k, b in res.items():
        assert b == expected[k]
    assert _expl_rows(p, bxa_colname).expl_data.values[0] == ['X2']
    assert len(_expl_rows(p, sr_colname)) == 0
    assert len(_expl_rows(p, st_colname)) == 0

    # Pair explained through shared regulator and shared target
    expected = {'not_in_graph': False,
                'explained': True,
                ab_colname: False,
                ba_colname: False,
                axb_colname: False,
                bxa_colname: False,
                sr_colname: True,
                st_colname: True,
                react_colname: False}
    p = 'X1_X2'
    res: Dict = _bool_record(p)
    for k, b in res.items():
        assert b == expected[k]
    assert _expl_rows(p, sr_colname).expl_data.values[0][2] == ['Z2']
    assert _expl_rows(p, st_colname).expl_data.values[0][2] == ['Z1']

    # Check that reactome is explained, and not counted as among the explained
    has_reactome = stats_df[react_colname] == True
    len_react = len(stats_df[has_reactome])
    assert len_react == 1, len_react
    not_explained = stats_df.explained == False
    len_react = len(stats_df[has_reactome & not_explained])
    assert len_react == 1, len_react

    # Test getting interesting df
    interesting_df = get_non_reactome_axb_expl_df(graph=idg,
                                                  stats_df=stats_df,
                                                  expl_df=expl_df,
                                                  z_corr=corr_m)
    assert len(interesting_df) == 5
    expected_pairs = {'X1_X2', 'Y1_Z2', 'Y2_Z2', 'Z1_Z2'}
    assert set(interesting_df.pair) == expected_pairs
status='pending', fname=meta_name, location=meta_loc, result_location=result_loc) logger.info(f'Updating {qh} to pending') bgt.add_task(upload_json, job_status) bgt.add_task(handle_query, search_query, job_status) return job_status # Change to 'online' after everything is loaded # asyncio.sleep(5) # Simulate loading something <- not allowed, can't await # outside async function if ROLE == 'UNSIGNED': logger.info('Assuming role as unsigned worker') indra_graph = file_opener(FILES['dir_graph']) if FILES['dir_graph'] else\ None if isinstance(indra_graph, (DiGraph, MultiGraph)): STATUS.graph_stats['unsigned_edges'] = len(indra_graph.edges) STATUS.graph_stats['unsigned_nodes'] = len(indra_graph.nodes) network_search_api = IndraNetwork(indra_dir_graph=indra_graph) network_search_api.verbose = 1 elif ROLE == 'SIGNED': logger.info('Assuming role as signed worker') indra_seg = file_opener( FILES['sign_edge_graph']) if FILES['sign_edge_graph'] else None if isinstance(indra_seg, (DiGraph, MultiGraph)): STATUS.graph_stats['signed_edge_edges'] = len(indra_seg.edges) STATUS.graph_stats['signed_edge_nodes'] = len(indra_seg.nodes) indra_sng = file_opener(FILES['sign_node_graph']) if indra_seg and \ FILES['sign_node_graph'] else None
def sif_dump_df_merger(df: Union[pd.DataFrame, str],
                       graph_type: str,
                       sign_dict: Optional[Dict[str, int]] = None,
                       stmt_types: Optional[List[str]] = None,
                       mesh_id_dict: Optional[Dict[str, str]] = None,
                       set_weights: bool = True,
                       verbosity: int = 0) -> pd.DataFrame:
    """Merge the sif dump df with the provided dictionaries

    The data frame is extended with the columns 'curated' and 'english',
    optionally with 'mesh_ids' (if `mesh_id_dict` is provided) and 'weight'
    (if `set_weights` is True).

    Note that when `df` is a DataFrame it is modified in place where
    possible (column rename, added columns); use the returned data frame as
    the result.

    Parameters
    ----------
    df : Union[pd.DataFrame, str]
        A dataframe, either as a file path to a pickle or csv, or a pandas
        DataFrame object. The columns 'source_counts' and either
        'stmt_hash' or 'hash' are read; 'belief' and 'evidence_count' are
        read as well if `set_weights` is True.
    graph_type : str
        If 'signed-expanded', the data frame is passed through
        `expand_signed` together with `sign_dict` and `stmt_types`. Other
        values do not trigger any extra processing here.
    sign_dict : Optional[Dict[str, int]]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. Required if `graph_type` is 'signed-expanded'.
    stmt_types : Optional[List[str]]
        Provide a list of statement types to be used if expanding the
        signed graph to include statements of these types. Required if
        `graph_type` is 'signed-expanded'.
    mesh_id_dict : Optional[Dict[str, str]]
        A dict object mapping statement hashes to all mesh ids sharing a
        common PMID. The keys are converted to int before being merged on
        the 'stmt_hash' column.
    set_weights : bool
        If True, set the edge weights. Default: True.
    verbosity : int
        Output various extra messages if > 1.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame with new columns from the merge

    Raises
    ------
    ValueError
        If `graph_type` is 'signed-expanded' and either of `sign_dict` or
        `stmt_types` is missing or empty.
    """
    merged_df = file_opener(df) if isinstance(df, str) else df

    # 'stmt_hash' must exist as column in the data frame for the left join
    # with the mesh ids below to work
    if 'hash' in merged_df.columns:
        merged_df.rename(columns={'hash': 'stmt_hash'}, inplace=True)

    if graph_type == 'signed-expanded':
        if not (sign_dict and stmt_types):
            raise ValueError('Must provide statement types using variable '
                             '`stmt_types` to run signed_expanded graph')
        merged_df = expand_signed(merged_df, sign_dict, stmt_types)

    if mesh_id_dict is not None:
        mesh_df = pd.DataFrame(data={
            'stmt_hash': [int(stmt_hash) for stmt_hash in mesh_id_dict],
            'mesh_ids': list(mesh_id_dict.values()),
        })
        # Left join to preserve all rows in merged_df
        merged_df = merged_df.merge(right=mesh_df, how='left', on='stmt_hash')

    # Check for missing hashes
    missing_evidence = merged_df['source_counts'].isna()
    n_missing = missing_evidence.sum()
    if n_missing > 0:
        logger.warning('%d rows with missing evidence found', n_missing)
        if verbosity > 1:
            logger.info('Missing hashes in stratified evidence dict: %s',
                        list(merged_df['stmt_hash'][missing_evidence]))

    logger.info('Setting "curated" flag')
    # Map to boolean 'curated' for reader/non-reader
    merged_df['curated'] = merged_df['source_counts'].apply(_curated_func)

    # Make english statement
    merged_df['english'] = merged_df.apply(_english_from_row, axis=1)

    if set_weights:
        logger.info('Setting edge weights')
        # Add weight: derived from belief if there is one, otherwise
        # 1/evidence count
        has_belief = merged_df['belief'].notna()
        has_no_belief = ~has_belief
        # Initialize as float so the float weights below don't need to
        # change the dtype of the column
        merged_df['weight'] = 0.0
        if has_belief.any():
            # Only run the mapping on the rows that receive the result
            merged_df.loc[has_belief, 'weight'] = \
                merged_df.loc[has_belief, 'belief'].apply(_weight_from_belief)
        if has_no_belief.any():
            # np.longdouble is the type np.longfloat used to alias; the
            # alias itself was removed in NumPy 2.0
            merged_df.loc[has_no_belief, 'weight'] = \
                merged_df.loc[has_no_belief, 'evidence_count'].apply(
                    lambda ec: 1 / np.longdouble(ec))
    else:
        logger.info('Skipping setting belief weight')

    return merged_df
# 'or provide node mapping with option ' # 'if graph type is pybel') # # Only model provided: create mapping # if arg_dict.get('pybel_model') and \ # not arg_dict.get('pybel_node_mapping'): # mapping = pybel_node_name_mapping( # node_names=hgnc_names, node_ns='HGNC', # pb_model=file_opener(arg_dict['pybel_model']) # ) # arg_dict['pb_node_mapping'] = mapping # # Mapping is provided: load the mapping # elif arg_dict.get('pybel_node_mapping'): # if arg_dict['pybel_node_mapping'].endswith('.pkl'): # arg_dict['pb_node_mapping'] = \ # file_opener(arg_dict['pybel_node_mapping']) # elif arg_dict['pybel_node_mapping'].endswith('.json'): # arg_dict['pb_node_mapping'] = \ # file_opener(arg_dict['pybel_node_mapping']) # else: # raise ValueError('Unknown file type %s' % # arg_dict['pybel_node_mapping'].split('.')[-1]) if args.subset_list: df: pd.DataFrame = file_opener(args.subset_list) arg_dict['subset_list'] = list(df.name.values) main_keys = inspect.signature(main).parameters.keys() kwargs = {k: v for k, v in arg_dict.items() if k in main_keys} main(**kwargs)
def sif_dump_df_to_digraph(df: Union[pd.DataFrame, str],
                           date: str,
                           mesh_id_dict: Optional[Dict] = None,
                           graph_type: GraphTypes = 'digraph',
                           include_entity_hierarchies: bool = True,
                           sign_dict: Optional[Dict[str, int]] = None,
                           stmt_types: Optional[List[str]] = None,
                           z_sc_path: Optional[Union[str, pd.DataFrame]] = None,
                           verbosity: int = 0) \
        -> Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]:
    """Return a NetworkX digraph from a pandas dataframe of a db dump

    Parameters
    ----------
    df : Union[str, pd.DataFrame]
        A dataframe, either as a file path to a file (.pkl or .csv) or a
        pandas DataFrame object.
    date : str
        A date string specifying when the data was dumped from the
        database. If empty, today's date (formatted as YYYY-MM-DD) is used.
    mesh_id_dict : dict
        A dict object mapping statement hashes to all mesh ids sharing a
        common PMID
    graph_type : str
        Return type for the returned graph (case insensitive). Currently
        supports:
            - 'digraph': DiGraph (Default)
            - 'multidigraph': MultiDiGraph
            - 'signed': Tuple[MultiDiGraph, DiGraph]
            - 'signed-expanded': Tuple[MultiDiGraph, DiGraph]
            - 'digraph-signed-types': DiGraph
    include_entity_hierarchies : bool
        If True, add edges between nodes if they are related ontologically
        with stmt type 'fplx': e.g. BRCA1 is in the BRCA family, so an edge
        is added between the nodes BRCA and BRCA1. Default: True. Note that
        this option only is available for the options directed/unsigned
        graph and multidigraph.
    sign_dict : Dict[str, int]
        A dictionary mapping a Statement type to a sign to be used for the
        edge. If not provided (or empty), `default_sign_dict` is used.
    stmt_types : List[str]
        A list of statement types to expand out to other signs
    z_sc_path:
        If provided, must be or be path to a square dataframe with HGNC
        symbols as names on the axes and floats as entries. File paths
        must end with 'h5' or 'pkl'.
    verbosity: int
        Output various messages if > 0. For all messages, set to 4.

    Returns
    -------
    Union[DiGraph, MultiDiGraph, Tuple[MultiDiGraph, DiGraph]]
        The type is determined by the graph_type argument. For the signed
        graph types the tuple is (signed edge graph, signed node graph).

    Raises
    ------
    ValueError
        If `graph_type` is not supported, if `z_sc_path` is neither a
        recognized file path nor a data frame, or if z-scores are provided
        and entity hierarchies are requested but no edge with a z-score
        of 0 exists to take the non-correlation weight from.
    """
    graph_options = ('digraph', 'multidigraph', 'signed', 'signed-expanded',
                     'digraph-signed-types')
    if graph_type.lower() not in graph_options:
        raise ValueError(f'Graph type {graph_type} not supported. Can only '
                         f'chose between {graph_options}')
    graph_type = graph_type.lower()
    sign_dict = sign_dict if sign_dict else default_sign_dict
    date = date if date else datetime.now().strftime('%Y-%m-%d')

    sif_df = file_opener(df) if isinstance(df, str) else df

    # Resolve the z-score data frame, if any
    if z_sc_path is None:
        z_sc_df = None
    elif isinstance(z_sc_path, pd.DataFrame):
        z_sc_df = z_sc_path
    elif isinstance(z_sc_path, str):
        if z_sc_path.endswith('h5'):
            logger.info(f'Loading z-scores from {z_sc_path}')
            z_sc_df = pd.read_hdf(z_sc_path)
        elif z_sc_path.endswith('pkl'):
            logger.info(f'Loading z-scores from {z_sc_path}')
            z_sc_df = file_opener(z_sc_path)
        else:
            raise ValueError(f'Unrecognized file: {z_sc_path}')
    else:
        raise ValueError('Only file paths and data frames allowed as '
                         'arguments to z_sc_path')

    # If signed types: filter out rows that are of unsigned types
    if graph_type == 'digraph-signed-types':
        sif_df = sif_df[sif_df.stmt_type.isin(sign_dict.keys())]

    sif_df = sif_dump_df_merger(sif_df, graph_type, sign_dict, stmt_types,
                                mesh_id_dict, verbosity=verbosity)

    # Map ns:id to node name
    logger.info('Creating dictionary mapping (ns,id) to node name')
    ns_id_name_tups = set(
        zip(sif_df.agA_ns, sif_df.agA_id, sif_df.agA_name)).union(
        set(zip(sif_df.agB_ns, sif_df.agB_id, sif_df.agB_name)))
    ns_id_to_nodename = {(ns, _id): name for ns, _id, name in ns_id_name_tups}

    # Map hashes to edge for non-signed graphs
    if graph_type in {'multidigraph', 'digraph', 'digraph-signed-types'}:
        logger.info('Creating dictionary mapping hashes to edges for '
                    'unsigned graph')
        hash_edge_dict = {
            h: (a, b) for a, b, h in zip(sif_df.agA_name,
                                         sif_df.agB_name,
                                         sif_df.stmt_hash)
        }

    # Create graph from df
    if graph_type == 'multidigraph':
        indranet_graph = IndraNet.from_df(sif_df)
    elif graph_type in ('digraph', 'digraph-signed-types'):
        # Flatten
        indranet_graph = IndraNet.digraph_from_df(sif_df,
                                                  'complementary_belief',
                                                  _weight_mapping)
    elif graph_type in ('signed', 'signed-expanded'):
        signed_edge_graph: MultiDiGraph = IndraNet.signed_from_df(
            df=sif_df, flattening_method='complementary_belief',
            weight_mapping=_weight_mapping)
        signed_node_graph: DiGraph = signed_edges_to_signed_nodes(
            graph=signed_edge_graph, copy_edge_data=True)
        signed_edge_graph.graph['date'] = date
        signed_node_graph.graph['date'] = date
        signed_edge_graph.graph['node_by_ns_id'] = ns_id_to_nodename
        signed_node_graph.graph['node_by_ns_id'] = ns_id_to_nodename

        # Get hash to signed edge mapping. For 'signed' each hash maps to
        # one edge, for 'signed-expanded' each hash maps to a set of edges
        logger.info('Creating dictionary mapping hashes to edges for '
                    'signed graph')
        collect_all = graph_type != 'signed'
        signed_edge_graph.graph['edge_by_hash'] = _edges_by_hash(
            signed_edge_graph, collect_all)
        signed_node_graph.graph['edge_by_hash'] = _edges_by_hash(
            signed_node_graph, collect_all)

        if z_sc_df is not None:
            # Set z-score attributes
            add_corr_to_edges(graph=signed_edge_graph, z_corr=z_sc_df)
            add_corr_to_edges(graph=signed_node_graph, z_corr=z_sc_df)

        # The signed graphs are done here: entity hierarchies are not
        # applicable for them
        return signed_edge_graph, signed_node_graph
    else:
        # Guard in case graph_options and the branches above get out of sync
        raise ValueError(f'Unrecognized graph type {graph_type}. Must be one '
                         f'of: {", ".join(graph_options)}')

    if z_sc_df is not None:
        # Set z-score attributes
        add_corr_to_edges(graph=indranet_graph, z_corr=z_sc_df)

    # Add hierarchy relations to graph (not applicable for signed graphs)
    if include_entity_hierarchies and graph_type in ('multidigraph',
                                                     'digraph'):
        from depmap_analysis.network_functions.famplex_functions import \
            get_all_entities
        logger.info('Fetching entity hierarchy relationships')
        full_entity_list = get_all_entities()
        logger.info('Adding entity hierarchy manager as graph attribute')
        node_by_uri = {uri: _id for (ns, _id, uri) in full_entity_list}
        added_pairs = set()  # Save (A, B, URI)
        logger.info('Building entity relations to be added to data frame')
        entities = 0
        if z_sc_df is not None:
            # Get non-corr weight: the corr_weight of any edge whose
            # z-score is 0
            non_corr_weight = None
            for edge in indranet_graph.edges:
                if indranet_graph.edges[edge]['z_score'] == 0:
                    non_corr_weight = indranet_graph.edges[edge]['corr_weight']
                    break
            # Explicit check instead of an assert, which is skipped when
            # running python with -O
            if non_corr_weight is None:
                raise ValueError('Could not find an edge with z-score 0 to '
                                 'get the non-correlation weight from')
            z_sc_attrs = {'z_score': 0, 'corr_weight': non_corr_weight}
        else:
            z_sc_attrs = {}

        for ns, _id, uri in full_entity_list:
            node = _id
            # Get name in case it's different than id
            if ns_id_to_nodename.get((ns, _id), None):
                node = ns_id_to_nodename[(ns, _id)]
            else:
                ns_id_to_nodename[(ns, _id)] = node

            # Add famplex edge
            for pns, pid in bio_ontology.get_parents(ns, _id):
                puri = get_identifiers_url(pns, pid)
                pnode = pid
                if ns_id_to_nodename.get((pns, pid), None):
                    pnode = ns_id_to_nodename[(pns, pid)]
                else:
                    ns_id_to_nodename[(pns, pid)] = pnode

                # Skip if the edge was already added
                if (node, pnode, puri) in added_pairs:
                    continue
                entities += 1
                added_pairs.add((node, pnode, puri))  # A, B, uri of B

                # Add non-existing nodes
                if node not in indranet_graph.nodes:
                    indranet_graph.add_node(node, ns=ns, id=_id)
                if pnode not in indranet_graph.nodes:
                    indranet_graph.add_node(pnode, ns=pns, id=pid)

                # Statement data of the hierarchy edge; the uri of the
                # parent is used as statement hash
                ed = {'stmt_type': 'fplx',
                      'evidence_count': 1,
                      'source_counts': {'fplx': 1},
                      'stmt_hash': puri,
                      'belief': 1.0,
                      'weight': MIN_WEIGHT,
                      'curated': True,
                      'english': f'{pns}:{pid} is an ontological parent '
                                 f'of {ns}:{_id}',
                      'z_score': 0,
                      'corr_weight': 1}

                # Add edges
                if indranet_graph.is_multigraph():
                    # MultiDiGraph: the statement data, including the node
                    # names, is stored directly on the edge
                    indranet_graph.add_edge(node, pnode, agA_name=node,
                                            agB_name=pnode, **ed)
                elif indranet_graph.has_edge(node, pnode):
                    # DiGraph, existing edge: add to its statements
                    indranet_graph.edges[(node, pnode)]['statements'].append(
                        ed)
                else:
                    # DiGraph, new edge
                    indranet_graph.add_edge(node, pnode, belief=1.0,
                                            weight=1.0, statements=[ed],
                                            **z_sc_attrs)

        logger.info('Loaded %d entity relations into dataframe' % entities)
        indranet_graph.graph['node_by_uri'] = node_by_uri
    indranet_graph.graph['node_by_ns_id'] = ns_id_to_nodename
    indranet_graph.graph['edge_by_hash'] = hash_edge_dict
    indranet_graph.graph['date'] = date
    return indranet_graph


def _edges_by_hash(graph, collect_all: bool):
    """Map the statement hashes on the edges of a graph to the edges

    Parameters
    ----------
    graph :
        A graph where every edge has the attribute 'statements', an
        iterable of dicts that each have the key 'stmt_hash'.
    collect_all : bool
        If True, return a defaultdict mapping each hash to the set of all
        edges it was found on. If False, return a dict mapping each hash
        to the last edge it was found on.

    Returns
    -------
    Union[Dict, defaultdict]
        The mapping from statement hash to edge(s)
    """
    hash_edge_dict = defaultdict(set) if collect_all else {}
    for edge in graph.edges:
        for stmt_data in graph.edges[edge]['statements']:
            if collect_all:
                hash_edge_dict[stmt_data['stmt_hash']].add(edge)
            else:
                hash_edge_dict[stmt_data['stmt_hash']] = edge
    return hash_edge_dict
# Get average Jaccard index per drug jaccard_ranking = [] for name, jvs in jaccard_index.items(): li, lu, ljr = list(zip(*jvs)) jaccard_ranking.append( (name, sum(ljr) / len(ljr), sum(li) / len(li), sum(lu) / len(lu))) jaccard_ranking.sort(key=lambda t: t[1], reverse=True) df = pd.DataFrame( data=jaccard_ranking, columns=['drug', 'jaccard_index', 'n_intersection', 'n_union']) return global_ranking, df if __name__ == '__main__': drug_file = sys.argv[1] try: sample_size = sys.argv[2] except IndexError: sample_size = None drug_expl = file_opener(drug_file) assert isinstance(drug_expl, DepMapExplainer) overall_ranking, jaccard_df_per_drug = \ get_rankings_per_drug(drug_expl.expl_df) jaccard_df_per_pair = get_jaccard_rankings_per_pair( drug_expl.expl_df, drug_expl.stats_df) logger.info('Done with script, results are in variables ' '`overall_ranking`, `jaccard_df_per_drug` and ' '`jaccard_df_per_pair`')