def get_reactome_graph(reactome_manager, reactome_dir, file):
    """Load a Reactome BEL pathway pickle and merge in its child pathways.

    :param reactome_manager: Bio2BEL Reactome manager used to look up pathways
    :param str reactome_dir: directory containing the Reactome pickles
    :param str file: pickle file name, e.g. ``R-HSA-123456.pickle``
    :return: the pathway BELGraph with all existing child pathway graphs merged in
    """
    # Load BELGraph
    pathway_graph = from_pickle(os.path.join(reactome_dir, file))

    # FIX: str.strip('.pickle') removes any of the characters '.pickle' from both
    # ends, so pathway ids ending in e.g. 'e' or 'l' were mangled. splitext removes
    # exactly the extension.
    pathway_id = os.path.splitext(file)[0]

    # Look up in Bio2BEL Reactome; log if it is not present
    pathway = reactome_manager.get_pathway_by_id(pathway_id)
    if not pathway:
        logger.warning(f'{pathway_id} not found in database')

    # Check if there are children and merge them on the fly
    for child in yield_all_children(pathway):
        child_file_path = os.path.join(reactome_dir, f"{child.resource_id}.pickle")
        if not os.path.exists(child_file_path):
            logger.warning(f'{child.resource_id} pickle does not exist')
            continue

        # Load the pickle and union it
        child_graph = pybel.from_pickle(child_file_path)
        pathway_graph += child_graph

    # Normalize graph names
    normalize_graph_names(pathway_graph, REACTOME)

    return pathway_graph
def import_from_pickle(manager, folder, files, database):
    """Import folder with pickles into database.

    :param pathme_viewer.manager.Manager manager: PathMe manager
    :param str folder: folder to be imported
    :param iter[str] files: iterator with file names
    :param str database: resource name
    """
    description = 'Loading {} pickles to populate PathMe database'.format(database)

    for pickle_name in tqdm.tqdm(files, desc=description):
        bel_pathway = from_pickle(os.path.join(folder, pickle_name))

        identifier = os.path.splitext(pickle_name)[0]

        # KEGG files have a special format (prefix: unflatten/flatten needs to be removed)
        if database == KEGG:
            identifier = identifier.split('_')[0]

        pathway_dict = _prepare_pathway_model(identifier, database, bel_pathway)
        _ = manager.get_or_create_pathway(pathway_dict)

    log.info('%s has been loaded', database)
def upload_recursive(directory, connection=None, exclude_directory_pattern=None):
    """Recursively uploads all gpickles in a given directory and sub-directories

    :param str directory: the directory to traverse
    :param connection: A connection string or manager
    :type connection: Optional[str or pybel.manage.Manager]
    :param Optional[str] exclude_directory_pattern: Any directory names to exclude
    """
    manager = Manager.ensure(connection)

    gpickle_paths = list(get_paths_recursive(
        directory,
        extension='.gpickle',
        exclude_directory_pattern=exclude_directory_pattern,
    ))
    log.info('Paths to upload: %s', gpickle_paths)

    for gpickle_path in gpickle_paths:
        try:
            network = from_pickle(gpickle_path)
        except (ImportError, ImportVersionWarning):
            # Stale pickles from older PyBEL versions can't be loaded; skip them.
            log.warning('%s uses a pickle from an old version of PyBEL. Quitting.', gpickle_path)
            continue
        to_database(network, connection=manager, store_parts=True)
def get_kegg_genes_from_pickles(resource_folder, files: List[str], manager) -> Dict[str, Set]:
    """Get BEL graph gene set for all KEGG pathways.

    :param str resource_folder: path to resource folder
    :param list files: list of BEL graph pickles
    :param bio2bel Manager manager: Manager
    :return: BEL graph gene sets for each KEGG pathway
    :rtype: dict[str,set]
    """
    suffix = '_flatten.pickle'
    pathway_genes_dict = {}

    for pickle_name in files:
        # Only flattened graphs are considered for gene sets
        if not pickle_name.endswith(suffix):
            continue

        graph = from_pickle(os.path.join(resource_folder, pickle_name))
        genes = get_genes_in_graph(graph)

        # Rebuild the KEGG identifier (e.g. 'path:hsa00010') and resolve the pathway
        kegg_id = 'path:' + pickle_name[:-len(suffix)]
        pathway = manager.get_pathway_by_id(kegg_id)

        pathway_genes_dict[str(pathway)] = genes

    return pathway_genes_dict
def test_example_pickle(self):
    """Test the round-trip through a pickle."""
    buffer = BytesIO()
    to_pickle(sialic_acid_graph, buffer)
    buffer.seek(0)
    reconstituted = from_pickle(buffer)
    self._help_test_equal(reconstituted)
def upload_jgf_directory(directory: str, manager: Manager):
    """Upload CBN data to edge store."""
    if not (os.path.exists(directory) and os.path.isdir(directory)):
        logger.warning('directory does not exist: %s', directory)
        return

    start = time.time()

    for jgf_path in iter_jgf(directory):
        gpickle_path = get_jgf_corresponding_gpickle_path(jgf_path)

        if os.path.exists(gpickle_path):
            # Cached conversion exists; reuse it
            graph = from_pickle(gpickle_path)
            strip_annotations(graph)
        else:
            # Convert the JGIF document and cache the result as a gpickle
            with open(jgf_path) as file:
                cbn_jgif_dict = json.load(file)
            graph = pybel.from_cbn_jgif(cbn_jgif_dict)
            strip_annotations(graph)
            to_pickle(graph, gpickle_path)

        try:
            insert_graph(manager, graph, public=True, use_tqdm=True)
        except OperationalError:
            manager.session.rollback()
            logger.info('could not insert %s', graph)

    logger.info('done in %.2f seconds', time.time() - start)
def get_combined_graph_similarity(
    *,
    fullgraph_path=DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE,
    chemsim_graph_path=DEFAULT_CHEMSIM_PICKLE,
    mapping_file=DEFAULT_MAPPING_PATH,
    new_graph_path=DEFAULT_GRAPH_PATH,
    pickle_graph_path=DEFAULT_FULLGRAPH_PICKLE,
    rebuild: bool = False,
):
    """Combine chemical similarity graph with the fullgraph.

    :param fullgraph_path: a BELGraph, or path to the full-graph pickle (without chemsim)
    :param chemsim_graph_path: a BELGraph, or path to the chemical-similarity pickle
    :param mapping_file: TSV mapping namespace/identifier/name rows to integer node ids
    :param new_graph_path: where the relabeled edge list is written
    :param pickle_graph_path: where the combined BELGraph pickle is written
    :param rebuild: if False and the edge list already exists, load and return it
    :return: the combined, relabeled graph
    """
    # FIX: honor the new_graph_path parameter here; the original hard-coded
    # DEFAULT_GRAPH_PATH, silently ignoring a caller-supplied path.
    if not rebuild and os.path.exists(new_graph_path):
        return nx.read_edgelist(new_graph_path)

    # FIX: isinstance instead of type(...) == ... so BELGraph subclasses are accepted.
    if isinstance(fullgraph_path, pybel.struct.graph.BELGraph):
        fullgraph_without_chemsim = fullgraph_path
    else:
        fullgraph_without_chemsim = pybel.from_pickle(fullgraph_path)

    if isinstance(chemsim_graph_path, pybel.struct.graph.BELGraph):
        chemsim_graph = chemsim_graph_path
    else:
        chemsim_graph = pybel.from_pickle(chemsim_graph_path)

    mapping_df = pd.read_csv(
        mapping_file,
        sep="\t",
        dtype={
            'identifier': str,
            'node_id': str
        },
        index_col=False,
    )

    fullgraph_with_chemsim = fullgraph_without_chemsim + chemsim_graph
    pybel.to_pickle(fullgraph_with_chemsim, pickle_graph_path)

    # Map each DSL node to its integer node id for relabeling.
    relabel_graph = {}
    for _, row in mapping_df.iterrows():
        if row['namespace'] == PUBCHEM_NAMESPACE:
            node = pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE, identifier=row['identifier'])
        elif row['namespace'] == UNIPROT_NAMESPACE:
            node = pybel.dsl.Protein(namespace=UNIPROT_NAMESPACE, identifier=row['identifier'], name=row['name'])
        else:
            node = pybel.dsl.Pathology(namespace='umls', identifier=row['identifier'], name=row['name'])
        relabel_graph[node] = row['node_id']

    nx.relabel_nodes(fullgraph_with_chemsim, relabel_graph, copy=False)
    nx.write_edgelist(fullgraph_with_chemsim, new_graph_path, data=False)
    return fullgraph_with_chemsim
def get_nodes_in_database(folder):
    """Merge all python pickles in a given folder and returns the corresponding BELGraph."""
    nodes = set()
    for entry in os.listdir(folder):
        if not entry.endswith('.pickle'):
            continue
        network = pybel.from_pickle(os.path.join(folder, entry))
        nodes.update(network.nodes())
    return nodes
def get_graph_by_manager(
    module: Union[str, ModuleType, BELManagerMixin, Type[BELManagerMixin]],
    force: bool = False,
    to_bel_kwargs: Optional[Mapping[str, Any]] = None,
) -> BELGraph:
    """Get a graph for a manager."""
    if isinstance(module, str):
        # Check the name-based pickle cache before importing bio2bel_<module>
        cached_path = os.path.join(RESOURCES, f'{module}.bel.pickle')
        if not force and os.path.exists(cached_path):
            logger.info(f'Getting {module} from pickle at {cached_path}')
            return from_pickle(cached_path)
        manager = importlib.import_module(f'bio2bel_{module}').Manager()
    elif isinstance(module, BELManagerMixin):
        manager = module
    elif isinstance(module, ModuleType):
        manager = module.Manager()
    elif isinstance(module, type):
        if not issubclass(module, BELManagerMixin):
            raise TypeError(f'{module} is not a subclass of BELManagerMixin')
        manager = module()
    else:
        raise TypeError(f'{module} has invalid type: {type(module)}')

    # Second cache check, keyed on the manager's canonical module name
    pickle_path = os.path.join(RESOURCES, f'{manager.module_name}.bel.pickle')
    if not force and os.path.exists(pickle_path):
        logger.info(f'Getting {manager.module_name} from pickle at {pickle_path}')
        return from_pickle(pickle_path)

    if not manager.is_populated():
        logger.info(f'Populating manager for {manager.module_name}')
        manager.populate()

    graph = manager.to_bel(**(to_bel_kwargs or {}))
    logger.info(graph.summary_str())
    logger.info(str(count_namespaces(graph)))
    logger.info(str(count_functions(graph)))

    logger.info(f'Writing pickle for {pickle_path}')
    to_pickle(graph, pickle_path)
    return graph
def iter_from_pickles(paths):
    """Iterates over the pickled BEL graphs in a directory

    :param iter[str] paths:
    :rtype: iter[pybel.BELGraph]
    """
    for candidate in paths:
        if candidate.endswith('.gpickle'):
            yield from_pickle(candidate)
        else:
            log.info('not a gpickle: %s', candidate)
def summarize(export_folder):
    """Summarize the KEGG export."""
    click.echo('loading KEGG graphs')
    graphs = []
    for fname in tqdm(get_paths_in_folder(export_folder)):
        graphs.append(from_pickle(os.path.join(export_folder, fname)))

    if not graphs:
        click.echo("Please export KEGG to BEL first. Run 'python3 -m pathme kegg bel' ")
        return

    summarize_helper(graphs)
def summarize(export_folder):
    """Summarize the WikiPathways export."""
    click.echo('loading WikiPathways graphs')
    graphs = []
    for fname in tqdm(get_paths_in_folder(export_folder)):
        graphs.append(from_pickle(os.path.join(export_folder, fname)))

    if not graphs:
        click.echo("Please export WikiPathways to BEL first. Run 'python3 -m pathme wikipathways bel' ")
        return

    summarize_helper(graphs)
def upload(path, connection, recursive, skip_check_version, to_service, service_url, debug):
    """Quick uploader"""
    set_debug_param(debug)

    if recursive:
        log.info('uploading recursively from: %s', path)
        upload_recursive(path, connection=connection)
        return

    graph = from_pickle(path, check_version=(not skip_check_version))
    if to_service:
        receiver_service.post(graph, service_url)
    else:
        to_database(graph, connection=connection)
def _iterate_kegg(kegg_pickle_paths, kegg_path, flatten, normalize_names):
    """Yield (resource, file name, graph) triples for every KEGG pickle."""
    description = f'Loading KEGG pickles from {kegg_path}'
    for pickle_name in tqdm(kegg_pickle_paths, desc=description):
        if not pickle_name.endswith('.pickle'):
            continue
        graph = from_pickle(os.path.join(kegg_path, pickle_name), check_version=False)
        if flatten:
            flatten_complex_nodes(graph)
        if normalize_names:
            normalize_graph_names(graph, KEGG)
        _update_graph(graph, pickle_name, KEGG)
        yield KEGG, pickle_name, graph
def upload_recursive(directory, connection=None, store_parts=False):
    """Recursively uploads all gpickles in a given directory and sub-directories

    :param str directory: the directory to traverse
    :param connection: A connection string or manager
    :type connection: None or str or pybel.manage.CacheManager
    :param bool store_parts: Should the edge store be used?
    """
    manager = build_manager(connection)
    gpickle_paths = list(get_paths_recursive(directory, extension='.gpickle'))
    log.info('Paths to upload: %s', gpickle_paths)
    for gpickle_path in gpickle_paths:
        safe_upload(manager, from_pickle(gpickle_path), store_parts=store_parts)
def get_drugbank_graph(rebuild: bool = False, **kwargs) -> pybel.BELGraph:
    """Get the DrugBank graph."""
    # Serve the cached pickle unless a rebuild was requested
    if os.path.exists(DEFAULT_DRUGBANK_PICKLE) and not rebuild:
        return pybel.from_pickle(DEFAULT_DRUGBANK_PICKLE)

    import bio2bel_drugbank

    manager = bio2bel_drugbank.Manager()
    if not manager.is_populated():
        manager.populate()

    graph = manager.to_bel(**kwargs)
    if os.path.exists(RESOURCES):
        pybel.to_pickle(graph, DEFAULT_DRUGBANK_PICKLE)
    return graph
def get_sider_graph(rebuild: bool = False) -> pybel.BELGraph:
    """Get the SIDER graph."""
    # Serve the cached pickle unless a rebuild was requested
    if os.path.exists(DEFAULT_SIDER_PICKLE) and not rebuild:
        return pybel.from_pickle(DEFAULT_SIDER_PICKLE)

    import bio2bel_sider

    manager = bio2bel_sider.Manager()
    if not manager.is_populated():
        manager.populate()

    graph = manager.to_bel()
    if os.path.exists(RESOURCES):
        pybel.to_pickle(graph, DEFAULT_SIDER_PICKLE)
    return graph
def upload_neurommsig_graphs(manager: Manager):
    """Only upload NeuroMMSig Sample Networks."""
    if not (os.path.exists(alzheimer_directory) and os.path.isdir(alzheimer_directory)):
        logger.warning('directory does not exist: %s', alzheimer_directory)
        return

    if not os.path.exists(neurommsig_directory):
        logger.info('created neurommsig directory: %s', neurommsig_directory)
        os.makedirs(neurommsig_directory)

    bel_path = os.path.join(alzheimer_directory, 'alzheimers.bel')
    gpickle_path = os.path.join(alzheimer_directory, 'alzheimers.gpickle')

    # Prefer the cached gpickle; otherwise compile the BEL script and cache it
    if os.path.exists(gpickle_path):
        graph = from_pickle(gpickle_path)
    elif os.path.exists(bel_path):
        graph = from_bel_script(bel_path, manager=manager)
        to_pickle(graph, gpickle_path)
    else:
        raise RuntimeError('missing NeuroMMSig source file: {}'.format(bel_path))

    # Keep only the sample subgraphs of interest
    subgraphs = {
        name: subgraph
        for name, subgraph in get_subgraphs_by_annotation(graph, annotation='Subgraph').items()
        if name in neurommsig_sample_networks
    }

    networks = []
    for subgraph_name, subgraph in subgraphs.items():
        subgraph.name = 'NeuroMMSig AD {}'.format(subgraph_name)
        subgraph.authors = 'Daniel Domingo-Fernandez et. al'
        subgraph.version = graph.version
        subgraph.license = graph.license

        # output to directory as gpickle
        to_pickle(subgraph, os.path.join(neurommsig_directory, '{}.gpickle'.format(subgraph_name)))

        networks.append(insert_graph(manager, subgraph, public=True, use_tqdm=True))

    write_manifest(neurommsig_directory, networks)
def _iterate_wp(wp_pickle_paths, wikipathways_path, flatten, normalize_names):
    """Yield (resource, file name, graph) triples for every WikiPathways pickle."""
    description = f'Loading WP pickles from {wikipathways_path}'
    for pickle_name in tqdm(wp_pickle_paths, desc=description):
        if not pickle_name.endswith('.pickle'):
            continue
        graph = from_pickle(os.path.join(wikipathways_path, pickle_name), check_version=False)
        if flatten:
            flatten_complex_nodes(graph)
        if normalize_names:
            normalize_graph_names(graph, WIKIPATHWAYS)
        _update_graph(graph, pickle_name, WIKIPATHWAYS)
        yield WIKIPATHWAYS, pickle_name, graph
def _iterate_reactome(reactome_pickle_paths, reactome_path, flatten, normalize_names):
    """Yield (resource, file name, graph) triples for every Reactome pickle."""
    description = f'Loading Reactome pickles from {reactome_path}'
    for pickle_name in tqdm(reactome_pickle_paths, desc=description):
        if not pickle_name.endswith('.pickle'):
            continue
        graph = from_pickle(os.path.join(reactome_path, pickle_name), check_version=False)
        if flatten:
            flatten_complex_nodes(graph)
        if normalize_names:
            normalize_graph_names(graph, REACTOME)
        _update_graph(graph, pickle_name, REACTOME)
        yield REACTOME, pickle_name, graph
def upload(manager, path, skip_check_version, to_service, service_url, exclude_directory_pattern, debug):
    """Upload gpickles"""
    set_debug_param(debug)

    if os.path.isdir(path):
        log.info('uploading recursively from: %s', path)
        upload_recursive(path, connection=manager, exclude_directory_pattern=exclude_directory_pattern)
        return

    if os.path.isfile(path):
        from pybel import from_pickle
        graph = from_pickle(path, check_version=(not skip_check_version))

        if to_service:
            from pybel import to_web
            to_web(graph, service_url)
        else:
            from pybel import to_database
            to_database(graph, connection=manager, store_parts=True)
def get_genes_from_pickles(resource_folder: str, files: List[str], manager) -> Dict[str, set]:
    """Get BEL graph gene set for all pathways in resource.

    :param resource_folder: path to resource folder
    :param list files: list of BEL graph pickles
    :param bio2bel Manager manager: Manager
    :return: BEL graph gene sets for each pathway in resource
    :rtype: dict[str,set]
    """
    pathway_genes_dict = {}

    for pickle_name in files:
        graph = from_pickle(os.path.join(resource_folder, pickle_name))
        genes = get_genes_in_graph(graph)

        # Drop the extension and resolve the identifier to a pathway object
        pathway = manager.get_pathway_by_id(pickle_name[:-len('.pickle')])
        pathway_genes_dict[str(pathway)] = genes

    return pathway_genes_dict
def process_pybel_network(network_type, network_file, **kwargs):
    """Return PybelProcessor by processing a given network file.

    Parameters
    ----------
    network_type : str
        The type of network that network_file is. The options are:
        belscript, json, cbn_jgif, graph_pickle, and graph_jsongz_url.
        Default: graph_jsongz_url
    network_file : str
        Path to the network file/URL to process.

    Returns
    -------
    bp : PybelProcessor
        A PybelProcessor object which contains INDRA Statements in
        bp.statements.
    """
    if network_type == 'belscript':
        return process_belscript(network_file, **kwargs)
    if network_type == 'json':
        return process_json_file(network_file)
    if network_type == 'cbn_jgif':
        return process_cbn_jgif_file(network_file)
    if network_type == 'graph_jsongz_url':
        url = network_file or large_corpus_url
        logger.info('Loading %s' % url)
        res = requests.get(url)
        res.raise_for_status()
        # gzip-wrapped payload; MAX_WBITS|32 auto-detects the header
        decompressed = zlib.decompress(res.content, zlib.MAX_WBITS | 32)
        graph = pybel.from_nodelink_jsons(decompressed.decode('utf-8'))
        return process_pybel_graph(graph)
    if network_type == 'graph_pickle':
        return process_pybel_graph(pybel.from_pickle(network_file))
    raise ValueError('Unknown network type: %s' % network_type)
def get_bel_types(path: str):
    """Get BEL node and edge type statistics.

    :param path: path to pickle
    :return: count of all nodes and edges in a BEL graph
    :rtype: dict
    """
    bel_graph = from_pickle(path)

    bel_stats = {
        'nodes': bel_graph.number_of_nodes(),
        'edges': bel_graph.number_of_edges(),
    }

    # Fold in counts of all BEL function and edge types
    bel_stats.update(count_functions(bel_graph))
    bel_stats.update(count_relations(bel_graph))

    return bel_stats
def main(connection):
    """Parse a network, load it to the database, then test how fast it drops."""
    manager = pybel.Manager(connection)

    if os.path.exists(PICKLE):
        print(f'opening from {PICKLE}')
        graph = pybel.from_pickle(PICKLE)
    else:
        with time_me(f'opening from {SMALL_CORPUS_URL}'):
            graph = pybel.from_url(SMALL_CORPUS_URL, manager=manager, use_tqdm=True, citation_clearing=False)
        pybel.to_pickle(graph, PICKLE)

    trials = 1
    # FIXME this fails if you do it with the same manager
    durations = [get_numbers(graph, manager) for _ in range(trials)]
    print(durations)
    print(sum(durations) / trials)
# FIX: sys and pickle were used below (sys.argv/sys.exit, pickle.dump) but never
# imported, so every invocation raised NameError.
import pickle
import sys

import pybel
from pybel.struct.filters import has_protein_modification

from indra.sources import bel
from indra.sources.bel.processor import get_agent

from .util import get_mod_sites

if __name__ == '__main__':
    # Parse the BEL script, takes a few minutes
    if sys.argv[1] == 'parse_belscript':
        input_file = sys.argv[2]
        output_file = sys.argv[3]
        pbg = pybel.from_path(input_file)
        pybel.to_pickle(pbg, output_file)
    # Get all variant sites from the graph
    #elif sys.argv[1] == 'get_pybel_mod_agents':
    #    pbg = pybel.from_pickle('output/large_corpus_pybel.pkl')
    #    mod_nodes = [get_agent(n) for n in pbg.nodes()
    #                 if has_protein_modification(n)]
    #    with open('output/bel_mod_agents.pkl', 'wb') as f:
    #        pickle.dump(mod_nodes, f)
    elif sys.argv[1] == 'get_pybel_stmts_by_site':
        input_file = sys.argv[2]
        output_file = sys.argv[3]
        pbg = pybel.from_pickle(input_file)
        pbp = bel.process_pybel_graph(pbg)
        sites = get_mod_sites(pbp.statements)
        with open(output_file, 'wb') as f:
            pickle.dump(sites, f)
    else:
        sys.exit(1)
def test_thorough_pickle(self):
    """Round-trip the thorough graph through an in-memory pickle."""
    buffer = BytesIO()
    to_pickle(self.thorough_graph, buffer)
    buffer.seek(0)
    reconstituted = from_pickle(buffer)
    self.bel_thorough_reconstituted(reconstituted)
def test_example_pickle(self):
    """Round-trip the sialic acid example graph through an in-memory pickle."""
    buffer = BytesIO()
    to_pickle(sialic_acid_graph, buffer)
    buffer.seek(0)
    reconstituted = from_pickle(buffer)
    self.help_test_equal(reconstituted)
def get_wp_graph(file):
    """Load a WikiPathways BEL pickle and normalize its graph names."""
    graph = from_pickle(file)
    normalize_graph_names(graph, WIKIPATHWAYS)
    return graph
def get_kegg_graph(file):
    """Load a KEGG BEL pickle and normalize its graph names."""
    graph = from_pickle(file)
    normalize_graph_names(graph, KEGG)
    return graph