def test_example_pickle(self):
    """Check that ``sialic_acid_graph`` survives a round-trip through a pickle."""
    buffer = BytesIO()
    to_pickle(sialic_acid_graph, buffer)

    # Rewind so the reader starts at the first byte that was written.
    buffer.seek(0)

    reloaded = from_pickle(buffer)
    self._help_test_equal(reloaded)
def upload_jgf_directory(directory: str, manager: Manager):
    """Upload the CBN JGF files found in a directory to the edge store.

    Every path yielded by ``iter_jgf`` is loaded from its corresponding
    gpickle when that file exists. Otherwise the JSON is parsed, converted
    with ``pybel.from_cbn_jgif``, stripped of annotations and written to the
    gpickle path before being inserted.

    :param directory: the folder to scan for JGF files
    :param manager: the manager passed to ``insert_graph``
    """
    if not os.path.exists(directory) or not os.path.isdir(directory):
        logger.warning('directory does not exist: %s', directory)
        return

    started = time.time()

    for jgf_path in iter_jgf(directory):
        cache_path = get_jgf_corresponding_gpickle_path(jgf_path)

        if not os.path.exists(cache_path):
            with open(jgf_path) as jgf_file:
                jgif = json.load(jgf_file)

            graph = pybel.from_cbn_jgif(jgif)
            strip_annotations(graph)
            to_pickle(graph, cache_path)
        else:
            graph = from_pickle(cache_path)
            strip_annotations(graph)

        try:
            insert_graph(manager, graph, public=True, use_tqdm=True)
        except OperationalError:
            # Undo the failed transaction before moving on to the next file.
            manager.session.rollback()
            logger.info('could not insert %s', graph)

    logger.info('done in %.2f seconds', time.time() - started)
def reactome_to_bel(resource_file: str, hgnc_manager, chebi_manager, export_folder=REACTOME_BEL):
    """Convert every pathway in the Reactome RDF file to a pickled BEL graph.

    Pathways whose pickle already exists in ``export_folder`` are skipped.

    :param resource_file: path to the Reactome RDF file, parsed as XML
    :param hgnc_manager: forwarded to ``reactome_pathway_to_bel``
    :param chebi_manager: forwarded to ``reactome_pathway_to_bel``
    :param export_folder: folder that receives one ``.pickle`` file per pathway
    """
    logger.info('Parsing Reactome RDF file')
    rdf_graph = parse_rdf(resource_file, fmt='xml')

    query_result = rdf_graph.query(GET_ALL_PATHWAYS, initNs=PREFIXES)
    progress = tqdm(query_result, desc=f'Exporting Reactome BEL to {export_folder}')

    for pathway_uri, _ in progress:
        # The identifier is the last URL segment; everything after the first
        # dot (probably the pathway version) is dropped.
        last_segment = pathway_uri.split('/')[-1]
        file_name = last_segment.split('.')[0]
        target = os.path.join(export_folder, f'{file_name}.pickle')

        # Only pathways that have not been exported yet are converted.
        if not os.path.exists(target):
            converted = reactome_pathway_to_bel(pathway_uri, rdf_graph, hgnc_manager, chebi_manager)
            to_pickle(converted, target)
def get_drugbank_graph(rebuild: bool = False, **kwargs) -> pybel.BELGraph:
    """Return the DrugBank BEL graph, reusing the cached pickle when allowed.

    :param rebuild: if true, ignore an existing pickle and regenerate the graph
    :param kwargs: forwarded to the DrugBank manager's ``to_bel``
    """
    use_cache = not rebuild and os.path.exists(DEFAULT_DRUGBANK_PICKLE)
    if use_cache:
        return pybel.from_pickle(DEFAULT_DRUGBANK_PICKLE)

    # Imported lazily: only needed when the graph has to be regenerated.
    import bio2bel_drugbank

    manager = bio2bel_drugbank.Manager()
    if not manager.is_populated():
        manager.populate()

    graph = manager.to_bel(**kwargs)

    # The cache is only written when the resources folder is present.
    if os.path.exists(RESOURCES):
        pybel.to_pickle(graph, DEFAULT_DRUGBANK_PICKLE)

    return graph
def get_sider_graph(rebuild: bool = False) -> pybel.BELGraph:
    """Return the SIDER BEL graph, reusing the cached pickle when allowed.

    :param rebuild: if true, ignore an existing pickle and regenerate the graph
    """
    use_cache = not rebuild and os.path.exists(DEFAULT_SIDER_PICKLE)
    if use_cache:
        return pybel.from_pickle(DEFAULT_SIDER_PICKLE)

    # Imported lazily: only needed when the graph has to be regenerated.
    import bio2bel_sider

    manager = bio2bel_sider.Manager()
    if not manager.is_populated():
        manager.populate()

    graph = manager.to_bel()

    # The cache is only written when the resources folder is present.
    if os.path.exists(RESOURCES):
        pybel.to_pickle(graph, DEFAULT_SIDER_PICKLE)

    return graph
def upload_neurommsig_graphs(manager: Manager):
    """Upload only the NeuroMMSig sample networks.

    The Alzheimer's graph is read from its gpickle when present, otherwise
    compiled from the BEL script and cached. Subgraphs whose name is listed in
    ``neurommsig_sample_networks`` are written to ``neurommsig_directory`` as
    gpickles, inserted through ``insert_graph`` and recorded in a manifest.

    :param manager: the manager used for compilation and insertion
    :raises RuntimeError: if neither the gpickle nor the BEL script exists
    """
    if not os.path.exists(alzheimer_directory) or not os.path.isdir(alzheimer_directory):
        logger.warning('directory does not exist: %s', alzheimer_directory)
        return

    if not os.path.exists(neurommsig_directory):
        logger.info('created neurommsig directory: %s', neurommsig_directory)
        os.makedirs(neurommsig_directory)

    bel_path = os.path.join(alzheimer_directory, 'alzheimers.bel')
    cache_path = os.path.join(alzheimer_directory, 'alzheimers.gpickle')

    if os.path.exists(cache_path):
        graph = from_pickle(cache_path)
    elif os.path.exists(bel_path):
        graph = from_bel_script(bel_path, manager=manager)
        to_pickle(graph, cache_path)
    else:
        raise RuntimeError('missing NeuroMMSig source file: {}'.format(bel_path))

    # Keep only the subgraphs that belong to the sample set.
    by_annotation = get_subgraphs_by_annotation(graph, annotation='Subgraph')
    selected = {}
    for name, subgraph in by_annotation.items():
        if name in neurommsig_sample_networks:
            selected[name] = subgraph

    networks = []
    for subgraph_name, subgraph in selected.items():
        # Each subgraph gets its own name and inherits version and license
        # from the parent graph.
        subgraph.name = 'NeuroMMSig AD {}'.format(subgraph_name)
        subgraph.authors = 'Daniel Domingo-Fernandez et. al'
        subgraph.version = graph.version
        subgraph.license = graph.license

        output_path = os.path.join(neurommsig_directory, '{}.gpickle'.format(subgraph_name))
        to_pickle(subgraph, output_path)

        networks.append(insert_graph(manager, subgraph, public=True, use_tqdm=True))

    write_manifest(neurommsig_directory, networks)
def get_combined_graph_similarity(
    *,
    fullgraph_path=DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE,
    chemsim_graph_path=DEFAULT_CHEMSIM_PICKLE,
    mapping_file=DEFAULT_MAPPING_PATH,
    new_graph_path=DEFAULT_GRAPH_PATH,
    pickle_graph_path=DEFAULT_FULLGRAPH_PICKLE,
    rebuild: bool = False,
):
    """Combine the chemical similarity graph with the full graph.

    The combined graph is pickled to ``pickle_graph_path``, its nodes are
    relabeled to the ``node_id`` values of the mapping file, and the result is
    written as an edge list to ``new_graph_path``.

    :param fullgraph_path: a BELGraph, or the path of a pickled BELGraph
    :param chemsim_graph_path: a BELGraph, or the path of a pickled BELGraph
    :param mapping_file: TSV file read with the columns ``namespace``,
        ``identifier``, ``name`` and ``node_id``
    :param new_graph_path: where the relabeled edge list is written; also the
        file that is reused as a cache
    :param pickle_graph_path: where the combined (not yet relabeled) graph is pickled
    :param rebuild: if true, ignore an existing edge list and rebuild
    :return: the graph read from the cached edge list, or the relabeled
        combined graph when it was rebuilt
    """
    # The cache check uses the path this call writes to. Checking the default
    # constant would ignore a caller-supplied ``new_graph_path``.
    if not rebuild and os.path.exists(new_graph_path):
        return nx.read_edgelist(new_graph_path)

    # ``isinstance`` also accepts BELGraph subclasses, which the previous
    # exact type comparison would have passed to ``from_pickle`` as a path.
    if isinstance(fullgraph_path, pybel.struct.graph.BELGraph):
        fullgraph_without_chemsim = fullgraph_path
    else:
        fullgraph_without_chemsim = pybel.from_pickle(fullgraph_path)

    if isinstance(chemsim_graph_path, pybel.struct.graph.BELGraph):
        chemsim_graph = chemsim_graph_path
    else:
        chemsim_graph = pybel.from_pickle(chemsim_graph_path)

    mapping_df = pd.read_csv(
        mapping_file,
        sep="\t",
        dtype={
            'identifier': str,
            'node_id': str,
        },
        index_col=False,
    )

    fullgraph_with_chemsim = fullgraph_without_chemsim + chemsim_graph
    pybel.to_pickle(fullgraph_with_chemsim, pickle_graph_path)

    # Build the node -> node_id mapping; the DSL class depends on the namespace.
    relabel_graph = {}
    for _, row in mapping_df.iterrows():
        if row['namespace'] == PUBCHEM_NAMESPACE:
            node = pybel.dsl.Abundance(
                namespace=PUBCHEM_NAMESPACE,
                identifier=row['identifier'],
            )
        elif row['namespace'] == UNIPROT_NAMESPACE:
            node = pybel.dsl.Protein(
                namespace=UNIPROT_NAMESPACE,
                identifier=row['identifier'],
                name=row['name'],
            )
        else:
            node = pybel.dsl.Pathology(
                namespace='umls',
                identifier=row['identifier'],
                name=row['name'],
            )
        relabel_graph[node] = row['node_id']

    nx.relabel_nodes(fullgraph_with_chemsim, relabel_graph, copy=False)
    nx.write_edgelist(fullgraph_with_chemsim, new_graph_path, data=False)

    return fullgraph_with_chemsim
def subgraphs_to_pickles(network, annotation, directory=None):
    """Write one gpickle per value of the given annotation.

    For every value returned by :func:`get_annotation_values`, the matching
    subgraph is built with :func:`get_subgraph_by_annotation_value`, given the
    document metadata of the source network and saved with
    :func:`pybel.to_pickle`.

    :param pybel.BELGraph network: A BEL network
    :param str annotation: An annotation to split by. Suggestion: ``Subgraph``
    :param Optional[str] directory: A directory to output the pickles. Falls
        back to the current working directory when not given.
    """
    if not directory:
        directory = os.getcwd()

    for value in get_annotation_values(network, annotation):
        subgraph = get_subgraph_by_annotation_value(network, annotation, value)
        subgraph.document.update(network.document)

        # Spaces in the annotation value are replaced for the file name.
        safe_value = value.replace(' ', '_')
        output_path = os.path.join(directory, '{}_{}.gpickle'.format(annotation, safe_value))
        to_pickle(subgraph, output_path)
def merge_directory(manager, directory, name, debug):
    """Parse all BEL files in a directory and output the merged graph as a pickle.

    :param manager: passed to ``from_directory`` as ``connection``
    :param directory: the directory containing the BEL files
    :param name: output file name; defaults to ``<directory>-merged.gpickle``
    :param debug: passed to ``set_debug_param``
    """
    set_debug_param(debug)

    name = name or '{}-merged.gpickle'.format(directory)
    path = os.path.join(directory, name)

    if os.path.exists(path):
        click.echo('Path already exists. Quitting. [{}]'.format(path))
        # Actually quit: without this return the existing file was re-parsed
        # and overwritten despite the message above.
        return

    from . import from_directory
    from pybel import to_pickle

    enable_cool_mode()

    graph = from_directory(directory, connection=manager)
    to_pickle(graph, file=path)
def get_graph_by_manager(
    module: Union[str, ModuleType, BELManagerMixin, Type[BELManagerMixin]],
    force: bool = False,
    to_bel_kwargs: Optional[Mapping[str, Any]] = None,
) -> BELGraph:
    """Return the BEL graph produced by a Bio2BEL manager, using a pickle cache.

    :param module: a module name (without the ``bio2bel_`` prefix), a module
        exposing ``Manager``, a manager instance, or a manager class
    :param force: if true, ignore cached pickles and regenerate the graph
    :param to_bel_kwargs: keyword arguments forwarded to the manager's ``to_bel``
    :raises TypeError: if ``module`` is a class that is not a
        ``BELManagerMixin`` subclass, or is of an unsupported type
    """
    if isinstance(module, str):
        # A cached pickle for a named module is returned before the module is
        # even imported.
        _pickle_path = os.path.join(RESOURCES, f'{module}.bel.pickle')
        named_cache_hit = os.path.exists(_pickle_path) and not force
        if named_cache_hit:
            logger.info(f'Getting {module} from pickle at {_pickle_path}')
            return from_pickle(_pickle_path)

        imported = importlib.import_module(f'bio2bel_{module}')
        manager = imported.Manager()
    elif isinstance(module, BELManagerMixin):
        manager = module
    elif isinstance(module, ModuleType):
        manager = module.Manager()
    elif isinstance(module, type):
        if issubclass(module, BELManagerMixin):
            manager = module()
        else:
            raise TypeError(f'{module} is not a subclass of BELManagerMixin')
    else:
        raise TypeError(f'{module} has invalid type: {type(module)}')

    pickle_path = os.path.join(RESOURCES, f'{manager.module_name}.bel.pickle')
    cache_hit = os.path.exists(pickle_path) and not force
    if cache_hit:
        logger.info(
            f'Getting {manager.module_name} from pickle at {pickle_path}')
        return from_pickle(pickle_path)

    if not manager.is_populated():
        logger.info(f'Populating manager for {manager.module_name}')
        manager.populate()

    extra_arguments = to_bel_kwargs or {}
    graph = manager.to_bel(**extra_arguments)

    # Log a summary of the new graph before caching it.
    for report in (graph.summary_str(), str(count_namespaces(graph)), str(count_functions(graph))):
        logger.info(report)

    logger.info(f'Writing pickle for {pickle_path}')
    to_pickle(graph, pickle_path)
    return graph
def subgraphs_to_pickles(graph, directory=None, annotation='Subgraph'):
    """Write one gpickle per value of the given annotation.

    For every value returned by :func:`get_annotation_values`, the matching
    subgraph is built with :func:`get_subgraph_by_annotation_value`, given the
    document metadata of the source graph and saved with
    :func:`pybel.to_pickle`.

    :param graph: A BEL Graph
    :type graph: pybel.BELGraph
    :param directory: A directory to output the pickles. The current working
        directory is used when this is ``None``.
    :type directory: str
    :param annotation: An annotation to split by. Suggestion: ``Subgraph``
    :type annotation: str
    """
    if directory is None:
        directory = os.getcwd()

    for value in get_annotation_values(graph, annotation=annotation):
        subgraph = get_subgraph_by_annotation_value(graph, annotation, value)
        subgraph.document.update(graph.document)

        # Spaces in the annotation value are replaced for the file name.
        safe_value = value.replace(' ', '_')
        output_path = os.path.join(directory, '{}_{}.gpickle'.format(annotation, safe_value))
        to_pickle(subgraph, output_path)
def save_model(self, path, output_format=None):
    """Write ``self.model`` to ``path`` with one of the :py:mod:`pybel` exporters.

    Parameters
    ----------
    path : str
        The path to output to
    output_format : Optional[str]
        ``pickle``, ``json`` or ``cx``; any other value (including ``None``)
        produces a BEL script
    """
    # The pickle exporter takes the path itself, so no text handle is opened.
    if output_format == 'pickle':
        pybel.to_pickle(self.model, path)
        return

    with open(path, 'w') as handle:
        if output_format == 'cx':
            pybel.to_cx_file(self.model, handle)
        elif output_format == 'json':
            pybel.to_nodelink_file(self.model, handle)
        else:
            # Fallback, i.e. output_format == 'bel'
            pybel.to_bel_script(self.model, handle)
def save_model(self, path, output_format=None):
    """Write ``self.model`` to ``path`` with one of the :py:mod:`pybel` exporters.

    Parameters
    ----------
    path : str
        The path to output to
    output_format : Optional[str]
        ``pickle``, ``json`` or ``cx``; any other value (including ``None``)
        produces BEL
    """
    # The pickle exporter takes the path itself, so no text handle is opened.
    if output_format == 'pickle':
        pybel.to_pickle(self.model, path)
        return

    with open(path, 'w') as handle:
        if output_format == 'cx':
            pybel.to_cx_file(self.model, handle)
        elif output_format == 'json':
            pybel.to_json_file(self.model, handle)
        else:
            # Fallback, i.e. output_format == 'bel'
            pybel.to_bel(self.model, handle)
def run_one(directory):
    """Run the machine in one directory and export its statements.

    Directories without a ``config.yaml`` are skipped. When statements are
    available, a CX file is written if the NDEx credentials carry a name, and
    a BEL gpickle is always written.

    :param str directory: the directory to process
    :return: ``True`` after the outputs were written, ``None`` when the
        directory was skipped or the model has no statements
    """
    if not os.path.isdir(directory):
        return None

    config_path = os.path.join(directory, 'config.yaml')
    if not os.path.exists(config_path):
        # Skip non-INDRA machine directories
        return None

    log.info('running in %s', directory)

    # config=None makes the helper look in the subdirectory itself.
    run_with_search_helper(directory, config=None)

    config = get_config(config_path)
    credentials = get_ndex_cred(config)
    name = None if credentials is None else credentials.get('name')

    statements = load_model(directory).get_statements()
    if not statements:
        log.warning('no statements')
        return None

    # Output CX
    if name:
        cx_str = assemble_cx(statements, name)
        cx_path = os.path.join(directory, 'output.cx')
        with open(cx_path, 'w') as cx_file:
            print(cx_str, file=cx_file)

    # Output BEL gpickle
    gpickle_path = os.path.join(directory, 'output.gpickle')
    pybel.to_pickle(pybel.from_indra_statements(statements), gpickle_path)

    return True
def kegg_to_pickles(resource_files, resource_folder, hgnc_manager, chebi_manager, flatten=None, export_folder=None):
    """Export KEGG KGML files to pickled BEL graphs.

    Files that do not end in ``.xml`` and files whose pickle already exists
    are skipped.

    :param iter[str] resource_files: iterator with file names
    :param str resource_folder: path folder
    :param hgnc_manager: forwarded to ``kegg_to_bel``
    :param chebi_manager: forwarded to ``kegg_to_bel``
    :param flatten: truthy to request flattening from ``kegg_to_bel``; also
        selects the file name suffix
    :param Optional[str] export_folder: export folder; defaults to ``resource_folder``
    """
    if export_folder is None:
        export_folder = resource_folder

    # Both values are the same for every file, so they are computed once.
    should_flatten = bool(flatten)
    _flatten = 'flatten' if flatten else 'unflatten'

    progress = tqdm.tqdm(resource_files, desc=f'Exporting KEGG to BEL in {export_folder}')
    for kgml_file in progress:
        _name = kgml_file[:-len('.xml')]

        # Name of file created will be: "hsaXXX_unflatten.pickle" or "hsaXXX_flatten.pickle"
        pickle_path = os.path.join(
            export_folder or KEGG_BEL,
            f'{_name}_{_flatten}.pickle',
        )

        is_kgml = kgml_file.endswith('.xml')
        if not is_kgml or os.path.exists(pickle_path):
            continue

        source_path = os.path.join(resource_folder, kgml_file)
        converted = kegg_to_bel(
            path=source_path,
            hgnc_manager=hgnc_manager,
            chebi_manager=chebi_manager,
            flatten=should_flatten,
        )
        to_pickle(converted, pickle_path)
def write_bel(connection, skip, directory, force):
    """Export every populated BEL-capable manager to ``directory``.

    For each manager, a pickle, a gzipped node-link JSON file and a gzipped
    BEL script are written. Managers whose pickle already exists are skipped
    unless ``force`` is set.
    """
    os.makedirs(directory, exist_ok=True)

    from .manager.bel_manager import BELManagerMixin
    import pybel

    for _, name, manager in _iterate_managers(connection, skip):
        if not isinstance(manager, BELManagerMixin):
            continue

        click.secho(name, fg='cyan', bold=True)

        pickle_path = os.path.join(directory, f'{name}.bel.pickle')
        if os.path.exists(pickle_path) and not force:
            click.echo('👍 already exported')
            continue

        if manager.is_populated():
            graph = manager.to_bel()
            pybel.to_pickle(graph, pickle_path)

            nodelink_path = os.path.join(directory, f'{name}.bel.nodelink.json.gz')
            pybel.to_nodelink_gz(graph, nodelink_path)

            script_path = os.path.join(directory, f'{name}.bel.gz')
            pybel.to_bel_script_gz(graph, script_path)
        else:
            click.echo('👎 unpopulated')
def convert_recursive(directory, connection=None, upload=False, pickle=False, store_parts=False,
                      enrich_citations=False):
    """Recursively parse the BEL files under a directory and upload and/or pickle them.

    Files that fail to parse are logged and skipped; the remaining files are
    still processed.

    :param directory: root directory searched with ``get_paths_recursive``
    :param connection: passed to ``build_metadata_parser``
    :param bool upload: if true, upload each graph with ``safe_upload``
    :param bool pickle: if true, write each graph next to its source file with
        the last four characters of the path replaced by ``.gpickle``
    :param bool store_parts: forwarded to ``safe_upload``
    :param bool enrich_citations: if true, run ``fix_pubmed_citations`` on each graph
    """
    metadata_parser = build_metadata_parser(connection)
    paths = list(get_paths_recursive(directory))
    log.info('Paths to parse: %s', paths)

    for path in paths:
        try:
            graph = from_path(path, manager=metadata_parser.manager)
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop a long batch run.
            log.exception('Problem parsing %s', path)
            continue

        if enrich_citations:
            fix_pubmed_citations(graph)

        if upload:
            safe_upload(metadata_parser.manager, graph, store_parts=store_parts)

        if pickle:
            # [:-4] gets rid of .bel at the end of the file name
            new_path = '{}.gpickle'.format(path[:-4])
            to_pickle(graph, new_path)
def wikipathways_to_pickles(
    resource_files: Iterable[str],
    resource_folder: str,
    hgnc_manager: bio2bel_hgnc.Manager,
    export_folder: str,
) -> None:
    """Export WikiPathways RDF files to pickled BEL graphs.

    Blacklisted files and files whose pickle already exists are skipped.

    :param resource_files: iterator with file names
    :param resource_folder: path folder
    :param hgnc_manager: HGNC manager
    :param export_folder: export folder
    """
    progress = tqdm.tqdm(resource_files, desc=f'Exporting WikiPathways to BEL in {export_folder}')

    for rdf_file in progress:
        # Drop a trailing ``.ttl`` extension; other names are used unchanged.
        pickle_name = rdf_file[:-len('.ttl')] if rdf_file.endswith('.ttl') else rdf_file
        pickle_path = os.path.join(export_folder, f'{pickle_name}.pickle')

        # Skip if BEL file already exists
        # TODO: Remove pathway from blacklist
        if os.path.exists(pickle_path) or rdf_file in WIKIPATHWAYS_BLACKLIST:
            continue

        # Parse the pathway file and log its statistics.
        pathway_path = os.path.join(resource_folder, rdf_file)
        converted = wikipathways_to_bel(pathway_path, hgnc_manager)
        debug_pathway_info(converted, pathway_path)

        to_pickle(converted, pickle_path)
def main(connection):
    """Load a network (from the pickle cache or the URL) and print what ``get_numbers`` returns.

    The per-run values and their average over the runs are printed.
    """
    manager = pybel.Manager(connection)

    if not os.path.exists(PICKLE):
        with time_me(f'opening from {SMALL_CORPUS_URL}'):
            graph = pybel.from_url(
                SMALL_CORPUS_URL,
                manager=manager,
                use_tqdm=True,
                citation_clearing=False,
            )

        pybel.to_pickle(graph, PICKLE)
    else:
        print(f'opening from {PICKLE}')
        graph = pybel.from_pickle(PICKLE)

    n = 1
    # FIXME this fails if you do it with the same manager
    times = []
    for _ in range(n):
        times.append(get_numbers(graph, manager))

    print(times)
    print(sum(times) / n)
import sys import pickle import pybel from pybel.struct.filters import has_protein_modification from indra.sources import bel from indra.sources.bel.processor import get_agent from .util import get_mod_sites if __name__ == '__main__': # Parse the BEL script, takes a few minutes if sys.argv[1] == 'parse_belscript': input_file = sys.argv[2] output_file = sys.argv[3] pbg = pybel.from_path(input_file) pybel.to_pickle(pbg, output_file) # Get all variant sites from the graph #elif sys.argv[1] == 'get_pybel_mod_agents': # pbg = pybel.from_pickle('output/large_corpus_pybel.pkl') # mod_nodes = [get_agent(n) for n in pbg.nodes() # if has_protein_modification(n)] # with open('output/bel_mod_agents.pkl', 'wb') as f: # pickle.dump(mod_nodes, f) elif sys.argv[1] == 'get_pybel_stmts_by_site': input_file = sys.argv[2] output_file = sys.argv[3] pbg = pybel.from_pickle(input_file) pbp = bel.process_pybel_graph(pbg) sites = get_mod_sites(pbp.statements) with open(output_file, 'wb') as f: pickle.dump(sites, f) else:
def test_thorough_pickle(self):
    """Round-trip ``self.thorough_graph`` through an in-memory pickle and re-check it."""
    buffer = BytesIO()
    to_pickle(self.thorough_graph, buffer)

    # Rewind so the reader starts at the first byte that was written.
    buffer.seek(0)

    reloaded = from_pickle(buffer)
    self.bel_thorough_reconstituted(reloaded)
def test_example_pickle(self):
    """Round-trip ``sialic_acid_graph`` through an in-memory pickle and compare it."""
    buffer = BytesIO()
    to_pickle(sialic_acid_graph, buffer)

    # Rewind so the reader starts at the first byte that was written.
    buffer.seek(0)

    reloaded = from_pickle(buffer)
    self.help_test_equal(reloaded)
def get_graph(
    force: bool = False,
    force_global: bool = False,
    names: Optional[NamesList] = None,
    resources_directory: Optional[str] = None,
) -> BELGraph:
    """Build (or load from cache) the union of all Bio2BEL resource graphs.

    After merging, nodes in the ``ncbigene``/``egid`` namespaces with a known
    Entrez identifier are relabeled to HGNC nodes, the central dogma is
    enriched and the result is pickled.

    :param force: Should cached files be overwritten?
    :param force_global: Should the global cache file be overwritten?
    :param names: The name of the bio2bel packages to use and arguments
    :param resources_directory: A non-default place to store the resources
    """
    cache_folder = resources_directory or RESOURCES
    pickle_path = os.path.join(cache_folder, CACHE_NAME)

    if not force_global and os.path.exists(pickle_path):
        logger.info('Getting cached full graph')
        return from_pickle(pickle_path)

    if names is None:
        names = DEFAULT_NAMES

    logger.info('Generating graphs')
    graphs = []
    for name, to_bel_kwargs in names:
        part = get_graph_by_manager(name, force=force, to_bel_kwargs=to_bel_kwargs)
        logger.info(part.summary_str())
        graphs.append(part)

    logger.info('Merging graphs')
    graph = pybel.union(graphs)
    graph.name = f'Graph from: {", ".join(graph.name for graph in graphs)}'
    graph.version = '0.0.1'
    logger.info('Finished merging graphs')

    logger.info('Preparing HGNC mappings')
    hgnc_manager = bio2bel_hgnc.Manager()
    hgnc_symbol_to_id = hgnc_manager.build_hgnc_symbol_id_mapping()
    entrez_id_to_hgnc_symbol = hgnc_manager.build_entrez_id_to_hgnc_symbol_mapping()

    logger.info('Generating namespace mapping for nodes')
    entrez_namespaces = {'ncbigene', 'egid'}
    mapping = {}
    for node in graph:
        namespace = node.get('namespace')
        if namespace is None or namespace.lower() not in entrez_namespaces:
            continue
        if node.identifier not in entrez_id_to_hgnc_symbol:
            continue

        # Replace the Entrez node with a node of the same class in HGNC.
        symbol = entrez_id_to_hgnc_symbol[node.identifier]
        mapping[node] = node.__class__(
            namespace='hgnc',
            name=symbol,
            identifier=hgnc_symbol_to_id[symbol],
        )

    logger.info('Relabeling nodes')
    nx.relabel_nodes(graph, mapping, copy=False)

    logger.info('Enriching central dogma')
    enrich_protein_and_rna_origins(graph)

    logger.info('Exporting snp2k pickle')
    to_pickle(graph, pickle_path)

    return graph
def get_similarity_graph(
    *,
    fullgraph=DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE,
    rebuild: bool = False,
    mapping_file=DEFAULT_CHEMICALS_MAPPING_PATH,
    chemsim_graph_path=DEFAULT_CHEMSIM_PICKLE,
    clustered: bool = True,
    similarity=0.7,
    name='Chemical Similarity Graph',
    version='1.1.0',
    authors='',
    contact='',
    description='',
):
    """Create a BELGraph with chemicals as nodes, and similarity as edges.

    :param fullgraph: a BELGraph, or the path of a pickled BELGraph, whose
        ``pubchem.compound`` nodes are the chemicals to compare
    :param rebuild: if true, ignore an existing pickle at ``chemsim_graph_path``
    :param mapping_file: TSV file with the columns ``PubchemID`` and ``Smiles``;
        when it exists, missing chemicals are looked up and appended to it
    :param chemsim_graph_path: where the resulting graph is pickled; also the
        file that is reused as a cache
    :param clustered: if true, connect chemicals that share a cluster from
        ``cluster_chemicals``; otherwise connect pairs whose similarity is at
        least ``similarity``
    :param similarity: the threshold applied to the pairwise similarities
        (only used when ``clustered`` is false)
    :param name: passed to the BELGraph constructor
    :param version: passed to the BELGraph constructor
    :param authors: passed to the BELGraph constructor
    :param contact: passed to the BELGraph constructor
    :param description: passed to the BELGraph constructor
    :return: the chemical similarity BELGraph
    """
    # The cache is the pickle this function writes below, so it has to be
    # read back with the pickle reader (not as an edge list) and looked up
    # at the path this call uses.
    if not rebuild and os.path.exists(chemsim_graph_path):
        return pybel.from_pickle(chemsim_graph_path)

    # ``isinstance`` also accepts BELGraph subclasses.
    if isinstance(fullgraph, pybel.struct.graph.BELGraph):
        fullgraph_without_chemsim = fullgraph
    else:
        fullgraph_without_chemsim = pybel.from_pickle(fullgraph)

    pubchem_ids = [
        node.identifier
        for node in fullgraph_without_chemsim.nodes()
        if node.namespace == 'pubchem.compound'
    ]

    if os.path.exists(mapping_file):
        chemicals_mapping = pd.read_csv(
            mapping_file,
            sep="\t",
            dtype={
                'PubchemID': str,
                'Smiles': str,
            },
            index_col=False,
        )

        # One pass over the table instead of a full scan per chemical; the
        # first row wins for duplicated identifiers.
        known_smiles = {}
        for known_id, known in zip(chemicals_mapping['PubchemID'], chemicals_mapping['Smiles']):
            known_smiles.setdefault(known_id, known)

        pubchem_id_to_smiles = {}
        new_chemicals = []
        smiles = []
        for pubchem_id in tqdm(pubchem_ids, desc="Getting SMILES"):
            if pubchem_id in known_smiles:
                pubchem_id_to_smiles[pubchem_id] = known_smiles[pubchem_id]
                continue

            chemical_smiles = cid_to_smiles(pubchem_id)
            if not isinstance(chemical_smiles, str):
                chemical_smiles = chemical_smiles.decode("utf-8")
            pubchem_id_to_smiles[pubchem_id] = chemical_smiles
            new_chemicals.append(pubchem_id)
            smiles.append(chemical_smiles)

        # ``DataFrame.append`` no longer exists in pandas 2; ``concat`` is
        # the equivalent call.
        new_df = pd.DataFrame({"PubchemID": new_chemicals, "Smiles": smiles})
        chemicals_mapping = pd.concat([chemicals_mapping, new_df])
        chemicals_mapping.to_csv(mapping_file, sep='\t', index=False)
    else:
        pubchem_id_to_smiles = get_smiles(pubchem_ids)

    pubchem_id_to_fingerprint = get_fingerprints(pubchem_id_to_smiles)

    chemsim_graph = pybel.BELGraph(name, version, description, authors, contact)

    if clustered:
        clustered_df = cluster_chemicals(
            rebuild=True,
            chemicals_dict=pubchem_id_to_fingerprint,
        )
        clusters = clustered_df['Cluster'].unique().tolist()
        for cluster in tqdm(clusters, desc='Creating similarity BELGraph'):
            chemicals = clustered_df.loc[clustered_df.Cluster == cluster]
            if len(chemicals) == 1:
                continue

            for _, row in chemicals.iterrows():
                for _, row1 in chemicals.iterrows():
                    if row['PubchemID'] == row1['PubchemID']:
                        continue

                    chemical_01 = pybel.dsl.Abundance(
                        namespace='pubchem.compound',
                        identifier=row['PubchemID'],
                    )
                    chemical_02 = pybel.dsl.Abundance(
                        namespace='pubchem.compound',
                        identifier=row1['PubchemID'],
                    )

                    # Each pair is linked once, whichever direction is seen first.
                    if (
                        chemsim_graph.has_edge(chemical_01, chemical_02)
                        or chemsim_graph.has_edge(chemical_02, chemical_01)
                    ):
                        continue

                    chemsim_graph.add_unqualified_edge(chemical_01, chemical_02, 'association')
    else:
        similarities = get_similarity(pubchem_id_to_fingerprint)
        for (source_pubchem_id, target_pubchem_id), sim in tqdm(
            similarities.items(),
            desc='Creating similarity BELGraph',
        ):
            if sim < similarity:
                continue
            chemsim_graph.add_unqualified_edge(
                pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE, identifier=source_pubchem_id),
                pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE, identifier=target_pubchem_id),
                'association',
            )

    pybel.to_pickle(chemsim_graph, chemsim_graph_path)
    return chemsim_graph
def get_combined_sider_drugbank(
    *,
    rebuild: bool = False,
    drugbank_graph_path=None,
    sider_graph_path=None,
    chemical_mapping=DEFAULT_CHEMICALS_MAPPING_PATH,
):
    """Combine the SIDER and DrugBank graphs.

    ``pubchem.compound`` nodes of both graphs are temporarily relabeled to
    their SMILES before the graphs are added together, and labeled back
    afterwards.

    :param rebuild: if true, ignore the cached combined pickle
    :param drugbank_graph_path: a BELGraph, the path of a pickled DrugBank
        graph, or ``None`` to use ``get_drugbank_graph``
    :param sider_graph_path: a BELGraph, the path of a pickled SIDER graph, or
        ``None`` to use ``get_sider_graph``
    :param chemical_mapping: TSV file with the columns ``PubchemID`` and
        ``Smiles``; pass ``None`` to resolve every chemical with ``cid_to_smiles``
    :return: BELGraph
    """
    if not rebuild and os.path.exists(DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE):
        return pybel.from_pickle(DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE)

    # ``isinstance`` also accepts BELGraph subclasses.
    if isinstance(sider_graph_path, pybel.struct.graph.BELGraph):
        sider_graph = sider_graph_path
    elif sider_graph_path is not None and os.path.exists(sider_graph_path):
        sider_graph = pybel.from_pickle(sider_graph_path)
    else:
        sider_graph = get_sider_graph()

    if isinstance(drugbank_graph_path, pybel.struct.graph.BELGraph):
        drugbank_graph = drugbank_graph_path
    elif drugbank_graph_path is not None and os.path.exists(drugbank_graph_path):
        drugbank_graph = pybel.from_pickle(drugbank_graph_path)
    else:
        drugbank_graph = get_drugbank_graph()

    # Known SMILES keyed by PubChem identifier. This stays empty without a
    # mapping file, in which case every chemical is looked up remotely
    # (previously this case failed on an undefined ``mapping_df``).
    known_smiles = {}
    if chemical_mapping is not None:
        mapping_df = pd.read_csv(
            chemical_mapping,
            sep="\t",
            dtype={
                'PubchemID': str,
                'Smiles': str,
            },
            index_col=False,
        )
        # The first row wins for duplicated identifiers.
        for known_id, known in zip(mapping_df['PubchemID'], mapping_df['Smiles']):
            known_smiles.setdefault(known_id, known)

    smiles_dict = {}
    for source_graph in (sider_graph, drugbank_graph):
        for node in tqdm(source_graph.nodes()):
            if node.namespace != 'pubchem.compound':
                continue
            if node in smiles_dict:
                continue
            smiles_dict[node] = _resolve_smiles(node.identifier, known_smiles)

    sider_relabeled = nx.relabel_nodes(sider_graph, smiles_dict)
    drugbank_relabeled = nx.relabel_nodes(drugbank_graph, smiles_dict)
    full_graph = sider_relabeled + drugbank_relabeled

    # Map the SMILES labels back to the original nodes.
    smiles_dict_rev = {v: k for k, v in smiles_dict.items()}
    full_graph_relabel = nx.relabel_nodes(full_graph, smiles_dict_rev)

    if os.path.exists(RESOURCES):
        pybel.to_pickle(full_graph_relabel, DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE)

    return full_graph_relabel


def _resolve_smiles(identifier, known_smiles):
    """Return the SMILES of a PubChem identifier, preferring the local mapping."""
    if identifier in known_smiles:
        return known_smiles[identifier]

    smiles = cid_to_smiles(identifier)
    if not isinstance(smiles, str):
        smiles = smiles.decode("utf-8")
    return smiles
def convert_paths(paths, connection=None, upload=False, pickle=False, canonicalize=True, infer_central_dogma=True,
                  enrich_citations=False, send=False, version_in_path=False, **kwargs):
    """Parse each BEL file in a set of paths and upload, pickle and/or send the result.

    :param iter[str] paths: The paths to convert
    :param connection: The connection
    :type connection: None or str or pybel.manager.Manager
    :param bool upload: Should the networks be uploaded to the cache?
    :param bool pickle: Should the networks be saved as pickles?
    :param bool canonicalize: Calculate canonical nodes?
    :param bool infer_central_dogma: Should the central dogma be inferred for all proteins, RNAs, and miRNAs
    :param bool enrich_citations: Should the citations be enriched using Entrez Utils?
    :param bool send: Send to PyBEL Web?
    :param bool version_in_path: Add the current pybel version to the pathname
    :param kwargs: Parameters to pass to :func:`pybel.from_path`
    :return: a list of ``(path, exception)`` pairs for the files that could not be parsed
    """
    manager = Manager.ensure(connection)

    failures = []

    for path in paths:
        log.info('parsing: %s', path)

        try:
            graph = from_path(path, manager=manager, **kwargs)
        except Exception as error:
            # Remember the failure and carry on with the remaining files.
            log.exception('problem parsing %s', path)
            failures.append((path, error))
            continue

        if canonicalize:
            add_canonical_names(graph)

        if infer_central_dogma:
            infer_central_dogma_mutator(graph)

        if enrich_citations:
            enrich_pubmed_citations(graph=graph, manager=manager)

        if upload:
            to_database(graph, connection=manager, store_parts=True)

        if pickle:
            # gets rid of .bel at the end of the file name
            stem = path[:-len('.bel')]

            new_path = (
                '{}-{}.gpickle'.format(stem, get_pybel_version())
                if version_in_path
                else '{}.gpickle'.format(stem)
            )

            to_pickle(graph, new_path)
            log.info('output pickle: %s', new_path)

        if send:
            response = to_web(graph)
            log.info('sent to PyBEL Web with response: %s', response.json())

    return failures
def get_graph(
    self,
    directory: Optional[str] = None,
    use_cached: bool = True,
    use_tqdm: bool = True,
) -> BELGraph:
    """Return the union of the graphs from all sources and export it in several formats.

    The pickle in ``directory`` is reused when ``use_cached`` is set. Otherwise
    the union is built and written as pickle, node-link JSON, SIF, GMT and
    GraphML. INDRA, CX and HTML outputs are written as well, each one being
    skipped when an ``ImportError`` is raised for it.

    :param directory: output folder; defaults to ``self.directory``
    :param use_cached: reuse an existing pickle instead of rebuilding
    :param use_tqdm: forwarded to ``self.get_graphs``
    :raises ValueError: if neither ``directory`` nor ``self.directory`` is set
    """
    if directory is None:
        directory = self.directory
        if directory is None:
            raise ValueError

    pickle_path = os.path.join(directory, f'{self.name}.bel.pickle')
    if use_cached and os.path.exists(pickle_path):
        return pybel.from_pickle(pickle_path)

    rv = union(self.get_graphs(use_tqdm=use_tqdm))
    self.metadata.update(rv)

    pybel.to_pickle(rv, pickle_path)
    pybel.to_json_path(rv, os.path.join(directory, f'{self.name}.bel.nodelink.json'))
    pybel.to_sif_path(rv, os.path.join(directory, f'{self.name}.bel.sif'))
    pybel.to_gsea_path(rv, os.path.join(directory, f'{self.name}.bel.gmt'))
    pybel.to_graphml(rv, os.path.join(directory, f'{self.name}.bel.graphml'))

    # The remaining exports are optional: an ImportError skips the format.
    try:
        statements = pybel.to_indra_statements(rv)
    except ImportError:
        pass
    else:
        with open(os.path.join(directory, f'{self.name}.indra.pickle'), 'wb') as indra_file:
            pickle.dump(statements, indra_file)

    try:
        from pybel_cx import to_cx_file
    except ImportError:
        pass
    else:
        with open(os.path.join(directory, f'{self.name}.bel.cx.json'), 'w') as cx_file:
            to_cx_file(rv, cx_file)

    try:
        from pybel_tools.assembler.html import to_html
    except ImportError:
        pass
    else:
        with open(os.path.join(directory, 'index.html'), 'w') as html_file:
            print(to_html(rv), file=html_file)

    return rv
def generate_universe(kegg_path=KEGG_FILES, reactome_path=REACTOME_FILES, wikipathways_path=WIKIPATHWAYS_FILES,
                      output=UNIVERSE_DIR, no_flatten=False, no_normalize_names=False, specie='Homo_sapiens'):
    """Export the harmonized PathMe universe for one species as a pickle.

    The KEGG, Reactome and WikiPathways resources for the species are located
    (KGML files are downloaded when the species folder is missing), merged
    with ``get_universe_graph``, cleaned up, collapsed and written to
    ``output``.
    """
    # The CLI flags are negated, so flip them once here.
    flatten = not no_flatten
    normalize_names = not no_normalize_names

    # Specie name treatment.
    specie = specie.replace(' ', '_').capitalize()
    alternative = get_common_or_name_specie_id(specie)
    specie_altern_name = alternative.replace(' ', '_').capitalize()

    if not flatten:
        click.secho('Complexes and Reactions will be not be flatten to single nodes')

    if not normalize_names:
        click.secho('Names will not be normalized to lower case')

    # KEGG specie processing
    kegg_species = get_dir_list(kegg_path, True)
    kegg_path = os.path.join(kegg_path, specie)

    kegg_is_missing = specie not in kegg_species and specie_altern_name not in kegg_species
    if kegg_is_missing:
        kegg_ids = get_all_pathways_organism(get_pathway_kegg_url(specie))
        click.secho(
            'You are about to download KGML files from KEGG.\n'
            'Please make sure you have read KEGG license (see: https://www.kegg.jp/kegg/rest/).'
            ' These files cannot be distributed and their use must be exclusively with academic purposes.\n'
            'We (PathMe developers) are not responsible for the end use of this data.\n',
        )
        os.makedirs(kegg_path)
        download_kgml_files(kegg_ids, path=kegg_path)

    # Reactome specie processing
    specie_file = f'{specie}.owl'
    specie_alt_file = f'{specie_altern_name}.owl'
    reactome_files = get_or_create_dir(reactome_path)

    if specie_file in reactome_files:
        reactome_path = os.path.join(reactome_path, specie_file)
    elif specie_alt_file in reactome_files:
        reactome_path = os.path.join(reactome_path, specie_alt_file)
    else:
        click.secho('Specie not found in the populated Reactome resources.')

    # WikiPathways specie processing
    wikipathways_species = get_dir_list(wikipathways_path, True)

    if specie in wikipathways_species:
        wikipathways_path = os.path.join(wikipathways_path, specie)
    elif specie_altern_name in wikipathways_species:
        wikipathways_path = os.path.join(wikipathways_path, specie_altern_name)
    else:
        click.secho('Specie not found in the populated Wikipathways resources.')

    click.secho("Merging graphs to universe and harmonizing...(this might take a while)")

    universe_graph = get_universe_graph(
        kegg_path=kegg_path,
        reactome_path=reactome_path,
        wikipathways_path=wikipathways_path,
        flatten=flatten,
        normalize_names=normalize_names,
    )
    click.secho(f'Number of isolates after getting universe: {nx.number_of_isolates(universe_graph)}')

    # Remove isolated list abundances
    remove_isolated_list_abundances(universe_graph)

    if flatten:
        # TODO: remove the node lists that come only from Reactome
        click.secho(f'Number of isolates after flattening: {nx.number_of_isolates(universe_graph)}')

    click.secho("Merging variants and genes")
    collapse_all_variants(universe_graph)
    collapse_to_genes(universe_graph)
    click.secho(
        f'Number of isolates after collapsing variants and to genes: {nx.number_of_isolates(universe_graph)}'
    )

    universe_graph.name = 'PathMe Universe'

    file_name = os.path.join(output, '_'.join([specie, 'pathme_universe.pickle']))
    click.secho(f"Export BEL graph to: {file_name}")
    click.secho(universe_graph.summary_str())
    click.secho(count_functions(universe_graph))

    to_pickle(universe_graph, file_name)