Example #1
 def test_example_pickle(self):
     """Test the round-trip through a pickle."""
     bio = BytesIO()
     to_pickle(sialic_acid_graph, bio)
     bio.seek(0)
     graph = from_pickle(bio)
     self._help_test_equal(graph)
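For reference, here is a minimal, self-contained round trip through a pickle. This is an illustrative sketch, not part of the original test suite; it assumes a PyBEL version whose to_pickle/from_pickle accept file-like objects (as the test above does), and the graph contents are made up.

from io import BytesIO

from pybel import BELGraph, from_pickle, to_pickle
from pybel.dsl import Protein

# Build a tiny graph and pickle it through an in-memory buffer.
graph = BELGraph(name='toy', version='0.0.1')
graph.add_increases(
    Protein(namespace='hgnc', name='MAPK1'),
    Protein(namespace='hgnc', name='JUN'),
    citation='1234567',
    evidence='toy example',
)

bio = BytesIO()
to_pickle(graph, bio)
bio.seek(0)
reloaded = from_pickle(bio)
assert reloaded.number_of_edges() == graph.number_of_edges()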
Example #2
def upload_jgf_directory(directory: str, manager: Manager):
    """Upload CBN data to edge store."""
    if not (os.path.exists(directory) and os.path.isdir(directory)):
        logger.warning('directory does not exist: %s', directory)
        return

    t = time.time()

    for path in iter_jgf(directory):
        gpickle_path = get_jgf_corresponding_gpickle_path(path)

        if os.path.exists(gpickle_path):
            graph = from_pickle(gpickle_path)
            strip_annotations(graph)
        else:
            with open(path) as f:
                cbn_jgif_dict = json.load(f)

            graph = pybel.from_cbn_jgif(cbn_jgif_dict)
            strip_annotations(graph)
            to_pickle(graph, gpickle_path)

        try:
            insert_graph(manager, graph, public=True, use_tqdm=True)
        except OperationalError:
            manager.session.rollback()
            logger.info('could not insert %s', graph)

    logger.info('done in %.2f seconds', time.time() - t)
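A typical invocation, assuming the Manager here is PyBEL's database manager; the connection string and directory below are hypothetical:

from pybel import Manager

manager = Manager('sqlite:////tmp/pybel.db')  # hypothetical connection string
upload_jgf_directory('/data/cbn-jgif', manager)  # hypothetical directory of CBN JGIF files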
Example #3
def reactome_to_bel(resource_file: str,
                    hgnc_manager,
                    chebi_manager,
                    export_folder=REACTOME_BEL):
    """Create Reactome BEL graphs.

    :param resource_file: rdf reactome file (there is only one)
    :param bio2bel_hgnc.Manager hgnc_manager: uniprot id to hgnc symbol dictionary
    :return:
    """
    logger.info('Parsing Reactome RDF file')
    rdf_graph = parse_rdf(resource_file, fmt='xml')

    pathways_uris_to_names = rdf_graph.query(GET_ALL_PATHWAYS, initNs=PREFIXES)

    for pathway_uri, _pathway_name in tqdm(
            pathways_uris_to_names,
            desc=f'Exporting Reactome BEL to {export_folder}'):

        # Take the pathway identifier from the end of the URL and strip the trailing number
        # (probably the pathway version).
        file_name = pathway_uri.split('/')[-1].split('.')[0]

        pickle_file = os.path.join(export_folder, f'{file_name}.pickle')

        # Skip if BEL file already exists
        if os.path.exists(pickle_file):
            continue

        bel_graph = reactome_pathway_to_bel(pathway_uri, rdf_graph,
                                            hgnc_manager, chebi_manager)

        # Export BELGraph to pickle
        to_pickle(bel_graph, pickle_file)
Example #4
def get_drugbank_graph(rebuild: bool = False, **kwargs) -> pybel.BELGraph:
    """Get the DrugBank graph."""
    if not rebuild and os.path.exists(DEFAULT_DRUGBANK_PICKLE):
        return pybel.from_pickle(DEFAULT_DRUGBANK_PICKLE)

    import bio2bel_drugbank

    drugbank_manager = bio2bel_drugbank.Manager()
    if not drugbank_manager.is_populated():
        drugbank_manager.populate()
    drugbank_graph = drugbank_manager.to_bel(**kwargs)

    if os.path.exists(RESOURCES):
        pybel.to_pickle(drugbank_graph, DEFAULT_DRUGBANK_PICKLE)

    return drugbank_graph
Example #5
def get_sider_graph(rebuild: bool = False) -> pybel.BELGraph:
    """Get the SIDER graph."""
    if not rebuild and os.path.exists(DEFAULT_SIDER_PICKLE):
        return pybel.from_pickle(DEFAULT_SIDER_PICKLE)

    import bio2bel_sider

    sider_manager = bio2bel_sider.Manager()
    if not sider_manager.is_populated():
        sider_manager.populate()
    sider_graph = sider_manager.to_bel()

    if os.path.exists(RESOURCES):
        pybel.to_pickle(sider_graph, DEFAULT_SIDER_PICKLE)

    return sider_graph
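Because both helpers cache their result to a pickle on first use, the two graphs can be combined cheaply afterwards. A sketch, under the assumption that these helpers are meant to be composed this way:

import pybel

sider_graph = get_sider_graph()
drugbank_graph = get_drugbank_graph()
combined = pybel.union([sider_graph, drugbank_graph])
print(combined.summary_str())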
Example #6
def upload_neurommsig_graphs(manager: Manager):
    """Only upload NeuroMMSig Sample Networks."""
    if not (os.path.exists(alzheimer_directory)
            and os.path.isdir(alzheimer_directory)):
        logger.warning('directory does not exist: %s', alzheimer_directory)
        return

    if not os.path.exists(neurommsig_directory):
        logger.info('created neurommsig directory: %s', neurommsig_directory)
        os.makedirs(neurommsig_directory)

    path = os.path.join(alzheimer_directory, 'alzheimers.bel')
    gpickle_path = os.path.join(alzheimer_directory, 'alzheimers.gpickle')

    if os.path.exists(gpickle_path):
        graph = from_pickle(gpickle_path)
    elif os.path.exists(path):
        graph = from_bel_script(path, manager=manager)
        to_pickle(graph, gpickle_path)
    else:
        raise RuntimeError('missing NeuroMMSig source file: {}'.format(path))

    subgraphs = {
        name: subgraph
        for name, subgraph in get_subgraphs_by_annotation(
            graph, annotation='Subgraph').items()
        if name in neurommsig_sample_networks
    }

    networks = []

    for subgraph_name, subgraph in subgraphs.items():
        subgraph.name = 'NeuroMMSig AD {}'.format(subgraph_name)
        subgraph.authors = 'Daniel Domingo-Fernandez et al.'
        subgraph.version = graph.version
        subgraph.license = graph.license

        # output to directory as gpickle
        to_pickle(
            subgraph,
            os.path.join(neurommsig_directory,
                         '{}.gpickle'.format(subgraph_name)))

        network = insert_graph(manager, subgraph, public=True, use_tqdm=True)
        networks.append(network)

    write_manifest(neurommsig_directory, networks)
Example #7
def get_combined_graph_similarity(
        *,
        fullgraph_path=DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE,
        chemsim_graph_path=DEFAULT_CHEMSIM_PICKLE,
        mapping_file=DEFAULT_MAPPING_PATH,
        new_graph_path=DEFAULT_GRAPH_PATH,
        pickle_graph_path=DEFAULT_FULLGRAPH_PICKLE,
        rebuild: bool = False):
    """Combine chemical similarity graph with the fullgraph."""
    if not rebuild and os.path.exists(DEFAULT_GRAPH_PATH):
        return nx.read_edgelist(DEFAULT_GRAPH_PATH)
    if type(fullgraph_path) == pybel.struct.graph.BELGraph:
        fullgraph_without_chemsim = fullgraph_path
    else:
        fullgraph_without_chemsim = pybel.from_pickle(fullgraph_path)
    if type(chemsim_graph_path) == pybel.struct.graph.BELGraph:
        chemsim_graph = chemsim_graph_path
    else:
        chemsim_graph = pybel.from_pickle(chemsim_graph_path)

    mapping_df = pd.read_csv(
        mapping_file,
        sep="\t",
        dtype={
            'identifier': str,
            'node_id': str
        },
        index_col=False,
    )
    fullgraph_with_chemsim = fullgraph_without_chemsim + chemsim_graph
    pybel.to_pickle(fullgraph_with_chemsim, pickle_graph_path)
    relabel_graph = {}
    for ind, row in mapping_df.iterrows():
        if row['namespace'] == PUBCHEM_NAMESPACE:
            relabel_graph[pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE, identifier=row['identifier'])] = \
                row['node_id']
        elif row['namespace'] == UNIPROT_NAMESPACE:
            relabel_graph[pybel.dsl.Protein(namespace=UNIPROT_NAMESPACE,
                                            identifier=row['identifier'],
                                            name=row['name'])] = row['node_id']
        else:
            relabel_graph[pybel.dsl.Pathology(namespace='umls', identifier=row['identifier'], name=row['name'])] = \
                row['node_id']

    nx.relabel_nodes(fullgraph_with_chemsim, relabel_graph, copy=False)
    nx.write_edgelist(fullgraph_with_chemsim, new_graph_path, data=False)
    return fullgraph_with_chemsim
Example #8
def subgraphs_to_pickles(network, annotation, directory=None):
    """Groups the given graph into subgraphs by the given annotation with :func:`get_subgraph_by_annotation` and
    outputs them as gpickle files to the given directory with :func:`pybel.to_pickle`

    :param pybel.BELGraph network: A BEL network
    :param str annotation: An annotation to split by. Suggestion: ``Subgraph``
    :param Optional[str] directory: A directory to output the pickles
    """
    directory = directory or os.getcwd()

    for value in get_annotation_values(network, annotation):
        sg = get_subgraph_by_annotation_value(network, annotation, value)
        sg.document.update(network.document)

        file_name = '{}_{}.gpickle'.format(annotation, value.replace(' ', '_'))
        path = os.path.join(directory, file_name)
        to_pickle(sg, path)
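A usage sketch, assuming a previously pickled network that carries a 'Subgraph' annotation; the file and directory names are hypothetical:

import os

from pybel import from_pickle

os.makedirs('subgraphs', exist_ok=True)
network = from_pickle('alzheimers.gpickle')  # hypothetical pickle produced earlier
subgraphs_to_pickles(network, annotation='Subgraph', directory='subgraphs')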
Example #9
def merge_directory(manager, directory, name, debug):
    """Parses all BEL files in a directory and outputs it"""
    set_debug_param(debug)

    name = name or '{}-merged.gpickle'.format(directory)
    path = os.path.join(directory, name)
    if os.path.exists(path):
        click.echo('Path already exists. Quitting. [{}]'.format(path))
        return

    from . import from_directory
    from pybel import to_pickle

    enable_cool_mode()

    graph = from_directory(directory, connection=manager)

    to_pickle(graph, file=path)
Example #10
def get_graph_by_manager(
    module: Union[str, ModuleType, BELManagerMixin, Type[BELManagerMixin]],
    force: bool = False,
    to_bel_kwargs: Optional[Mapping[str, Any]] = None,
) -> BELGraph:
    """Get a graph for a manager."""
    if isinstance(module, str):  # get the cache or import that module
        _pickle_path = os.path.join(RESOURCES, f'{module}.bel.pickle')
        if os.path.exists(_pickle_path) and not force:
            logger.info(f'Getting {module} from pickle at {_pickle_path}')
            return from_pickle(_pickle_path)

        module_name = f'bio2bel_{module}'
        _module = importlib.import_module(module_name)
        manager = _module.Manager()
    elif isinstance(module, BELManagerMixin):
        manager = module
    elif isinstance(module, ModuleType):
        manager = module.Manager()
    elif isinstance(module, type):
        if not issubclass(module, BELManagerMixin):
            raise TypeError(f'{module} is not a subclass of BELManagerMixin')
        manager = module()
    else:
        raise TypeError(f'{module} has invalid type: {type(module)}')

    pickle_path = os.path.join(RESOURCES, f'{manager.module_name}.bel.pickle')
    if os.path.exists(pickle_path) and not force:
        logger.info(
            f'Getting {manager.module_name} from pickle at {pickle_path}')
        return from_pickle(pickle_path)

    if not manager.is_populated():
        logger.info(f'Populating manager for {manager.module_name}')
        manager.populate()

    graph = manager.to_bel(**(to_bel_kwargs or {}))
    logger.info(graph.summary_str())
    logger.info(str(count_namespaces(graph)))
    logger.info(str(count_functions(graph)))

    logger.info(f'Writing pickle for {pickle_path}')
    to_pickle(graph, pickle_path)
    return graph
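The module argument can be a Bio2BEL package name, an imported module, a manager class, or a manager instance. A sketch with the name and instance forms; the 'hgnc' package is an assumption about which Bio2BEL packages are installed:

# By name: imports bio2bel_hgnc, populates it if needed, and caches the resulting pickle.
graph = get_graph_by_manager('hgnc')

# By pre-built manager instance; force=True skips the cached pickle.
import bio2bel_hgnc
graph = get_graph_by_manager(bio2bel_hgnc.Manager(), force=True)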
Example #11
def subgraphs_to_pickles(graph, directory=None, annotation='Subgraph'):
    """Groups the given graph into subgraphs by the given annotation with :func:`get_subgraph_by_annotation` and
    outputs them as gpickle files to the given directory with :func:`pybel.to_pickle`

    :param graph: A BEL Graph
    :type graph: pybel.BELGraph
    :param directory: A directory to output the pickles
    :type directory: str
    :param annotation: An annotation to split by. Suggestion: ``Subgraph``
    :type annotation: str
    """
    directory = os.getcwd() if directory is None else directory
    for value in get_annotation_values(graph, annotation=annotation):
        sg = get_subgraph_by_annotation_value(graph, annotation, value)
        sg.document.update(graph.document)

        file_name = '{}_{}.gpickle'.format(annotation, value.replace(' ', '_'))
        path = os.path.join(directory, file_name)
        to_pickle(sg, path)
Example #12
    def save_model(self, path, output_format=None):
        """Save the :class:`pybel.BELGraph` using one of the outputs from
        :py:mod:`pybel`

        Parameters
        ----------
        path : str
            The path to output to
        output_format : Optional[str]
            Output format as ``cx``, ``pickle``, ``json`` or defaults to ``bel``
        """
        if output_format == 'pickle':
            pybel.to_pickle(self.model, path)
        else:
            with open(path, 'w') as fh:
                if output_format == 'json':
                    pybel.to_nodelink_file(self.model, fh)
                elif output_format == 'cx':
                    pybel.to_cx_file(self.model, fh)
                else:  # output_format is None or 'bel'
                    pybel.to_bel_script(self.model, fh)
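This method resembles INDRA's PybelAssembler.save_model; assuming that is the class it belongs to, usage might look like the following sketch:

from indra.statements import Agent, Phosphorylation
from indra.assemblers.pybel import PybelAssembler

stmts = [Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))]
pba = PybelAssembler(stmts)
pba.make_model()

pba.save_model('model.bel')                                 # default: BEL script
pba.save_model('model.bel.pickle', output_format='pickle')
pba.save_model('model.json', output_format='json')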
Example #13
    def save_model(self, path, output_format=None):
        """Save the :class:`pybel.BELGraph` using one of the outputs from
        :py:mod:`pybel`

        Parameters
        ----------
        path : str
            The path to output to
        output_format : Optional[str]
            Output format as ``cx``, ``pickle``, ``json`` or defaults to ``bel``
        """
        if output_format == 'pickle':
            pybel.to_pickle(self.model, path)
        else:
            with open(path, 'w') as fh:
                if output_format == 'json':
                    pybel.to_json_file(self.model, fh)
                elif output_format == 'cx':
                    pybel.to_cx_file(self.model, fh)
                else:  # output_format is None or 'bel'
                    pybel.to_bel(self.model, fh)
Example #14
def run_one(directory):
    """

    :param str directory:
    """
    if not os.path.isdir(directory):
        return

    if not os.path.exists(os.path.join(directory, 'config.yaml')):
        return  # Skip non-INDRA machine directories

    log.info('running in %s', directory)
    # Use config=None so it looks in the subdirectory
    run_with_search_helper(directory, config=None)

    default_config_fname = os.path.join(directory, 'config.yaml')
    config = get_config(default_config_fname)
    ndex_cred = get_ndex_cred(config)
    name = ndex_cred.get('name') if ndex_cred is not None else None

    model = load_model(directory)
    stmts = model.get_statements()

    if not stmts:
        log.warning('no statements')
        return

    # Output CX
    if name:
        cx_str = assemble_cx(stmts, name)
        with open(os.path.join(directory, 'output.cx'), 'w') as file:
            print(cx_str, file=file)

    # Output BEL gpickle
    bel_graph = pybel.from_indra_statements(stmts)
    pybel.to_pickle(bel_graph, os.path.join(directory, 'output.gpickle'))

    return True
Example #15
def kegg_to_pickles(resource_files,
                    resource_folder,
                    hgnc_manager,
                    chebi_manager,
                    flatten=None,
                    export_folder=None):
    """Export WikiPathways to Pickles.

    :param iter[str] resource_files: iterator with file names
    :param str resource_folder: path folder
    :param Optional[str] export_folder: export folder
    """
    if export_folder is None:
        export_folder = resource_folder

    for kgml_file in tqdm.tqdm(
            resource_files, desc=f'Exporting KEGG to BEL in {export_folder}'):
        _name = kgml_file[:-len('.xml')]
        _flatten = 'flatten' if flatten else 'unflatten'

        # Name of file created will be: "hsaXXX_unflatten.pickle" or "hsaXXX_flatten.pickle"
        pickle_path = os.path.join(
            export_folder if export_folder else KEGG_BEL,
            f'{_name}_{_flatten}.pickle',
        )

        # Skip non-KGML files and files that have already been exported
        if not kgml_file.endswith('.xml') or os.path.exists(pickle_path):
            continue

        bel_graph = kegg_to_bel(
            path=os.path.join(resource_folder, kgml_file),
            hgnc_manager=hgnc_manager,
            chebi_manager=chebi_manager,
            flatten=True if flatten else False,
        )

        to_pickle(bel_graph, pickle_path)
Example #16
def write_bel(connection, skip, directory, force):
    """Write all as BEL."""
    os.makedirs(directory, exist_ok=True)
    from .manager.bel_manager import BELManagerMixin
    import pybel
    for _, name, manager in _iterate_managers(connection, skip):
        if not isinstance(manager, BELManagerMixin):
            continue
        click.secho(name, fg='cyan', bold=True)
        path = os.path.join(directory, f'{name}.bel.pickle')
        if os.path.exists(path) and not force:
            click.echo('👍 already exported')
            continue

        if not manager.is_populated():
            click.echo('👎 unpopulated')
        else:
            graph = manager.to_bel()
            pybel.to_pickle(graph, path)
            pybel.to_nodelink_gz(
                graph, os.path.join(directory, f'{name}.bel.nodelink.json.gz'))
            pybel.to_bel_script_gz(graph,
                                   os.path.join(directory, f'{name}.bel.gz'))
Example #17
def convert_recursive(directory, connection=None, upload=False, pickle=False, store_parts=False,
                      enrich_citations=False):
    """Recursively parses and either uploads/pickles graphs in a given directory and sub-directories"""
    metadata_parser = build_metadata_parser(connection)
    paths = list(get_paths_recursive(directory))
    log.info('Paths to parse: %s', paths)

    for path in paths:
        try:
            graph = from_path(path, manager=metadata_parser.manager)
        except Exception:
            log.exception('Problem parsing %s', path)
            continue

        if enrich_citations:
            fix_pubmed_citations(graph)

        if upload:
            safe_upload(metadata_parser.manager, graph, store_parts=store_parts)

        if pickle:
            new_path = '{}.gpickle'.format(path[:-4])  # [:-4] gets rid of .bel at the end of the file name
            to_pickle(graph, new_path)
Example #18
def wikipathways_to_pickles(
    resource_files: Iterable[str],
    resource_folder: str,
    hgnc_manager: bio2bel_hgnc.Manager,
    export_folder: str,
) -> None:
    """Export WikiPathways to Pickles.

    :param resource_files: iterator with file names
    :param resource_folder: path folder
    :param hgnc_manager: HGNC manager
    :param export_folder: export folder
    """
    for rdf_file in tqdm.tqdm(
            resource_files,
            desc=f'Exporting WikiPathways to BEL in {export_folder}'):
        if rdf_file.endswith('.ttl'):
            pickle_name = rdf_file[:-len('.ttl')]
        else:
            pickle_name = rdf_file

        pickle_path = os.path.join(export_folder, f'{pickle_name}.pickle')

        # Skip if BEL file already exists
        # TODO: Remove pathway from blacklist
        if os.path.exists(pickle_path) or rdf_file in WIKIPATHWAYS_BLACKLIST:
            continue

        # Parse the pathway RDF file and log stats
        pathway_path = os.path.join(resource_folder, rdf_file)

        bel_graph = wikipathways_to_bel(pathway_path, hgnc_manager)

        debug_pathway_info(bel_graph, pathway_path)

        # Export BELGraph to pickle
        to_pickle(bel_graph, pickle_path)
Example #19
def main(connection):
    """Parse a network, load it to the database, then test how fast it drops."""
    manager = pybel.Manager(connection)

    if os.path.exists(PICKLE):
        print(f'opening from {PICKLE}')
        graph = pybel.from_pickle(PICKLE)
    else:
        with time_me(f'opening from {SMALL_CORPUS_URL}'):

            graph = pybel.from_url(SMALL_CORPUS_URL, manager=manager, use_tqdm=True, citation_clearing=False)

        pybel.to_pickle(graph, PICKLE)

    n = 1
    # FIXME this fails if you do it with the same manager

    times = [
        get_numbers(graph, manager)
        for _ in range(n)
    ]

    print(times)
    print(sum(times) / n)
Example #20
import sys
import pickle
import pybel
from pybel.struct.filters import has_protein_modification
from indra.sources import bel
from indra.sources.bel.processor import get_agent
from .util import get_mod_sites

if __name__ == '__main__':
    # Parse the BEL script, takes a few minutes
    if sys.argv[1] == 'parse_belscript':
        input_file = sys.argv[2]
        output_file = sys.argv[3]
        pbg = pybel.from_path(input_file)
        pybel.to_pickle(pbg, output_file)
    # Get all variant sites from the graph
    #elif sys.argv[1] == 'get_pybel_mod_agents':
    #    pbg = pybel.from_pickle('output/large_corpus_pybel.pkl')
    #    mod_nodes = [get_agent(n) for n in pbg.nodes()
    #                 if has_protein_modification(n)]
    #    with open('output/bel_mod_agents.pkl', 'wb') as f:
    #        pickle.dump(mod_nodes, f)
    elif sys.argv[1] == 'get_pybel_stmts_by_site':
        input_file = sys.argv[2]
        output_file = sys.argv[3]
        pbg = pybel.from_pickle(input_file)
        pbp = bel.process_pybel_graph(pbg)
        sites = get_mod_sites(pbp.statements)
        with open(output_file, 'wb') as f:
            pickle.dump(sites, f)
    else:
        # Unrecognized sub-command
        print('Unrecognized command: %s' % sys.argv[1])
        sys.exit(1)
Example #21
 def test_thorough_pickle(self):
     bio = BytesIO()
     to_pickle(self.thorough_graph, bio)
     bio.seek(0)
     graph = from_pickle(bio)
     self.bel_thorough_reconstituted(graph)
Example #22
 def test_example_pickle(self):
     bio = BytesIO()
     to_pickle(sialic_acid_graph, bio)
     bio.seek(0)
     graph = from_pickle(bio)
     self.help_test_equal(graph)
Example #23
def get_graph(
    force: bool = False,
    force_global: bool = False,
    names: Optional[NamesList] = None,
    resources_directory: Optional[str] = None,
) -> BELGraph:
    """Get all resources in a combine BELGraph.

    :param force: Should cached files be overwritten?
    :param force_global: Should the global cache file be overwritten?
    :param names: The name of the bio2bel packages to use and arguments
    :param resources_directory: A non-default place to store the resources
    """
    pickle_path = os.path.join(resources_directory or RESOURCES, CACHE_NAME)
    if not force_global and os.path.exists(pickle_path):
        logger.info(f'Getting cached full graph')
        return from_pickle(pickle_path)

    if names is None:
        names = DEFAULT_NAMES

    logger.info('Generating graphs')
    graphs = []
    for name, to_bel_kwargs in names:
        _graph = get_graph_by_manager(name,
                                      force=force,
                                      to_bel_kwargs=to_bel_kwargs)
        logger.info(_graph.summary_str())
        graphs.append(_graph)

    logger.info('Merging graphs')
    graph = pybel.union(graphs)
    graph.name = f'Graph from: {", ".join(graph.name for graph in graphs)}'
    graph.version = '0.0.1'
    logger.info('Finished merging graphs')

    logger.info('Preparing HGNC mappings')
    hgnc_manager = bio2bel_hgnc.Manager()
    hgnc_symbol_to_id = hgnc_manager.build_hgnc_symbol_id_mapping()
    entrez_id_to_hgnc_symbol = hgnc_manager.build_entrez_id_to_hgnc_symbol_mapping(
    )

    logger.info('Generating namespace mapping for nodes')
    mapping = {}
    for node in graph:
        namespace = node.get('namespace')
        if namespace is None:
            continue
        elif namespace.lower() in {
                'ncbigene', 'egid'
        } and node.identifier in entrez_id_to_hgnc_symbol:
            name = entrez_id_to_hgnc_symbol[node.identifier]
            identifier = hgnc_symbol_to_id[name]
            mapping[node] = node.__class__(
                namespace='hgnc',
                name=name,
                identifier=identifier,
            )

    logger.info('Relabeling nodes')
    nx.relabel_nodes(graph, mapping, copy=False)

    logger.info('Enriching central dogma')
    enrich_protein_and_rna_origins(graph)

    logger.info('Exporting snp2k pickle')
    to_pickle(graph, pickle_path)
    return graph
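A sketch of calling get_graph with an explicit names list; per the loop above, each entry is a (package name, to_bel kwargs) pair, and the package names here are assumptions about what is installed:

names = [
    ('hgnc', {}),
    ('drugbank', {}),
]
graph = get_graph(names=names, force=False)
print(graph.summary_str())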
Example #24
def get_similarity_graph(
    *,
    fullgraph=DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE,
    rebuild: bool = False,
    mapping_file=DEFAULT_CHEMICALS_MAPPING_PATH,
    chemsim_graph_path=DEFAULT_CHEMSIM_PICKLE,
    clustered: bool = True,
    similarity=0.7,
    name='Chemical Similarity Graph',
    version='1.1.0',
    authors='',
    contact='',
    description='',
):
    """
    Create a BELGraph with chemicals as nodes, and similarity as edges.

    :param similarity: the percent in which the chemicals are similar
    :param mapping_file: an existing dataframe with pubchemIDs and Smiles
    """
    if not rebuild and os.path.exists(DEFAULT_CHEMSIM_PICKLE):
        return nx.read_edgelist(DEFAULT_CHEMSIM_PICKLE)
    if type(fullgraph) == pybel.struct.graph.BELGraph:
        fullgraph_without_chemsim = fullgraph
    else:
        fullgraph_without_chemsim = pybel.from_pickle(fullgraph)
    pubchem_ids = []
    for node in fullgraph_without_chemsim.nodes():
        if node.namespace != 'pubchem.compound':
            continue
        pubchem_ids.append(node.identifier)

    if os.path.exists(mapping_file):
        chemicals_mapping = pd.read_csv(
            mapping_file,
            sep="\t",
            dtype={
                'PubchemID': str,
                'Smiles': str
            },
            index_col=False,
        )
        pubchem_id_to_smiles = {}
        new_chemicals = []
        smiles = []
        for pubchem_id in tqdm(pubchem_ids, desc="Getting SMILES"):
            if chemicals_mapping.loc[chemicals_mapping["PubchemID"] ==
                                     pubchem_id].empty:
                chemical_smiles = cid_to_smiles(pubchem_id)
                if not isinstance(chemical_smiles, str):
                    chemical_smiles = chemical_smiles.decode("utf-8")
                pubchem_id_to_smiles[pubchem_id] = chemical_smiles
                new_chemicals.append(pubchem_id)
                smiles.append(chemical_smiles)
            else:
                pubchem_id_to_smiles[pubchem_id] = chemicals_mapping.loc[
                    chemicals_mapping["PubchemID"] == pubchem_id,
                    "Smiles"].iloc[0]
        new_df = pd.DataFrame({"PubchemID": new_chemicals, "Smiles": smiles})
        chemicals_mapping = chemicals_mapping.append(new_df)
        chemicals_mapping.to_csv(mapping_file, sep='\t', index=False)
    else:
        pubchem_id_to_smiles = get_smiles(pubchem_ids)

    pubchem_id_to_fingerprint = get_fingerprints(pubchem_id_to_smiles)

    chemsim_graph = pybel.BELGraph(name, version, description, authors,
                                   contact)

    if clustered:
        clustered_df = cluster_chemicals(
            rebuild=True, chemicals_dict=pubchem_id_to_fingerprint)
        clusters = clustered_df['Cluster'].unique().tolist()
        for cluster in tqdm(clusters, desc='Creating similarity BELGraph'):
            chemicals = clustered_df.loc[clustered_df.Cluster == cluster]
            if len(chemicals) == 1:
                continue
            for ind, row in chemicals.iterrows():
                for ind1, row1 in chemicals.iterrows():
                    if row['PubchemID'] == row1['PubchemID']:
                        continue
                    chemical_01 = pybel.dsl.Abundance(
                        namespace='pubchem.compound',
                        identifier=row['PubchemID'])
                    chemical_02 = pybel.dsl.Abundance(
                        namespace='pubchem.compound',
                        identifier=row1['PubchemID'])
                    if chemsim_graph.has_edge(
                            chemical_01,
                            chemical_02) or chemsim_graph.has_edge(
                                chemical_02, chemical_01):
                        continue
                    chemsim_graph.add_unqualified_edge(chemical_01,
                                                       chemical_02,
                                                       'association')
    else:
        similarities = get_similarity(pubchem_id_to_fingerprint)
        for (source_pubchem_id, target_pubchem_id), sim in tqdm(
                similarities.items(), desc='Creating similarity BELGraph'):
            if sim < similarity:
                continue
            chemsim_graph.add_unqualified_edge(
                pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE,
                                    identifier=source_pubchem_id),
                pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE,
                                    identifier=target_pubchem_id),
                'association',
            )
    pybel.to_pickle(chemsim_graph, chemsim_graph_path)
    return chemsim_graph
Example #25
def get_combined_sider_drugbank(
    *,
    rebuild: bool = False,
    drugbank_graph_path=None,
    sider_graph_path=None,
    chemical_mapping=DEFAULT_CHEMICALS_MAPPING_PATH,
):
    """
    Combine the SIDER and DrugBank graphs.

    :param drugbank_graph_path: the path to drugbank graph
    :param sider_graph_path: the path to sider graph
    :return: BELGraph
    """
    if not rebuild and os.path.exists(
            DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE):
        return pybel.from_pickle(DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE)
    if type(sider_graph_path) == pybel.struct.graph.BELGraph:
        sider_graph = sider_graph_path
    elif sider_graph_path is not None and os.path.exists(sider_graph_path):
        sider_graph = pybel.from_pickle(sider_graph_path)
    else:
        sider_graph = get_sider_graph()
    if type(drugbank_graph_path) == pybel.struct.graph.BELGraph:
        drugbank_graph = drugbank_graph_path
    elif drugbank_graph_path is not None and os.path.exists(
            drugbank_graph_path):
        drugbank_graph = pybel.from_pickle(drugbank_graph_path)
    else:
        drugbank_graph = get_drugbank_graph()
    smiles_dict = {}
    if chemical_mapping is not None:
        mapping_df = pd.read_csv(
            chemical_mapping,
            sep="\t",
            dtype={
                'PubchemID': str,
                'Smiles': str
            },
            index_col=False,
        )
    for node in tqdm(sider_graph.nodes()):
        if node.namespace != 'pubchem.compound':
            continue
        if node.identifier in mapping_df.values:
            smiles = mapping_df.loc[mapping_df['PubchemID'] == node.identifier,
                                    'Smiles'].iloc[0]
        else:
            smiles = cid_to_smiles(node.identifier)
            if not isinstance(smiles, str):
                smiles = smiles.decode("utf-8")
        smiles_dict[node] = smiles
    for node in tqdm(drugbank_graph.nodes()):
        if node.namespace != 'pubchem.compound':
            continue
        if node in smiles_dict.keys():
            continue
        if node.identifier in mapping_df.values:
            smiles = mapping_df.loc[mapping_df['PubchemID'] == node.identifier,
                                    'Smiles'].iloc[0]
        else:
            smiles = cid_to_smiles(node.identifier)
            if not isinstance(smiles, str):
                smiles = smiles.decode("utf-8")
        smiles_dict[node] = smiles
    sider_relabeled = nx.relabel_nodes(sider_graph, smiles_dict)
    drugbank_relabeled = nx.relabel_nodes(drugbank_graph, smiles_dict)
    full_graph = sider_relabeled + drugbank_relabeled
    smiles_dict_rev = {v: k for k, v in smiles_dict.items()}
    full_graph_relabel = nx.relabel_nodes(full_graph, smiles_dict_rev)
    if os.path.exists(RESOURCES):
        pybel.to_pickle(full_graph_relabel,
                        DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE)
    return full_graph_relabel
Example #26
def convert_paths(paths,
                  connection=None,
                  upload=False,
                  pickle=False,
                  canonicalize=True,
                  infer_central_dogma=True,
                  enrich_citations=False,
                  send=False,
                  version_in_path=False,
                  **kwargs):
    """Recursively parses and either uploads/pickles graphs in a given set of files

    :param iter[str] paths: The paths to convert
    :param connection: The connection
    :type connection: None or str or pybel.manager.Manager
    :param bool upload: Should the networks be uploaded to the cache?
    :param bool pickle: Should the networks be saved as pickles?
    :param bool canonicalize: Calculate canonical nodes?
    :param bool infer_central_dogma: Should the central dogma be inferred for all proteins, RNAs, and miRNAs
    :param bool enrich_citations: Should the citations be enriched using Entrez Utils?
    :param bool send: Send to PyBEL Web?
    :param bool version_in_path: Add the current pybel version to the pathname
    :param kwargs: Parameters to pass to :func:`pybel.from_path`
    """
    manager = Manager.ensure(connection)

    failures = []

    for path in paths:
        log.info('parsing: %s', path)

        try:
            graph = from_path(path, manager=manager, **kwargs)
        except Exception as e:
            log.exception('problem parsing %s', path)
            failures.append((path, e))
            continue

        if canonicalize:
            add_canonical_names(graph)

        if infer_central_dogma:
            infer_central_dogma_mutator(graph)

        if enrich_citations:
            enrich_pubmed_citations(graph=graph, manager=manager)

        if upload:
            to_database(graph, connection=manager, store_parts=True)

        if pickle:
            name = path[:-len(
                '.bel')]  # gets rid of .bel at the end of the file name

            if version_in_path:
                new_path = '{}-{}.gpickle'.format(name, get_pybel_version())
            else:
                new_path = '{}.gpickle'.format(name)

            to_pickle(graph, new_path)

            log.info('output pickle: %s', new_path)

        if send:
            response = to_web(graph)
            log.info('sent to PyBEL Web with response: %s', response.json())

    return failures
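Typical use, with hypothetical file names and connection string; failures come back as (path, exception) pairs:

paths = ['small_corpus.bel', 'large_corpus.bel']  # hypothetical BEL scripts
failures = convert_paths(
    paths,
    connection='sqlite:////tmp/pybel.db',  # hypothetical connection string
    upload=True,
    pickle=True,
)
for path, error in failures:
    print('failed to parse', path, error)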
Example #27
    def get_graph(
        self,
        directory: Optional[str] = None,
        use_cached: bool = True,
        use_tqdm: bool = True,
    ) -> BELGraph:
        """Get the graph from all sources."""
        if directory is None:
            if self.directory is None:
                raise ValueError
            directory = self.directory

        pickle_path = os.path.join(directory, f'{self.name}.bel.pickle')
        if use_cached and os.path.exists(pickle_path):
            return pybel.from_pickle(pickle_path)

        rv = union(self.get_graphs(use_tqdm=use_tqdm))
        self.metadata.update(rv)

        pybel.to_pickle(rv, pickle_path)

        nodelink_path = os.path.join(directory,
                                     f'{self.name}.bel.nodelink.json')
        pybel.to_json_path(rv, nodelink_path)

        sif_path = os.path.join(directory, f'{self.name}.bel.sif')
        pybel.to_sif_path(rv, sif_path)

        gsea_path = os.path.join(directory, f'{self.name}.bel.gmt')
        pybel.to_gsea_path(rv, gsea_path)

        graphml_path = os.path.join(directory, f'{self.name}.bel.graphml')
        pybel.to_graphml(rv, graphml_path)

        try:
            statements = pybel.to_indra_statements(rv)
        except ImportError:
            pass
        else:
            indra_path = os.path.join(directory, f'{self.name}.indra.pickle')
            with open(indra_path, 'wb') as file:
                pickle.dump(statements, file)

        try:
            from pybel_cx import to_cx_file
        except ImportError:
            pass
        else:
            cx_path = os.path.join(directory, f'{self.name}.bel.cx.json')
            with open(cx_path, 'w') as file:
                to_cx_file(rv, file)

        try:
            from pybel_tools.assembler.html import to_html
        except ImportError:
            pass
        else:
            html_path = os.path.join(directory, 'index.html')
            with open(html_path, 'w') as file:
                print(to_html(rv), file=file)

        return rv
Example #28
def generate_universe(kegg_path=KEGG_FILES,
                      reactome_path=REACTOME_FILES,
                      wikipathways_path=WIKIPATHWAYS_FILES,
                      output=UNIVERSE_DIR,
                      no_flatten=False,
                      no_normalize_names=False,
                      specie='Homo_sapiens'):
    """Export harmonized PathMe universe."""
    flatten = not no_flatten
    normalize_names = not no_normalize_names

    # Species name normalization.
    specie = specie.replace(' ', '_').capitalize()
    specie_altern_name = get_common_or_name_specie_id(specie).replace(
        ' ', '_').capitalize()

    if not flatten:
        click.secho(
            'Complexes and reactions will not be flattened to single nodes')

    if not normalize_names:
        click.secho('Names will not be normalized to lower case')

    # KEGG specie processing
    kegg_species_dir_list = get_dir_list(kegg_path, True)
    kegg_path = os.path.join(kegg_path, specie)

    if specie not in kegg_species_dir_list and specie_altern_name not in kegg_species_dir_list:
        kegg_ids = get_all_pathways_organism(get_pathway_kegg_url(specie))
        click.secho(
            'You are about to download KGML files from KEGG.\n'
            'Please make sure you have read KEGG license (see: https://www.kegg.jp/kegg/rest/).'
            ' These files cannot be distributed and their use must be exclusively with academic purposes.\n'
            'We (PathMe developers) are not responsible for the end use of this data.\n',
        )
        os.makedirs(kegg_path)
        download_kgml_files(kegg_ids, path=kegg_path)

    # Reactome specie processing
    specie_file = f'{specie}.owl'
    specie_alt_file = f'{specie_altern_name}.owl'

    reactome_species_file_list = get_or_create_dir(reactome_path)

    if specie_file in reactome_species_file_list:
        reactome_path = os.path.join(reactome_path, specie_file)
    elif specie_alt_file in reactome_species_file_list:
        reactome_path = os.path.join(reactome_path, specie_alt_file)
    else:
        click.secho('Species not found in the populated Reactome resources.')

    # WikiPathways specie processing
    wikipath_species_dir_list = get_dir_list(wikipathways_path, True)

    if specie in wikipath_species_dir_list:
        wikipathways_path = os.path.join(wikipathways_path, specie)
    elif specie_altern_name in wikipath_species_dir_list:
        wikipathways_path = os.path.join(wikipathways_path, specie_altern_name)
    else:
        click.secho(
            'Species not found in the populated WikiPathways resources.')

    click.secho(
        "Merging graphs to universe and harmonizing...(this might take a while)"
    )

    # The no_* flags from the CLI were inverted into flatten/normalize_names above
    universe_graph = get_universe_graph(
        kegg_path=kegg_path,
        reactome_path=reactome_path,
        wikipathways_path=wikipathways_path,
        flatten=flatten,
        normalize_names=normalize_names,
    )
    click.secho(
        f'Number of isolates after getting universe: {nx.number_of_isolates(universe_graph)}'
    )

    # Remove isolated list abundances
    remove_isolated_list_abundances(universe_graph)

    if flatten:
        # TODO: Remove list nodes coming only from Reactome
        click.secho(
            f'Number of isolates after flattening: {nx.number_of_isolates(universe_graph)}'
        )

    click.secho("Merging variants and genes")
    collapse_all_variants(universe_graph)
    collapse_to_genes(universe_graph)
    click.secho(
        f'Number of isolates after collapsing variants and to genes: {nx.number_of_isolates(universe_graph)}'
    )

    universe_graph.name = 'PathMe Universe'

    file_name = os.path.join(output,
                             '_'.join([specie, 'pathme_universe.pickle']))
    click.secho(f"Export BEL graph to: {file_name}")
    click.secho(universe_graph.summary_str())
    click.secho(count_functions(universe_graph))

    to_pickle(universe_graph, file_name)