def plot_embedding(
    graph: EnsmallenGraph,
    tsne_embedding: np.ndarray,
    k: int = 10,
    axes: Axes = None
):
    """Scatter-plot an embedding, colouring every point by its node type.

    Parameters
    --------------------------
    graph: EnsmallenGraph,
        Graph providing the node types of the embedded nodes.
    tsne_embedding: np.ndarray,
        Embedding to plot. Its transpose is unpacked into the scatter
        coordinates (presumably one 2D row per node -- confirm with callers).
    k: int = 10,
        Value forwarded to ``graph.get_top_k_nodes_by_node_type``; only the
        nodes returned by that call are plotted.
    axes: Axes = None,
        Axes to draw on. When None, a new 5x5 inches figure is created.

    Returns
    --------------------------
    The axes the scatter plot and its legend were drawn on.
    """
    if axes is None:
        axes = plt.subplots(figsize=(5, 5))[1]

    if graph.node_types_mapping is not None:
        # Keep only the nodes selected by the graph and the matching rows
        # of the embedding.
        nodes, node_types = graph.get_top_k_nodes_by_node_type(k)
        tsne_embedding = tsne_embedding[nodes]
        reverse_mapping = np.array(graph.node_types_reverse_mapping)
        common_node_types_names = list(reverse_mapping[np.unique(node_types)])
    else:
        # Without node types every node falls in one placeholder class.
        node_types = np.zeros(graph.get_nodes_number(), dtype=np.uint8)
        common_node_types_names = ["No node type provided"]

    # One Tableau colour for each label shown in the legend.
    palette = list(TABLEAU_COLORS.keys())[:len(common_node_types_names)]
    points = axes.scatter(
        *tsne_embedding.T,
        s=0.25,
        c=node_types,
        cmap=ListedColormap(palette)
    )
    legend_handles = points.legend_elements()[0]
    axes.legend(handles=legend_handles, labels=common_node_types_names)
    return axes
Esempio n. 2
0
def test_load_ppi() -> None:
    """Test that PPI can be loaded.

    The test succeeds when ``EnsmallenGraph.from_unsorted_csv`` completes
    without raising. The loaded graph is discarded, hence the function
    returns nothing.
    """
    # NOTE(review): the edge list is resolved relative to ROOT_DIR while the
    # node list is relative to the working directory -- confirm that both
    # point into the same data folder.
    EnsmallenGraph.from_unsorted_csv(
        edge_path=os.path.join(ROOT_DIR, "data/ppi/edges.tsv"),
        sources_column="subject",
        destinations_column="object",
        directed=False,
        weights_column="weight",
        node_path="./pytests/data/ppi/nodes.tsv",
        nodes_column="id",
        node_types_column="molecular_function",
        default_node_type="Missing"
    )
def test_no_existent_column():
    """Check that invalid column settings make the loader raise ValueError.

    Four configurations are attempted in sequence, each one differing from
    a valid baseline only by the overrides listed below, and each one is
    expected to raise ``ValueError``.
    """
    valid_settings = dict(
        edge_path="./pytests/data/edges.tsv",
        sources_column="subject",
        destinations_column="object",
        directed=False,
        edge_types_column="edge_label",
        node_path="./pytests/data/nodes.tsv",
        nodes_column="id",
        node_types_column="category",
        default_edge_type='biolink:interacts_with',
        default_node_type='biolink:NamedThing'
    )
    # NOTE(review): the second and third cases also blank the destinations
    # column, so the ValueError may not come from the column they are meant
    # to exercise -- confirm this is intended.
    broken_overrides = (
        dict(destinations_column="NOT A REAL COLUMN"),
        dict(destinations_column="", edge_types_column="NOT A REAL COLUMN"),
        dict(destinations_column="", node_types_column="NOT A REAL COLUMN"),
        dict(nodes_column="NOT A REAL COLUMN"),
    )
    for overrides in broken_overrides:
        with pytest.raises(ValueError):
            EnsmallenGraph.from_unsorted_csv(**{**valid_settings, **overrides})
Esempio n. 4
0
def load_pathway() -> EnsmallenGraph:
    """Load the undirected Pathway graph from the test data folder.

    Returns
    --------------------------
    The loaded graph, after ``enable()`` has been called on it.
    """
    settings = dict(
        edge_path="./pytests/data/pathway.tsv",
        sources_column="Gene_A",
        destinations_column="Gene_B",
        directed=False,
        name="Pathway",
    )
    pathway = EnsmallenGraph.from_unsorted_csv(**settings)
    pathway.enable()
    return pathway
def plot_embedding_degrees_heatmap(
        graph: EnsmallenGraph,
        embedding: np.ndarray,
        axes: Axes = None,
        fig: Figure = None,
        max_degree=10
):
    """Scatter-plot an embedding with every point coloured by node degree.

    Parameters
    --------------------------
    graph: EnsmallenGraph,
        Graph whose ``degrees()`` provide the colour value of each point.
    embedding: np.ndarray,
        Embedding to plot. Its transpose is unpacked into the scatter
        coordinates (presumably one 2D row per node -- confirm with callers).
    axes: Axes = None,
        Axes to draw on. When None, a new 10x10 inches figure at 200 dpi is
        created and used for both the axes and the colorbar.
    fig: Figure = None,
        Figure receiving the colorbar. When None while ``axes`` is given,
        the figure owning ``axes`` is used.
    max_degree=10,
        Degrees above this value are clipped to it before colouring.
    """
    if axes is None:
        fig, axes = plt.subplots(figsize=(10, 10), dpi=200)
    elif fig is None:
        # A caller may pass only the axes: recover the owning figure instead
        # of failing on `None.colorbar`.
        fig = axes.figure
    # Clip without mutating the array returned by the graph; unlike the
    # masked assignment with `degrees.max()`, this also works on empty input.
    degrees = np.minimum(graph.degrees(), max_degree)
    # The colormap is passed by name: `plt.cm.get_cmap` was removed in
    # Matplotlib 3.9, while registered names are accepted by every version.
    sc = axes.scatter(*embedding.T, c=degrees, s=0.1, cmap="RdYlBu")
    fig.colorbar(sc, ax=axes)
Esempio n. 6
0
def load_hpo() -> EnsmallenGraph:
    """Load the undirected HPO graph, with node and edge types, from test data.

    Returns
    --------------------------
    The loaded graph, after ``enable()`` has been called on it.
    """
    settings = dict(
        edge_path="./pytests/data/edges.tsv",
        sources_column="subject",
        destinations_column="object",
        directed=False,
        edge_types_column="edge_label",
        node_path="./pytests/data/nodes.tsv",
        nodes_column="id",
        node_types_column="category",
        default_edge_type='biolink:interacts_with',
        default_node_type='biolink:NamedThing',
        name="HPO",
    )
    hpo = EnsmallenGraph.from_unsorted_csv(**settings)
    hpo.enable()
    return hpo
Esempio n. 7
0
def execute_walks_ensmallen(graph: EnsmallenGraph,
                            length: int,
                            iterations: int,
                            nodes_number: int,
                            max_degree: int,
                            p: float = 1.0,
                            q: float = 1.0,
                            **kwargs: Dict) -> np.ndarray:
    """Run the complete walks of the given graph through Ensmallen.

    Parameters
    --------------------------
    graph: EnsmallenGraph,
        The graph on which to run the walks.
    length: int,
        Length of the walks.
    iterations: int,
        Number of walks to start from each node.
    nodes_number: int,
        Number of nodes in the graph.
    max_degree: int,
        Maximum degree of the graph.
    p: float = 1.0,
        Its inverse is passed to the walker as ``return_weight``.
    q: float = 1.0,
        Its inverse is passed to the walker as ``explore_weight``.
    kwargs: Dict,
        Additional parameters to be used in other libraries but not this one.

    Returns
    --------------------------
    Whatever ``graph.complete_walks`` returns for the given parameters.
    """
    return_weight = 1 / p
    explore_weight = 1 / q
    # The neighbours cap is requested only when the graph has both a node
    # with more than 10_000 neighbours and more than 500_000 nodes.
    if max_degree > 10_000 and nodes_number > 500_000:
        max_neighbours = 10_100
    else:
        max_neighbours = None
    return graph.complete_walks(
        length=length,
        iterations=iterations,
        return_weight=return_weight,
        explore_weight=explore_weight,
        max_neighbours=max_neighbours,
    )
def load_graph(node_type_column: str, default_node_type: str = "type"):
    """Load the undirected ENIGMA graph using the given node type column.

    Parameters
    ------------------------
    node_type_column: str,
        Column of the node list from which the node types are read.
    default_node_type: str = "type",
        Value used when no node type is available for a given node.

    Returns
    ------------------------
    The graph built by ``EnsmallenGraph.from_csv``.
    """
    # Alternative edge list location:
    # /global/homes/m/marcinj/data/ENIGMA/masterG.edgelist_col12_head.tsv
    edge_list = "/global/scratch/marcin/N2V/MicrobeEnvironmentGraphLearn/ENIGMA_data/masterG.edgelist_col12_intindex_head.txt"
    # Alternative node list location:
    # /global/homes/m/marcinj/data/ENIGMA/masterG.edgelist_col12_nodes_meta_header.txt
    node_list = "/global/scratch/marcin/N2V/MicrobeEnvironmentGraphLearn/ENIGMA_data/masterG.edgelist_col12_nodes_meta_header.txt"
    # No weights column is passed (weights_column="weight" is disabled).
    settings = dict(
        edge_path=edge_list,
        sources_column="subject",
        destinations_column="object",
        directed=False,
        node_path=node_list,
        nodes_column="id",
        node_types_column=node_type_column,
        default_node_type=default_node_type,
    )
    return EnsmallenGraph.from_csv(**settings)
Esempio n. 9
0
    c for c in go_yaml['classifier']['classifiers']
    if c['type'] == 'neural network'
][0]['model']['outfile']
# Load the trained Keras model stored under output_data.
# NOTE(review): `mlp_file` and `go_yaml` are defined before this excerpt.
mlp = tf.keras.models.load_model(os.path.join("output_data", mlp_file))

# Read the node table and keep only its 'id' and 'name' columns.
node_data = pd.read_csv('input_data/go_nodes.tsv', sep='\t')
node_data = node_data.filter(['id', 'name'])

#
# positive validation edges
#
# NOTE(review): pos_graph_args is the very dict stored in go_yaml, so the two
# assignments below also change go_yaml['graph_data']['graph'] in place.
pos_graph_args = go_yaml['graph_data']['graph']
pos_graph_args['directed'] = True
pos_graph_args['edge_path'] = go_yaml['graph_data']['pos_validation'][
    'edge_path']
pos_validation_graph = EnsmallenGraph.from_unsorted_csv(**pos_graph_args)
# Pair every source name with the destination name at the same position.
pos_edges = list(
    zip(pos_validation_graph.get_source_names(),
        pos_validation_graph.get_destination_names()))

# Fit the edge transformer on the saved node embedding, then use it to
# transform the validation graph into the classifier input.
pos_edge_transform = GraphTransformer(go_yaml['classifier']['edge_method'])
pos_edge_transform.fit(
    np.load(
        os.path.join("output_data",
                     go_yaml['embeddings']['embedding_file_name'])))
pos_edges_to_eval_emb = pos_edge_transform.transform(pos_validation_graph)

# NOTE(review): batch_size=1048 is unusual; possibly 1024 was meant -- confirm.
pos_valid_predict = mlp.predict(pos_edges_to_eval_emb, batch_size=1048)
pos_valid_predict_sorted = pd.DataFrame({
    "pred": pos_valid_predict.flatten(),
    "subject": [t[0] for t in pos_edges],
Esempio n. 10
0
from time import time
from humanize import naturaldelta
from ensmallen_graph import EnsmallenGraph
import compress_json
import json

# Measure how long building the graph takes.
start = time()
graph = EnsmallenGraph(edge_path="../embiggen/pos_train_edges.tsv",
                       sources_column="subject",
                       destinations_column="object",
                       directed=True,
                       edge_types_column="edge_label",
                       node_path="../embiggen/pos_train_nodes.tsv",
                       nodes_column="id",
                       node_types_column="category")
completed_graph = time() - start
# Measure the walks with a separate timer.
start_walk = time()
graph.walk(iterations=10,
           length=80,
           min_length=0,
           return_weight=1,
           explore_weight=1,
           change_node_type_weight=1,
           change_edge_type_weight=1)
# `delta` is taken from `start`, so it spans graph construction plus walks,
# while `total_walk_time` is taken from `start_walk` and spans the walks only.
delta = time() - start
total_walk_time = time() - start_walk

response = {
    "required_time": delta,
    "human_time": naturaldelta(delta),
    "building_graph_required_time": completed_graph,
from tqdm.auto import tqdm
import numpy as np
from ensmallen_graph import EnsmallenGraph
from embiggen import GraphTransformer, EdgeTransformer

#try:
#    from tsnecuda import TSNE
#except ModuleNotFoundError:
from MulticoreTSNE import MulticoreTSNE as TSNE

# Location of the precomputed node embedding loaded below.
embedding_path = "./FOURTH/SkipGram_embedding.npy"

# Load the undirected graph from the edge list only (no node list is given).
graph = EnsmallenGraph.from_csv(
    #/global/homes/m/marcinj/data/ENIGMA
    edge_path=
    "/global/scratch/marcin/N2V/MicrobeEnvironmentGraphLearn/ENIGMA_data/masterG.edgelist_col12_head.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=False)

# Sample as many negatives as the graph has edges.
# NOTE(review): the positional arguments are presumably
# (seed, negatives_number, allow_selfloops) -- confirm against the library.
negative_graph = graph.sample_negatives(42, graph.get_edges_number(), False)

embedding = np.load(embedding_path)

# NOTE(review): `os` is not among the imports visible in this excerpt.
for method in tqdm(EdgeTransformer.methods, desc="Methods", leave=False):
    # NOTE(review): this f-string has no placeholder and does not depend on
    # `method`, so every iteration checks the same path; presumably the
    # method name was meant to be part of it -- confirm.
    tsne_path = f"tsne_edges_microbeenv"
    # Skip the method when its output already exists.
    if os.path.exists(tsne_path):
        continue
    # Fit the transformer for this method on the node embedding and
    # transform the positive graph with it.
    transformer = GraphTransformer(method)
    transformer.fit(embedding)
    positive_edges = transformer.transform(graph)
Esempio n. 12
0
def make_holdouts(nodes: str,
                  edges: str,
                  output_dir: str,
                  train_fraction: float,
                  validation: bool,
                  seed=42) -> None:
    """Split a graph into positive and negative train/test(/validation) edge
    files (see run.py holdouts command for documentation).

    The positive edges come from a random holdout of the loaded graph; the
    negative edges are sampled from the positive training graph and split
    with the same fraction. When ``validation`` is set, both test sets are
    further split in half to obtain the validation sets.

    Args:
        :param nodes:   nodes of input graph, in KGX TSV format [data/merged/nodes.tsv]
        :param edges:   edges for input graph, in KGX TSV format [data/merged/edges.tsv]
        :param output_dir:     directory to output edges and new graph [data/edges/]
        :param train_fraction: fraction of edges to emit as training
        :param validation:     should we make validation edges? [False]
        :param seed:    random seed [42]
    Returns:
        None.
    """
    logging.basicConfig(level=logging.INFO)
    logging.info("Loading graph from nodes %s and edges %s files",
                 nodes, edges)
    full_graph = EnsmallenGraph.from_unsorted_csv(
        edge_path=edges,
        sources_column='subject',
        destinations_column='object',
        directed=False,
        edge_types_column='edge_label',
        default_edge_type='biolink:Association',
        node_path=nodes,
        nodes_column='id',
        default_node_type='biolink:NamedThing',
        node_types_column='category')

    os.makedirs(output_dir, exist_ok=True)

    def output_file(file_name: str) -> str:
        # Location of an output file inside the output directory.
        return os.path.join(output_dir, file_name)

    # Positive edges: train/test split, optionally halving test for validation.
    logging.info("Making positive edges")
    pos_train_edges, pos_test_edges = full_graph.random_holdout(
        seed=seed, train_percentage=train_fraction)
    if validation:
        pos_valid_edges, pos_test_edges = pos_test_edges.random_holdout(
            seed=seed, train_percentage=0.5)

    # Negative edges: as many as the edges of the full graph, sampled from
    # the positive training graph, then split like the positive ones.
    logging.info("Making negative edges")
    sampled_negatives = pos_train_edges.sample_negatives(
        seed=seed,
        negatives_number=full_graph.get_edges_number(),
        allow_selfloops=False)
    neg_train_edges, neg_test_edges = sampled_negatives.random_holdout(
        seed=seed, train_percentage=train_fraction)
    if validation:
        neg_test_edges, neg_valid_edges = neg_test_edges.random_holdout(
            seed=seed, train_percentage=0.5)

    # Positive outputs: the training graph is the only one whose nodes are
    # written as well.
    logging.info("Writing out positive edges")
    pos_train_edges.dump_edges(path=output_file("pos_train_edges.tsv"))
    pos_train_edges.dump_nodes(path=output_file("pos_train_nodes.tsv"))
    pos_test_edges.dump_edges(path=output_file("pos_test_edges.tsv"))
    if validation:
        pos_valid_edges.dump_edges(path=output_file("pos_valid_edges.tsv"))

    # Negative outputs.
    logging.info("Writing out negative edges")
    neg_train_edges.dump_edges(path=output_file("neg_train_edges.tsv"))
    neg_test_edges.dump_edges(path=output_file("neg_test_edges.tsv"))
    if validation:
        neg_valid_edges.dump_edges(path=output_file("neg_valid_edges.tsv"))
Esempio n. 13
0
def sanitize_graph(graph_data: dict, root: str):
    """Convert a graph to a standard format and write its reports.

    Nothing is done when the directed and undirected sanitized edge lists
    and both reports already exist; otherwise the graph is loaded and only
    the missing files are written.

    Parameters
    ----------
    graph_data: dict,
        Informations of the graph to sanitize. The keys read here are
        "loading_settings" (keyword arguments for the loader), "folder_name",
        "edge_file" and "graph" (used as the graph name). The dictionary and
        its "loading_settings" are left unmodified.
    root: str,
        The working folder. All the files will be read and written from here.
    """
    folder = os.path.join(root, graph_data["folder_name"])

    # Work on a copy: the loader settings are completed below and the
    # caller's dictionary must not be altered as a side effect.
    kwargs = dict(graph_data["loading_settings"])
    kwargs["edge_path"] = os.path.join(folder, graph_data["edge_file"])
    kwargs.setdefault("directed", False)

    directed_dst_path = os.path.join(folder, "directed_sanitized.tsv")
    undirected_dst_path = os.path.join(folder, "undirected_sanitized.tsv")
    report_path = os.path.join(folder, "report.json")
    textual_report_path = os.path.join(folder, "report.txt")

    # Everything has already been produced: avoid loading the graph.
    if all(
            os.path.exists(p) for p in (directed_dst_path, undirected_dst_path,
                                        report_path, textual_report_path)):
        return

    logger.info("Loading the file %s", kwargs["edge_path"])
    graph: EnsmallenGraph = EnsmallenGraph.from_unsorted_csv(
        **kwargs, name=graph_data["graph"])
    logger.info("Enabling fast version")
    graph.enable_fast_walk()
    logger.info("Computing metadata")
    if not os.path.exists(report_path):
        logger.info("Computing JSON report")
        report = graph.report()
        compress_json.dump(report, report_path)
    if not os.path.exists(textual_report_path):
        logger.info("Computing textual report")
        textual_report = str(graph)
        with open(textual_report_path, "w") as f:
            f.write(textual_report)

    edge_dumps = (
        # We dump with directed=True for the undirected file to have in the
        # file the bidirectional edges.
        (undirected_dst_path, True),
        # We dump with directed=False for the directed file to have no
        # doubled bidirectional edge in the write out.
        (directed_dst_path, False),
    )
    for dst_path, dump_directed in edge_dumps:
        if os.path.exists(dst_path):
            continue
        logger.info("Writing the file %s", dst_path)
        graph.dump_edges(
            path=dst_path,
            header=False,
            sources_column_number=0,
            destinations_column_number=1,
            weights_column_number=2,
            numeric_node_ids=True,
            directed=dump_directed)
# Download the HPO ontology when it is not already available.
# NOTE(review): the check uses `hpo_json_file` while wget always writes to
# "hpo.json"; presumably both name the same file -- confirm.
if not os.path.exists(hpo_json_file):
    os.system("wget http://purl.obolibrary.org/obo/hp.json -O hpo.json")

# Run the kgx transform on hpo.json when either the edge or node file is missing.
if not os.path.exists(hpo_edges_file) or not os.path.exists(hpo_nodes_file):
    os.system("kgx transform --input-format obojson --output-format tsv --output hpo hpo.json")

# Edge type used for the holdout; a directory with the same name is created.
edge = 'biolink:subclass_of'
edges_string = edge
os.makedirs(edges_string, exist_ok=True)

graph = EnsmallenGraph.from_unsorted_csv(
    edge_path=hpo_edges_file,
    sources_column="subject",
    destinations_column="object",
    edge_types_column='edge_label',
    directed=False,
    node_path=hpo_nodes_file,
    nodes_column='id',
    node_types_column='category',
    default_node_type='biolink:NamedThing'
)

# Drop the singleton nodes, then split the result into training and
# validation graphs, restricting the holdout to the chosen edge type.
# NOTE(review): `train_percentage` and `seed` are defined outside this excerpt.
reduced_graph = graph.remove(singletons=True)
pos_training, pos_validation = reduced_graph.connected_holdout(
    train_size=train_percentage,
    edge_types=[edge],
    random_state=seed)

# make negative graph
neg_training, neg_validation = reduced_graph.sample_negatives(
   random_state=seed,
from time import time
from humanize import naturaldelta
from ensmallen_graph import EnsmallenGraph  # pylint: disable=no-name-in-module
import compress_json
import json
import numpy as np

# Measure how long building the graph takes.
start = time()
graph = EnsmallenGraph(edge_path="../graph/cooccurrence/edges.tsv",
                       sources_column="subject",
                       destinations_column="object",
                       directed=False,
                       validate_input_data=True)
completed_graph = time() - start
# Measure the walks with a separate timer.
start_walk = time()

walks = graph.walk(iterations=1,
                   length=80,
                   return_weight=1,
                   explore_weight=1,
                   change_node_type_weight=1,
                   change_edge_type_weight=1)
# `delta` is taken from `start`, so it spans graph construction plus walks,
# while `total_walk_time` is taken from `start_walk` and spans the walks only.
delta = time() - start
total_walk_time = time() - start_walk

# Summary statistics over the lengths of the returned walks.
mean_walks_length = np.mean([len(walk) for walk in walks])

median_walks_length = np.median([len(walk) for walk in walks])

# Degree of every node, queried one node ID at a time.
degrees = [graph.degree(node) for node in range(graph.get_nodes_number())]
Esempio n. 16
0
def load_graph_ensmallen(edge_path: str,
                         nodes_number: int,
                         edges_number: int,
                         has_weights: bool,
                         fast: bool = False,
                         cache_size: float = None,
                         **kwargs: Dict) -> EnsmallenGraph:
    """Load a sorted, numeric, header-less edge list as an EnsmallenGraph.

    Parameters
    -----------------------
    edge_path: str,
        Path from where to load the edgelist.
        File is expected to be in directed fashion and sorted.
        The node IDs will be extracted from the numeric node IDs of the graph.
        The file is expected to be without header and the first column
        is expected to be the sources, while the second is expected to be
        the destinations. The third column, optionally, is expected to
        contain the weights if they are present in the considered graph.
        The path actually read is the one returned by ``build_directed_path``.
    nodes_number: int,
        Upper bound of nodes number present in the graph.
        The closer the number is to the actual number of nodes of the graph
        the better the compression performance are going to be.
    edges_number: int,
        Upper bound of edges number present in the graph.
        The closer the number is to the actual number of edges of the graph
        the better the compression performance are going to be.
        Above one million edges the edge list is loaded as a directed one.
    has_weights: bool,
        Whether the graph has weights and we should load them.
        The weights, if present, are expected to be in column 3.
    fast: bool = False,
        Whether to run the fast version that uses more memory.
    cache_size: float = None,
        When not None, ``enable_fast_walk`` is additionally called with the
        destinations and outbounds vectors disabled and this cache size.
    **kwargs: Dict,
        Additional parameters that are used in other libraries but not this one.

    Returns
    -------------------------
    The loaded graph.
    """
    use_directed_edge_list = edges_number > 1_000_000

    loader_settings = dict(
        directed=False,
        directed_edge_list=use_directed_edge_list,
        nodes_number=nodes_number,
        edges_number=edges_number,
        sources_column_number=0,
        destinations_column_number=1,
    )
    if has_weights:
        # The weights column is requested only for weighted graphs.
        loader_settings["weights_column_number"] = 2
    loader_settings.update(
        numeric_node_ids=True,
        verbose=False,
        edge_header=False,
    )

    source_path = build_directed_path(
        edge_path, directed=not use_directed_edge_list)
    graph: EnsmallenGraph = EnsmallenGraph.from_sorted_csv(
        source_path, **loader_settings)

    if fast:
        graph.enable_fast_walk()

    if cache_size is not None:
        graph.enable_fast_walk(vector_destinations=False,
                               vector_outbounds=False,
                               cache_size=cache_size)

    return graph