def plot_embedding(
    graph: EnsmallenGraph,
    tsne_embedding: np.ndarray,
    k: int = 10,
    axes: Axes = None
):
    """Scatter the 2D t-SNE embedding, colouring nodes by node type."""
    if axes is None:
        _, axes = plt.subplots(figsize=(5, 5))
    if graph.node_types_mapping is None:
        # No node types available: plot everything under a single dummy type.
        node_types = np.zeros(graph.get_nodes_number(), dtype=np.uint8)
        common_node_types_names = ["No node type provided"]
    else:
        # Keep only the nodes belonging to the k most common node types.
        nodes, node_types = graph.get_top_k_nodes_by_node_type(k)
        tsne_embedding = tsne_embedding[nodes]
        common_node_types_names = list(
            np.array(graph.node_types_reverse_mapping)[np.unique(node_types)]
        )
    colors = list(TABLEAU_COLORS.keys())[:len(common_node_types_names)]
    scatter = axes.scatter(
        *tsne_embedding.T,
        s=0.25,
        c=node_types,
        cmap=ListedColormap(colors)
    )
    axes.legend(
        handles=scatter.legend_elements()[0],
        labels=common_node_types_names
    )
    return axes
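# Usage sketch for plot_embedding: project a precomputed node embedding to 2D
# with t-SNE and colour nodes by node type. The embedding file path is
# hypothetical, and load_hpo is the loader defined further below.
import numpy as np
import matplotlib.pyplot as plt
from MulticoreTSNE import MulticoreTSNE as TSNE

graph = load_hpo()
embedding = np.load("SkipGram_embedding.npy")  # hypothetical path
tsne_embedding = TSNE(n_components=2).fit_transform(embedding)
axes = plot_embedding(graph, tsne_embedding, k=10)
axes.get_figure().savefig("tsne_node_types.png")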
def test_load_ppi() -> EnsmallenGraph:
    """Test that PPI can be loaded."""
    return EnsmallenGraph.from_unsorted_csv(
        edge_path=os.path.join(ROOT_DIR, "data/ppi/edges.tsv"),
        sources_column="subject",
        destinations_column="object",
        directed=False,
        weights_column="weight",
        node_path=os.path.join(ROOT_DIR, "data/ppi/nodes.tsv"),
        nodes_column="id",
        node_types_column="molecular_function",
        default_node_type="Missing"
    )
def test_no_existent_column():
    """Each loading call references a single non-existent column and must raise."""
    # Non-existent destinations column.
    with pytest.raises(ValueError):
        EnsmallenGraph.from_unsorted_csv(
            edge_path="./pytests/data/edges.tsv",
            sources_column="subject",
            destinations_column="NOT A REAL COLUMN",
            directed=False,
            edge_types_column="edge_label",
            node_path="./pytests/data/nodes.tsv",
            nodes_column="id",
            node_types_column="category",
            default_edge_type='biolink:interacts_with',
            default_node_type='biolink:NamedThing'
        )
    # Non-existent edge types column.
    with pytest.raises(ValueError):
        EnsmallenGraph.from_unsorted_csv(
            edge_path="./pytests/data/edges.tsv",
            sources_column="subject",
            destinations_column="object",
            directed=False,
            edge_types_column="NOT A REAL COLUMN",
            node_path="./pytests/data/nodes.tsv",
            nodes_column="id",
            node_types_column="category",
            default_edge_type='biolink:interacts_with',
            default_node_type='biolink:NamedThing'
        )
    # Non-existent node types column.
    with pytest.raises(ValueError):
        EnsmallenGraph.from_unsorted_csv(
            edge_path="./pytests/data/edges.tsv",
            sources_column="subject",
            destinations_column="object",
            directed=False,
            edge_types_column="edge_label",
            node_path="./pytests/data/nodes.tsv",
            nodes_column="id",
            node_types_column="NOT A REAL COLUMN",
            default_edge_type='biolink:interacts_with',
            default_node_type='biolink:NamedThing'
        )
    # Non-existent nodes column.
    with pytest.raises(ValueError):
        EnsmallenGraph.from_unsorted_csv(
            edge_path="./pytests/data/edges.tsv",
            sources_column="subject",
            destinations_column="object",
            directed=False,
            edge_types_column="edge_label",
            node_path="./pytests/data/nodes.tsv",
            nodes_column="NOT A REAL COLUMN",
            node_types_column="category",
            default_edge_type='biolink:interacts_with',
            default_node_type='biolink:NamedThing'
        )
def load_pathway() -> EnsmallenGraph:
    """Load the Pathway graph."""
    graph = EnsmallenGraph.from_unsorted_csv(
        edge_path="./pytests/data/pathway.tsv",
        sources_column="Gene_A",
        destinations_column="Gene_B",
        directed=False,
        name="Pathway")
    graph.enable()
    return graph
def plot_embedding_degrees_heatmap(
    graph: EnsmallenGraph,
    embedding: np.ndarray,
    axes: Axes = None,
    fig: Figure = None,
    max_degree: int = 10
):
    """Scatter the 2D embedding, colouring nodes by their (clipped) degree."""
    if axes is None:
        fig, axes = plt.subplots(figsize=(10, 10), dpi=200)
    if fig is None:
        # Derive the figure from the provided axes so the colorbar can attach.
        fig = axes.get_figure()
    cm = plt.cm.get_cmap('RdYlBu')
    degrees = graph.degrees()
    # Clip the degrees so that a few hubs do not flatten the colour scale.
    degrees[degrees > max_degree] = max_degree
    sc = axes.scatter(*embedding.T, c=degrees, s=0.1, cmap=cm)
    fig.colorbar(sc, ax=axes)
    return axes
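# Companion sketch: reuse graph and tsne_embedding from the plot_embedding
# sketch above to colour the same projection by clipped degree; max_degree
# here is an arbitrary illustrative cut-off.
fig, axes = plt.subplots(figsize=(10, 10), dpi=200)
plot_embedding_degrees_heatmap(
    graph, tsne_embedding, axes=axes, fig=fig, max_degree=50)
fig.savefig("tsne_degrees.png")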
def load_hpo() -> EnsmallenGraph:
    """Test that HPO graph can be loaded."""
    graph = EnsmallenGraph.from_unsorted_csv(
        edge_path="./pytests/data/edges.tsv",
        sources_column="subject",
        destinations_column="object",
        directed=False,
        edge_types_column="edge_label",
        node_path="./pytests/data/nodes.tsv",
        nodes_column="id",
        node_types_column="category",
        default_edge_type='biolink:interacts_with',
        default_node_type='biolink:NamedThing',
        name="HPO")
    graph.enable()
    return graph
def execute_walks_ensmallen(
    graph: EnsmallenGraph,
    length: int,
    iterations: int,
    nodes_number: int,
    max_degree: int,
    p: float = 1.0,
    q: float = 1.0,
    **kwargs: Dict
) -> np.ndarray:
    """Execute first/second order walks using the Ensmallen walker.

    Parameters
    --------------------------
    graph: EnsmallenGraph,
        The graph on which to run the walks.
    length: int,
        Length of the walks.
    iterations: int,
        Number of walks to start from each node.
    nodes_number: int,
        Number of nodes in the graph.
    max_degree: int,
        Maximum degree of the graph.
    p: float = 1.0,
        Inverse weight for making the walk local.
        By default, the walk will be uniform.
    q: float = 1.0,
        Inverse weight for making the walk depth-first.
        By default, the walk will be uniform.
    kwargs: Dict,
        Additional parameters used by other libraries but not this one.

    Returns
    --------------------------
    Computed walks as a numpy array.
    """
    return graph.complete_walks(
        length=length,
        iterations=iterations,
        return_weight=1 / p,
        explore_weight=1 / q,
        # Subsample neighbours only on large, high-degree graphs.
        max_neighbours=10_100 if max_degree > 10_000 and nodes_number > 500_000 else None)
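# Sketch: run second-order (p=0.5, q=2.0) walks on the HPO test graph using
# the helper above; nodes_number and max_degree are derived from the graph
# itself here, whereas a benchmark harness would normally pass them in.
graph = load_hpo()
walks = execute_walks_ensmallen(
    graph,
    length=80,
    iterations=10,
    nodes_number=graph.get_nodes_number(),
    max_degree=int(graph.degrees().max()),
    p=0.5,
    q=2.0,
)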
def load_graph(node_type_column: str, default_node_type: str = "type"):
    """Load the graph with the specified node type column.

    Parameters
    ------------------------
    node_type_column: str,
        The column to be loaded.
    default_node_type: str = "type",
        The default value to use when no node type is available
        for a given node.
    """
    return EnsmallenGraph.from_csv(
        edge_path="/global/scratch/marcin/N2V/MicrobeEnvironmentGraphLearn/ENIGMA_data/masterG.edgelist_col12_intindex_head.txt",
        # edge_path="/global/homes/m/marcinj/data/ENIGMA/masterG.edgelist_col12_head.tsv",
        sources_column="subject",
        destinations_column="object",
        directed=False,
        # weights_column="weight",
        node_path="/global/scratch/marcin/N2V/MicrobeEnvironmentGraphLearn/ENIGMA_data/masterG.edgelist_col12_nodes_meta_header.txt",
        # node_path="/global/homes/m/marcinj/data/ENIGMA/masterG.edgelist_col12_nodes_meta_header.txt",
        nodes_column="id",
        node_types_column=node_type_column,
        default_node_type=default_node_type
    )
# Locate the neural-network classifier entry in the config and load its model.
mlp_file = [
    c for c in go_yaml['classifier']['classifiers']
    if c['type'] == 'neural network'
][0]['model']['outfile']
mlp = tf.keras.models.load_model(os.path.join("output_data", mlp_file))

node_data = pd.read_csv('input_data/go_nodes.tsv', sep='\t')
node_data = node_data.filter(['id', 'name'])

#
# positive validation edges
#
pos_graph_args = go_yaml['graph_data']['graph']
pos_graph_args['directed'] = True
pos_graph_args['edge_path'] = go_yaml['graph_data']['pos_validation']['edge_path']
pos_validation_graph = EnsmallenGraph.from_unsorted_csv(**pos_graph_args)
pos_edges = list(
    zip(pos_validation_graph.get_source_names(),
        pos_validation_graph.get_destination_names()))

pos_edge_transform = GraphTransformer(go_yaml['classifier']['edge_method'])
pos_edge_transform.fit(
    np.load(
        os.path.join("output_data",
                     go_yaml['embeddings']['embedding_file_name'])))
pos_edges_to_eval_emb = pos_edge_transform.transform(pos_validation_graph)

pos_valid_predict = mlp.predict(pos_edges_to_eval_emb, batch_size=1048)
pos_valid_predict_sorted = pd.DataFrame({
    "pred": pos_valid_predict.flatten(),
    "subject": [t[0] for t in pos_edges],
    # The fragment is truncated here in the source; an "object" column built
    # from t[1] presumably follows.
})
from time import time
from humanize import naturaldelta
from ensmallen_graph import EnsmallenGraph
import compress_json
import json

start = time()
graph = EnsmallenGraph(
    edge_path="../embiggen/pos_train_edges.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=True,
    edge_types_column="edge_label",
    node_path="../embiggen/pos_train_nodes.tsv",
    nodes_column="id",
    node_types_column="category")
completed_graph = time() - start

start_walk = time()
graph.walk(
    iterations=10,
    length=80,
    min_length=0,
    return_weight=1,
    explore_weight=1,
    change_node_type_weight=1,
    change_edge_type_weight=1)
delta = time() - start
total_walk_time = time() - start_walk

response = {
    "required_time": delta,
    "human_time": naturaldelta(delta),
    "building_graph_required_time": completed_graph,
    # Truncated here in the source; the remaining keys are not shown.
}
import os
from tqdm.auto import tqdm
import numpy as np
from ensmallen_graph import EnsmallenGraph
from embiggen import GraphTransformer, EdgeTransformer
# try:
#     from tsnecuda import TSNE
# except ModuleNotFoundError:
from MulticoreTSNE import MulticoreTSNE as TSNE

embedding_path = "./FOURTH/SkipGram_embedding.npy"

graph = EnsmallenGraph.from_csv(
    # /global/homes/m/marcinj/data/ENIGMA
    edge_path="/global/scratch/marcin/N2V/MicrobeEnvironmentGraphLearn/ENIGMA_data/masterG.edgelist_col12_head.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=False)
negative_graph = graph.sample_negatives(42, graph.get_edges_number(), False)
embedding = np.load(embedding_path)

for method in tqdm(EdgeTransformer.methods, desc="Methods", leave=False):
    # Use a per-method output path; otherwise every method after the first
    # would be skipped by the existence check below.
    tsne_path = f"tsne_edges_microbeenv_{method}"
    if os.path.exists(tsne_path):
        continue
    transformer = GraphTransformer(method)
    transformer.fit(embedding)
    positive_edges = transformer.transform(graph)
def make_holdouts(nodes: str, edges: str, output_dir: str,
                  train_fraction: float, validation: bool,
                  seed: int = 42) -> None:
    """Prepare positive and negative edges for testing and training
    (see run.py holdouts command for documentation)

    Args:
        :param nodes: nodes of input graph, in KGX TSV format [data/merged/nodes.tsv]
        :param edges: edges for input graph, in KGX TSV format [data/merged/edges.tsv]
        :param output_dir: directory to output edges and new graph [data/edges/]
        :param train_fraction: fraction of edges to emit as training
        :param validation: should we make validation edges? [False]
        :param seed: random seed [42]

    Returns:
        None.
    """
    logging.basicConfig(level=logging.INFO)

    logging.info("Loading graph from nodes %s and edges %s files" % (nodes, edges))
    graph = EnsmallenGraph.from_unsorted_csv(
        edge_path=edges,
        sources_column='subject',
        destinations_column='object',
        directed=False,
        edge_types_column='edge_label',
        default_edge_type='biolink:Association',
        node_path=nodes,
        nodes_column='id',
        default_node_type='biolink:NamedThing',
        node_types_column='category')
    os.makedirs(output_dir, exist_ok=True)

    # make positive edges
    logging.info("Making positive edges")
    pos_train_edges, pos_test_edges = graph.random_holdout(
        seed=seed, train_percentage=train_fraction)
    if validation:
        pos_valid_edges, pos_test_edges = pos_test_edges.random_holdout(
            seed=seed, train_percentage=0.5)

    # make negative edges
    logging.info("Making negative edges")
    all_negative_edges = pos_train_edges.sample_negatives(
        seed=seed,
        negatives_number=graph.get_edges_number(),
        allow_selfloops=False)
    neg_train_edges, neg_test_edges = all_negative_edges.random_holdout(
        seed=seed, train_percentage=train_fraction)
    if validation:
        neg_test_edges, neg_valid_edges = neg_test_edges.random_holdout(
            seed=seed, train_percentage=0.5)

    #
    # write out positive edges
    #

    # training:
    logging.info("Writing out positive edges")
    pos_train_edges_outfile = os.path.join(output_dir, "pos_train_edges.tsv")
    pos_train_nodes_outfile = os.path.join(output_dir, "pos_train_nodes.tsv")
    pos_test_edges_outfile = os.path.join(output_dir, "pos_test_edges.tsv")
    pos_valid_edges_outfile = os.path.join(output_dir, "pos_valid_edges.tsv")
    pos_train_edges.dump_edges(path=pos_train_edges_outfile)
    pos_train_edges.dump_nodes(path=pos_train_nodes_outfile)
    pos_test_edges.dump_edges(path=pos_test_edges_outfile)
    if validation:
        pos_valid_edges.dump_edges(path=pos_valid_edges_outfile)

    #
    # write out negative edges
    #
    logging.info("Writing out negative edges")
    neg_train_edges_outfile = os.path.join(output_dir, "neg_train_edges.tsv")
    neg_test_edges_outfile = os.path.join(output_dir, "neg_test_edges.tsv")
    neg_valid_edges_outfile = os.path.join(output_dir, "neg_valid_edges.tsv")
    neg_train_edges.dump_edges(path=neg_train_edges_outfile)
    neg_test_edges.dump_edges(path=neg_test_edges_outfile)
    if validation:
        neg_valid_edges.dump_edges(path=neg_valid_edges_outfile)
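# Sketch of how make_holdouts might be invoked; the paths mirror the defaults
# mentioned in the docstring above, and train_fraction is illustrative.
make_holdouts(
    nodes="data/merged/nodes.tsv",
    edges="data/merged/edges.tsv",
    output_dir="data/edges/",
    train_fraction=0.8,
    validation=True,
    seed=42,
)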
def sanitize_graph(graph_data: Dict, root: str):
    """Convert all the graphs to a standard format.

    Parameters
    ----------
    graph_data: Dict,
        Information on the graph to sanitize.
    root: str,
        The working folder. All the files will be read and written from here.
    """
    kwargs = graph_data["loading_settings"]
    kwargs["edge_path"] = os.path.join(
        root, graph_data["folder_name"], graph_data["edge_file"])
    kwargs.setdefault("directed", False)

    directed_dst_path = os.path.join(
        root, graph_data["folder_name"], "directed_sanitized.tsv")
    undirected_dst_path = os.path.join(
        root, graph_data["folder_name"], "undirected_sanitized.tsv")
    report_path = os.path.join(root, graph_data["folder_name"], "report.json")
    textual_report_path = os.path.join(
        root, graph_data["folder_name"], "report.txt")

    # Skip graphs that have already been fully sanitized.
    if all(
        os.path.exists(p)
        for p in (directed_dst_path, undirected_dst_path,
                  report_path, textual_report_path)
    ):
        return

    logger.info("Loading the file %s" % kwargs["edge_path"])
    graph: EnsmallenGraph = EnsmallenGraph.from_unsorted_csv(
        **kwargs, name=graph_data["graph"])
    logger.info("Enabling fast version")
    graph.enable_fast_walk()
    logger.info("Computing metadata")

    if not os.path.exists(report_path):
        logger.info("Computing JSON report")
        report = graph.report()
        compress_json.dump(report, report_path)

    if not os.path.exists(textual_report_path):
        logger.info("Computing textual report")
        textual_report = str(graph)
        with open(textual_report_path, "w") as f:
            f.write(textual_report)

    if not os.path.exists(undirected_dst_path):
        logger.info("Writing the file {}".format(undirected_dst_path))
        graph.dump_edges(
            path=undirected_dst_path,
            header=False,
            sources_column_number=0,
            destinations_column_number=1,
            weights_column_number=2,
            numeric_node_ids=True,
            # We dump with directed=True for the undirected file so that the
            # file contains both directions of each bidirectional edge.
            directed=True)

    if not os.path.exists(directed_dst_path):
        logger.info("Writing the file {}".format(directed_dst_path))
        graph.dump_edges(
            path=directed_dst_path,
            header=False,
            sources_column_number=0,
            destinations_column_number=1,
            weights_column_number=2,
            numeric_node_ids=True,
            # We dump with directed=False for the directed file so that
            # bidirectional edges are not doubled in the output.
            directed=False)
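# Sketch of the graph_data entry sanitize_graph expects, inferred from the
# keys it reads (graph, folder_name, edge_file, loading_settings); every
# value below is illustrative only, as is the root path.
example_graph_data = {
    "graph": "ExampleGraph",
    "folder_name": "example_graph",
    "edge_file": "edges.tsv",
    "loading_settings": {
        "sources_column": "subject",
        "destinations_column": "object",
    },
}
sanitize_graph(example_graph_data, root="/data/graphs")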
if not os.path.exists(hpo_json_file):
    os.system("wget http://purl.obolibrary.org/obo/hp.json -O hpo.json")

if not os.path.exists(hpo_edges_file) or not os.path.exists(hpo_nodes_file):
    os.system("kgx transform --input-format obojson --output-format tsv --output hpo hpo.json")

edge = 'biolink:subclass_of'
edges_string = edge
os.makedirs(edges_string, exist_ok=True)

graph = EnsmallenGraph.from_unsorted_csv(
    edge_path=hpo_edges_file,
    sources_column="subject",
    destinations_column="object",
    edge_types_column='edge_label',
    directed=False,
    node_path=hpo_nodes_file,
    nodes_column='id',
    node_types_column='category',
    default_node_type='biolink:NamedThing'
)
reduced_graph = graph.remove(singletons=True)

pos_training, pos_validation = reduced_graph.connected_holdout(
    train_size=train_percentage, edge_types=[edge], random_state=seed)

# make negative graph
neg_training, neg_validation = reduced_graph.sample_negatives(
    random_state=seed,
from time import time
from humanize import naturaldelta
from ensmallen_graph import EnsmallenGraph  # pylint: disable=no-name-in-module
import compress_json
import json
import numpy as np

start = time()
graph = EnsmallenGraph(
    edge_path="../graph/cooccurrence/edges.tsv",
    sources_column="subject",
    destinations_column="object",
    directed=False,
    validate_input_data=True)
completed_graph = time() - start

start_walk = time()
walks = graph.walk(
    iterations=1,
    length=80,
    return_weight=1,
    explore_weight=1,
    change_node_type_weight=1,
    change_edge_type_weight=1)
delta = time() - start
total_walk_time = time() - start_walk

mean_walks_length = np.mean([len(walk) for walk in walks])
median_walks_length = np.median([len(walk) for walk in walks])
degrees = [graph.degree(node) for node in range(graph.get_nodes_number())]
def load_graph_ensmallen(edge_path: str,
                         nodes_number: int,
                         edges_number: int,
                         has_weights: bool,
                         fast: bool = False,
                         cache_size: float = None,
                         **kwargs: Dict) -> EnsmallenGraph:
    """Load graph object using EnsmallenGraph.

    Parameters
    -----------------------
    edge_path: str,
        Path from where to load the edgelist.
        The file is expected to be directed and sorted.
        The node IDs will be extracted from the numeric node IDs of the graph.
        The file is expected to be without a header; the first column is
        expected to contain the sources and the second the destinations.
        The third column, optionally, is expected to contain the weights,
        if they are present in the considered graph.
    nodes_number: int,
        Upper bound on the number of nodes in the graph.
        The closer the number is to the actual number of nodes of the graph,
        the better the compression performance is going to be.
    edges_number: int,
        Upper bound on the number of edges in the graph.
        The closer the number is to the actual number of edges of the graph,
        the better the compression performance is going to be.
    has_weights: bool,
        Whether the graph has weights that should be loaded.
        The weights, if present, are expected to be in column 3.
    fast: bool = False,
        Whether to run the fast version that uses more memory.
    cache_size: float = None,
        When provided, enables the fast walk with the cache of the given size
        instead of the destination and outbound vectors.
    **kwargs: Dict,
        Additional parameters that are used in other libraries but not this one.

    Returns
    -------------------------
    The loaded graph.
    """
    # For very large graphs we load a directed edge list to halve memory usage.
    directed_edge_list = edges_number > 1_000_000
    graph: EnsmallenGraph = EnsmallenGraph.from_sorted_csv(
        build_directed_path(edge_path, directed=not directed_edge_list),
        directed=False,
        directed_edge_list=directed_edge_list,
        nodes_number=nodes_number,
        edges_number=edges_number,
        sources_column_number=0,
        destinations_column_number=1,
        **(dict(weights_column_number=2) if has_weights else {}),
        numeric_node_ids=True,
        verbose=False,
        edge_header=False)
    if fast:
        graph.enable_fast_walk()
    if cache_size is not None:
        graph.enable_fast_walk(
            vector_destinations=False,
            vector_outbounds=False,
            cache_size=cache_size)
    return graph
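# Sketch: load a large, sorted, numeric edge list with the helper above and
# print its report. The path and size bounds are hypothetical, and
# build_directed_path is assumed to be the path-resolving helper used
# elsewhere in the benchmark suite.
graph = load_graph_ensmallen(
    edge_path="data/ppi/edgelist",
    nodes_number=600_000,
    edges_number=2_000_000,
    has_weights=False,
    fast=True,
)
print(graph.report())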