def load_graph(self) -> Graph:
    """
    Load the main graph's nodes and edges into Ensmallen.

    The loading arguments are taken from ``self.main_graph_args()``. Both
    the ``node_path`` and the ``edge_path`` entries are resolved first:

    * a URL is downloaded into ``self.outdir()``, under a file name built
      from the URL with every character outside ``VALID_CHARS`` replaced
      by ``_``, and the argument is re-pointed at the downloaded file;
    * anything else has to pass ``is_valid_path``.

    :return: ensmallen Graph
    :raises FileNotFoundError: if an entry is neither a URL nor a valid path
    """
    graph_kwargs = self.main_graph_args()

    for path_key in ('node_path', 'edge_path'):
        location = graph_kwargs[path_key]

        if not is_url(location):
            # Non-URL entries are used as they are, provided they validate.
            if not is_valid_path(location):
                raise FileNotFoundError(f"Please check path: {location}")
            continue

        # Turn the URL into something usable as a file name.
        sanitized = ''.join(
            [char if char in VALID_CHARS else "_" for char in location]
        )
        local_copy = os.path.join(self.outdir(), sanitized)
        download_file(location, local_copy)
        graph_kwargs[path_key] = local_copy

    # Now load the Ensmallen graph
    return Graph.from_csv(**graph_kwargs)
def make_link_prediction_data(self,
                              embedding_file: str,
                              training_graph_args: dict,
                              pos_validation_args: dict,
                              neg_training_args: dict,
                              neg_validation_args: dict,
                              edge_method: str) -> Tuple[Tuple, Tuple]:
    """Prepare training and validation data for training link prediction
    classifiers

    The positive training graph is loaded straight from
    ``training_graph_args``. Each of the other three graphs is loaded from
    a deep copy of ``training_graph_args`` updated with its own arguments,
    so those dicts only need to carry the values that differ.

    Args:
        embedding_file: path to embedding file for nodes in graph
        training_graph_args: EnsmallenGraph arguments to load training graph
        pos_validation_args: EnsmallenGraph arguments to load positive
            validation graph
        neg_training_args: EnsmallenGraph arguments to load negative
            training graph
        neg_validation_args: EnsmallenGraph arguments to load negative
            validation graph
        edge_method: edge embedding method to use (average, L1, L2, etc)
    Returns:
        A tuple of tuples:
        ``((train_edges, train_labels), (valid_edges, valid_labels))``
    """
    node_embeddings = pd.read_csv(embedding_file, index_col=0, header=None)

    # load graphs; the positive training graph doubles as the template
    loaded = {'pos_training': Graph.from_csv(**training_graph_args)}
    overrides_by_name = {
        'pos_validation': pos_validation_args,
        'neg_training': neg_training_args,
        'neg_validation': neg_validation_args,
    }
    for graph_name, overrides in overrides_by_name.items():
        merged_args = copy.deepcopy(training_graph_args)
        merged_args.update(overrides)
        loaded[graph_name] = Graph.from_csv(**merged_args)

    # create transformer object to convert graphs into edge embeddings,
    # then hand it the node embeddings the edge embeddings are built from
    transformer = LinkPredictionTransformer(method=edge_method)
    transformer.fit(node_embeddings)

    train_edges, train_labels = transformer.transform(
        positive_graph=loaded['pos_training'],
        negative_graph=loaded['neg_training'],
    )
    valid_edges, valid_labels = transformer.transform(
        positive_graph=loaded['pos_validation'],
        negative_graph=loaded['neg_validation'],
    )

    training_data = (train_edges, train_labels)
    validation_data = (valid_edges, valid_labels)
    return training_data, validation_data
def test_make_tsne(self):
    """Run make_tsne on the test graph and check the output file exists.

    The graph is loaded from the node/edge files named in the test YAML
    (relative to its ``input_directory``); the t-SNE arguments come from
    ``YamlHelper.make_tsne_args`` with the embedding file overridden to the
    test fixture. The test passes when ``self.expected_tsne_file`` exists
    after ``make_tsne`` has run.
    """
    yhelp = YamlHelper(
        "tests/resources/test_graph_embedding_bert_tsne.yaml")

    input_dir = yhelp.yaml['input_directory']
    graph_section = yhelp.yaml['graph_data']['graph']
    node_file = os.path.join(input_dir, graph_section['node_path'])
    edge_file = os.path.join(input_dir, graph_section['edge_path'])

    test_graph = Graph.from_csv(
        nodes_column="id",
        node_list_node_types_column="category",
        default_node_type="biolink:NamedThing",
        node_path=node_file,
        edge_path=edge_file,
        sources_column="subject",
        destinations_column="object",
        directed=False,
    )

    tsne_kwargs = yhelp.make_tsne_args(graph=test_graph)
    tsne_kwargs['embedding_file'] = 'tests/resources/test_embeddings.tsv'
    make_tsne(**tsne_kwargs)

    self.assertTrue(os.path.exists(self.expected_tsne_file))
def make_node_embeddings(
        embedding_outfile: str,
        embedding_history_outfile: str,
        main_graph_args: dict,
        node_embedding_params: dict,
        bert_columns: dict,
        bert_pretrained_model: str = "allenai/scibert_scivocab_uncased"
) -> None:
    """Make embeddings and output embeddings and training history

    Graph embeddings are computed first. If ``bert_columns`` is non-empty,
    a text embedding is then built for every node and appended column-wise
    to the graph embedding before it is written out.

    Args:
        embedding_outfile: outfile to write out embeddings
        embedding_history_outfile: outfile to write out training history
            (only written when the history is not empty)
        main_graph_args: arguments passed to ensmallen_graph for graph
            loading
        node_embedding_params: args passed to compute_node_embeddings() in
            Embiggen
        bert_columns: columns containing text info to use to make
            embeddings from Bert pretrained embeddings
        bert_pretrained_model: name handed to ``from_pretrained`` for both
            the Bert model and its tokenizer
    Returns:
        None.
    """
    # load main graph and compute the graph-based node embeddings
    graph: Graph = Graph.from_csv(**main_graph_args)
    node_embedding, training_history = compute_node_embedding(
        graph, **node_embedding_params)

    # optionally embed the text columns with BERT and append the result
    if bert_columns:
        model = BertModel.from_pretrained(bert_pretrained_model,
                                          output_hidden_states=True)
        tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model)
        model.eval()
        # The model's word-embedding weight matrix, indexed by token id.
        token_vectors = model.embeddings.word_embeddings.weight.data.numpy()

        node_data = get_node_data(main_graph_args['node_path'])

        # One string per node: the selected columns joined by spaces.
        node_text = []
        for _, row in tqdm(node_data.iterrows(),
                           "extracting text from nodes"):
            node_text.append(" ".join(str(row[col]) for col in bert_columns))

        token_ids_per_node = []
        for this_text in tqdm(node_text, "tokenzing text"):
            token_ids_per_node.append(
                tokenizer.encode(this_text, return_tensors='np'))

        # A node's text vector is the mean of its tokens' word embeddings.
        mean_vectors = []
        for ids in tqdm(token_ids_per_node,
                        "extracting embeddings for tokens"):
            mean_vectors.append(
                np.mean(token_vectors[ids.flatten()], axis=0))

        bert_embeddings = pd.DataFrame(mean_vectors,
                                       index=graph.get_node_names())
        if not bert_embeddings.empty:
            node_embedding = pd.concat(
                [node_embedding, bert_embeddings],
                axis=1,
                ignore_index=False)

    if not training_history.empty:
        with open(embedding_history_outfile, 'w') as history_file:
            history_file.write(training_history.to_json())

    node_embedding.to_csv(embedding_outfile, header=False)
    return None
def __call__(self) -> Graph:
    """Return Graph containing required graph.

    Steps, in order:

    1. If caching is disabled and the preprocessed-graph directory exists,
       it is removed.
    2. ``self.download()`` is called and the directory is (re)created.
    3. Every registered callback is invoked with its arguments; arguments
       whose name ends in ``_path`` are joined onto ``self._cache_path``.
    4. With ``self._preprocess`` enabled, optimized node/edge lists are
       built once (skipped when ``self.is_preprocessed()``), their metadata
       is stored, and the graph is loaded from those lists. Otherwise the
       graph is loaded directly with the original graph arguments plus
       ``self._graph_kwargs``.
    5. If ``self._auto_enable_tradeoffs`` is set and the graph has fewer
       than 50e6 unique edges, ``graph.enable()`` is called.

    Raises:
        RuntimeError: if building the preprocessed lists fails, or if
            loading the graph from the preprocessed lists fails. The
            original exception is chained as the cause.
    """
    graph_arguments = self.get_graph_arguments()
    root = self.get_preprocessed_graph_directory_path()
    if not self._cache and os.path.exists(root):
        shutil.rmtree(root)
    self.download()
    os.makedirs(root, exist_ok=True)
    # Call the provided callbacks to process the edge lists, if any.
    for callback, arguments in zip(self._callbacks, self._callbacks_arguments):
        callback(**{
            key: os.path.join(self._cache_path, value)
            if key.endswith("_path") else value
            for key, value in arguments.items()
        })
    # Preprocess the edge list to an optimal edge list
    # if this is enabled.
    if self._preprocess:
        # If any of the node types columns have been provided,
        # we compute the target node types column
        target_node_type_list_path = None
        if any(
            graph_arguments.get(column) is not None
            for column in (
                "node_list_node_types_column_number",
                "node_list_node_types_column",
            )
        ):
            target_node_type_list_path = self.get_preprocessed_graph_node_types_path()
        # If any of the edge types columns have been provided,
        # we compute the target edge types column
        target_edge_type_list_path = None
        if any(
            graph_arguments.get(column) is not None
            for column in (
                "edge_list_edge_types_column_number",
                "edge_list_edge_types_column",
            )
        ):
            target_edge_type_list_path = self.get_preprocessed_graph_edge_types_path()
        target_node_path = self.get_preprocessed_graph_nodes_path()
        target_edge_path = self.get_preprocessed_graph_edges_path()
        # If a node path was specified
        node_path = self.get_adjusted_graph_nodes_path()
        # Singletons are only considered possible when a node list exists.
        may_have_singletons = graph_arguments.get(
            "may_have_singletons", True
        ) and node_path is not None
        if not self.is_preprocessed():
            try:
                (
                    node_types_number,
                    nodes_number,
                    edge_types_number,
                    edges_number
                ) = edge_list_utils.build_optimal_lists_files(
                    # NOTE: the following parameters are supported by the parser, but
                    # so far we have not encountered a single use case where we actually used them.
                    # original_node_type_path,
                    # original_node_type_list_separator,
                    # original_node_types_column_number,
                    # original_node_types_column,
                    # original_numeric_node_type_ids,
                    # original_minimum_node_type_id,
                    # original_node_type_list_header,
                    # original_node_type_list_support_balanced_quotes,
                    # original_node_type_list_rows_to_skip,
                    # original_node_type_list_max_rows_number,
                    # original_node_type_list_comment_symbol,
                    # original_load_node_type_list_in_parallel,
                    # original_node_type_list_is_correct,
                    # node_types_number,
                    target_node_type_list_path=target_node_type_list_path,
                    target_node_type_list_separator='\t',
                    target_node_type_list_node_types_column_number=0,
                    original_node_path=node_path,
                    original_node_list_header=graph_arguments.get(
                        "node_list_header"
                    ),
                    original_node_list_support_balanced_quotes=graph_arguments.get(
                        "node_list_support_balanced_quotes"
                    ),
                    node_list_rows_to_skip=graph_arguments.get(
                        "node_list_rows_to_skip"
                    ),
                    node_list_is_correct=graph_arguments.get(
                        "node_list_is_correct"
                    ),
                    node_list_max_rows_number=graph_arguments.get(
                        "node_list_max_rows_number"
                    ),
                    node_list_comment_symbol=graph_arguments.get(
                        "node_list_comment_symbol"
                    ),
                    default_node_type=graph_arguments.get(
                        "default_node_type"
                    ),
                    original_nodes_column_number=graph_arguments.get(
                        "nodes_column_number"
                    ),
                    original_nodes_column=graph_arguments.get(
                        "nodes_column"
                    ),
                    original_node_types_separator=graph_arguments.get(
                        "node_types_separator"
                    ),
                    original_node_list_separator=graph_arguments.get(
                        "node_list_separator"
                    ),
                    original_node_list_node_types_column_number=graph_arguments.get(
                        "node_list_node_types_column_number"
                    ),
                    original_node_list_node_types_column=graph_arguments.get(
                        "node_list_node_types_column"
                    ),
                    nodes_number=graph_arguments.get("nodes_number"),
                    # original_minimum_node_id,
                    # original_numeric_node_ids,
                    # original_node_list_numeric_node_type_ids,
                    original_skip_node_types_if_unavailable=True,
                    # It makes sense to load the node list in parallel only when
                    # you have to preprocess the node types, since otherwise the nodes number
                    # would be unknown.
                    original_load_node_list_in_parallel=target_node_type_list_path is not None,
                    maximum_node_id=graph_arguments.get(
                        "maximum_node_id"
                    ),
                    target_node_path=target_node_path,
                    target_node_list_separator='\t',
                    target_nodes_column=graph_arguments.get(
                        "nodes_column"
                    ),
                    target_nodes_column_number=0,
                    target_node_list_node_types_column_number=1,
                    target_node_types_separator="|",
                    # original_edge_type_path,
                    # original_edge_type_list_separator,
                    # original_edge_types_column_number,
                    # original_edge_types_column,
                    # original_numeric_edge_type_ids,
                    # original_minimum_edge_type_id,
                    # original_edge_type_list_header,
                    # edge_type_list_rows_to_skip,
                    # edge_type_list_max_rows_number,
                    # edge_type_list_comment_symbol,
                    # load_edge_type_list_in_parallel=True,
                    # edge_type_list_is_correct,
                    # edge_types_number,
                    target_edge_type_list_path=target_edge_type_list_path,
                    target_edge_type_list_separator='\t',
                    target_edge_type_list_edge_types_column_number=0,
                    original_edge_path=os.path.join(
                        self._cache_path, graph_arguments["edge_path"]),
                    original_edge_list_header=graph_arguments.get(
                        "edge_list_header"
                    ),
                    original_edge_list_support_balanced_quotes=graph_arguments.get(
                        "edge_list_support_balanced_quotes"
                    ),
                    original_edge_list_separator=graph_arguments.get(
                        "edge_list_separator"
                    ),
                    original_sources_column_number=graph_arguments.get(
                        "sources_column_number"
                    ),
                    original_sources_column=graph_arguments.get(
                        "sources_column"
                    ),
                    original_destinations_column_number=graph_arguments.get(
                        "destinations_column_number"
                    ),
                    original_destinations_column=graph_arguments.get(
                        "destinations_column"
                    ),
                    original_edge_list_edge_types_column_number=graph_arguments.get(
                        "edge_list_edge_types_column_number"
                    ),
                    original_edge_list_edge_types_column=graph_arguments.get(
                        "edge_list_edge_types_column"
                    ),
                    default_edge_type=graph_arguments.get(
                        "default_edge_type"
                    ),
                    original_weights_column_number=graph_arguments.get(
                        "weights_column_number"
                    ),
                    original_weights_column=graph_arguments.get(
                        "weights_column"
                    ),
                    default_weight=graph_arguments.get(
                        "default_weight"
                    ),
                    original_edge_list_numeric_node_ids=graph_arguments.get(
                        "edge_list_numeric_node_ids"
                    ),
                    skip_weights_if_unavailable=graph_arguments.get(
                        "skip_weights_if_unavailable"
                    ),
                    skip_edge_types_if_unavailable=graph_arguments.get(
                        "skip_edge_types_if_unavailable"
                    ),
                    edge_list_comment_symbol=graph_arguments.get(
                        "edge_list_comment_symbol"
                    ),
                    edge_list_max_rows_number=graph_arguments.get(
                        "edge_list_max_rows_number"
                    ),
                    edge_list_rows_to_skip=graph_arguments.get(
                        "edge_list_rows_to_skip"
                    ),
                    load_edge_list_in_parallel=True,
                    edges_number=graph_arguments.get("edges_number"),
                    target_edge_path=target_edge_path,
                    target_edge_list_separator='\t',
                    sort_temporary_directory=self._sort_tmp_dir,
                    directed=self._directed,
                    verbose=self._verbose > 0,
                    name=self._name,
                )
            except Exception as e:
                # Adjacent literals are concatenated verbatim, so every
                # fragment except the last must end with a space.
                raise RuntimeError(
                    f"Something went wrong while preprocessing the graph {self._name}, "
                    f"version {self._version}, "
                    f"retrieved from the {self._repository} repository. "
                    "This is NOT the loading step, but a preprocessing step "
                    "that loads remote data from third parties. "
                    "As such there may have been some changes in the remote data "
                    "that may have made them incompatible with the current "
                    "expected parametrization. "
                    "Do open up an issue in the Ensmallen's GitHub repository reporting also the complete "
                    "exception of this error to help us keep the automatic graph retrieval "
                    "in good shape. Thank you!"
                ) from e
            # Store the obtained metadata
            self.store_preprocessed_metadata(
                node_types_number,
                nodes_number,
                edge_types_number,
                edges_number
            )
        # Load the stored metadata
        metadata = self.get_preprocessed_metadata()
        # If the node types are provided
        has_node_types = metadata["node_types_number"] is not None
        if has_node_types and self._load_node_types:
            node_types_arguments = {
                "node_type_path": target_node_type_list_path,
                "node_types_column_number": 0,
                "node_type_list_is_correct": True,
                "node_type_list_separator": "\t",
                "node_types_separator": "|",
                "node_list_node_types_column_number": 1,
                "node_list_numeric_node_type_ids": True,
                "skip_node_types_if_unavailable": True,
            }
        else:
            node_types_arguments = {}
        # If the nodes are to be loaded
        if self._load_nodes:
            nodes_arguments = {
                "node_path": target_node_path,
                "node_list_separator": "\t",
                "nodes_column_number": 0,
                "node_list_is_correct": True,
                **node_types_arguments
            }
        else:
            nodes_arguments = {
                "numeric_node_ids": True,
            }
        # If the edge types are provided
        has_edge_types = metadata["edge_types_number"] is not None
        if has_edge_types:
            edge_types_arguments = {
                "edge_type_path": target_edge_type_list_path,
                "edge_types_column_number": 0,
                "edge_type_list_is_correct": True,
                "edge_type_list_separator": "\t",
                "edge_list_edge_types_column_number": 2,
                "edge_list_numeric_edge_type_ids": True,
                "skip_edge_types_if_unavailable": True,
            }
        else:
            edge_types_arguments = {}
        has_edge_weights = any(
            column in graph_arguments
            for column in (
                "weights_column_number",
                "weights_column",
                "default_weight"
            )
        )
        if has_edge_weights and self._load_edge_weights:
            edge_weights_arguments = {
                # Sources and destinations are columns 0 and 1; the weights
                # follow them, shifted by one when an edge-type column
                # (column 2) is present.
                "weights_column_number": 2 + int(metadata["edge_types_number"] is not None),
                "skip_weights_if_unavailable": True,
            }
        else:
            edge_weights_arguments = {}
        try:
            # Load the graph
            graph = Graph.from_csv(**{
                **metadata,
                **nodes_arguments,
                **edge_types_arguments,
                **edge_weights_arguments,
                "edge_path": target_edge_path,
                "edge_list_header": False,
                "sources_column_number": 0,
                "destinations_column_number": 1,
                "edge_list_numeric_node_ids": True,
                "edge_list_is_complete": True,
                "edge_list_may_contain_duplicates": False,
                "edge_list_is_sorted": True,
                "edge_list_is_correct": True,
                "edges_number": metadata["edges_number"],
                "nodes_number": metadata["nodes_number"],
                "may_have_singletons": may_have_singletons,
                "verbose": self._verbose > 0,
                "directed": self._directed,
                "name": self._name,
            })
        except Exception as e:
            raise RuntimeError(
                f"Something went wrong while loading the graph {self._name}, "
                f"version {self._version}, "
                f"retrieved from the {self._repository} repository. "
                "Do note that the preprocessing step of the graph has "
                "completed without apparent errors. "
                "This is likely something wrong with the Ensmallen library "
                "so do please open an issue about the error you have encountered "
                "in the Ensmallen's GitHub repository reporting also the complete "
                "exception of this error. Thank you!"
            ) from e
    else:
        # Otherwise just load the graph.
        graph = Graph.from_csv(**{
            **{
                key: os.path.join(self._cache_path, value)
                if key.endswith("_path") else value
                for key, value in graph_arguments.items()
            },
            "directed": self._directed,
            "verbose": self._verbose > 0,
            "name": self._name,
            **self._graph_kwargs,
        })
    if self._auto_enable_tradeoffs and graph.get_number_of_unique_edges() < 50e6:
        graph.enable()
    return graph
def __call__(self) -> Graph:
    """Return Graph containing required graph.

    The preprocessed-graph directory is removed first when caching is
    disabled. The source data is downloaded only when that directory does
    not exist. If the graph has not been preprocessed yet, the first
    downloaded path (with ``.bz2`` stripped from it) is parsed into
    node, edge, node-type and edge-type lists and the resulting counts are
    stored as metadata. The graph is then loaded from those lists, and
    ``graph.enable()`` is called when ``self._auto_enable_tradeoffs`` is
    set and the graph has fewer than 50e6 unique edges.
    """
    graph_arguments = self.get_graph_arguments()
    cache_root = self.get_preprocessed_graph_directory_path()

    if not self._cache and os.path.exists(cache_root):
        shutil.rmtree(cache_root)

    source_paths = self.get_adjusted_graph_paths()
    if not os.path.exists(cache_root):
        # Download the necessary data
        self._downloader.download(self._graph["urls"], source_paths)
    os.makedirs(cache_root, exist_ok=True)

    node_types_file = self.get_preprocessed_graph_node_types_path()
    edge_types_file = self.get_preprocessed_graph_edge_types_path()
    nodes_file = self.get_preprocessed_graph_nodes_path()
    edges_file = self.get_preprocessed_graph_edges_path()

    if not self.is_preprocessed():
        (
            node_type_count,
            node_count,
            edge_count
        ) = edge_list_utils.parse_wikipedia_graph(
            source_path=source_paths[0].replace(".bz2", ""),
            edge_path=edges_file,
            node_path=nodes_file,
            node_type_path=node_types_file,
            edge_type_path=edge_types_file,
            node_list_separator="\t",
            node_type_list_separator="\t",
            edge_type_list_separator="\t",
            node_types_separator="|",
            nodes_column="node_names",
            node_types_column="node_type_names",
            node_list_node_types_column="node_type_names",
            edge_types_column="edge_type_names",
            node_descriptions_column="node_descriptions",
            edge_list_separator="\t",
            keep_nodes_without_descriptions=self._keep_nodes_without_descriptions,
            keep_nodes_without_categories=self._keep_nodes_without_categories,
            keep_interwikipedia_nodes=self._keep_interwikipedia_nodes,
            keep_external_nodes=self._keep_external_nodes,
            compute_node_description=self._compute_node_description,
            sort_temporary_directory=self._sort_tmp_dir,
            directed=self._directed,
            verbose=self._verbose > 0,
        )
        # Store the obtained metadata; the parser returns no edge-type
        # count, so that slot is stored as None.
        self.store_preprocessed_metadata(
            node_type_count,
            node_count,
            None,
            edge_count
        )

    # Load the stored metadata
    metadata = self.get_preprocessed_metadata()

    # Node-type options are only filled in when node types are requested.
    node_type_options = {}
    if self._load_node_types:
        node_type_options = {
            "node_type_path": node_types_file,
            "node_types_number": metadata["node_types_number"],
            "node_types_column": "node_type_names",
            "node_type_list_is_correct": True,
            "node_type_list_separator": "\t",
            "node_types_separator": "|",
            "node_list_node_types_column_number": 1,
            "node_list_numeric_node_type_ids": True,
        }

    # Either load the node list (plus any node-type options) or fall back
    # to numeric node ids.
    if self._load_nodes:
        node_options = {
            "node_path": nodes_file,
            "node_list_separator": "\t",
            "nodes_column": "node_names",
            "node_list_is_correct": True,
        }
        node_options.update(node_type_options)
    else:
        node_options = {
            "numeric_node_ids": True,
        }

    # Edge-type options are always passed.
    edge_type_options = {
        "edge_type_path": edge_types_file,
        "edge_types_number": metadata["edge_types_number"],
        "edge_types_column_number": 0,
        "edge_type_list_is_correct": True,
        "edge_type_list_separator": "\t",
        "edge_list_edge_types_column_number": 2,
        "edge_list_numeric_edge_type_ids": True
    }

    # Later updates win over earlier ones, so the fixed settings below
    # take precedence over metadata and the graph arguments.
    loader_kwargs = {}
    loader_kwargs.update(metadata)
    loader_kwargs.update(graph_arguments)
    loader_kwargs.update(node_options)
    loader_kwargs.update(edge_type_options)
    loader_kwargs.update({
        "edge_path": edges_file,
        "edge_list_header": False,
        "sources_column_number": 0,
        "destinations_column_number": 1,
        "edge_list_numeric_node_ids": True,
        "edge_list_is_complete": True,
        "edge_list_may_contain_duplicates": False,
        "edge_list_is_sorted": True,
        "edge_list_is_correct": True,
        "edges_number": metadata["edges_number"],
        "nodes_number": metadata["nodes_number"],
        "may_have_singletons": True,
        "verbose": self._verbose > 0,
        "directed": self._directed,
        "name": self._name,
    })

    # Load the graph
    graph = Graph.from_csv(**loader_kwargs)

    if self._auto_enable_tradeoffs:
        if graph.get_number_of_unique_edges() < 50e6:
            graph.enable()
    return graph