def bench_load_graph(library: str, graph_name: str, metadata_path: str, root: str, seconds: int):
    """Benchmarks loading the given graph using the given library.

    Parameters
    -----------------------
    library: str,
        Library to use for the benchmark.
    graph_name: str,
        Graph to use for the benchmark.
    metadata_path: str,
        Path from where to load the graph metadata.
    root: str,
        Directory from where to load the graph.
    seconds: int,
        Number of seconds to wait after a successful execution.
    """
    validate_graph_and_library(library, graph_name, metadata_path)
    metadata = compress_json.load(metadata_path)
    if "disabled" in metadata:
        return
    data = metadata[graph_name]
    report = get_graph_report(data, root)
    log_path = "results/{graph}/{library}/load_graph.csv".format(
        graph=graph_name,
        library=library
    )
    # If the benchmark has already been tracked we skip it.
    if os.path.exists(log_path):
        return
    with Tracker(log_path):
        load_graph(library, data, root, report)
    wait_k_seconds(seconds)
def get_preprocessed_metadata(self) -> Dict:
    """Return the stored metadata.

    Returns
    --------------------------------
    Dictionary with the metadata.
    """
    return compress_json.load(
        self.get_preprocessed_graph_metadata_path()
    )
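A minimal round-trip sketch (hypothetical path and payload, not from the original source): compress_json infers the codec from the file suffix, so the same load call works for ".json", ".json.gz", ".json.bz2" and ".json.lzma" files.

import compress_json

metadata = {"nodes": 42, "edges": 117}  # hypothetical payload
compress_json.dump(metadata, "preprocessed_metadata.json.gz")
assert compress_json.load("preprocessed_metadata.json.gz") == metadata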
def test_plot_small_history():
    plot_history(
        pd.read_json("tests/small_history.json"),
        path="plots/small_history.png"
    )
    plt.close()
    assert os.path.exists("plots/small_history.png")
    plot_history("tests/small_history.json")
    plt.close()
    plot_history(["tests/small_history.json", "tests/small_history.json"])
    plt.close()
    plot_history(compress_json.load("tests/small_history.json"))
    plt.close()
def bench_first_order_walks(
    library: str,
    graph_name: str,
    metadata_path: str,
    root: str,
    seconds: int,
    length: int = 100,
    iterations: int = 1,
):
    """Benchmarks executing random walks on the given graph using the given library.

    Parameters
    -----------------------
    library: str,
        Library to use for the benchmark.
    graph_name: str,
        Graph to use for the benchmark.
    metadata_path: str,
        Path from where to load the graph metadata.
    root: str,
        Directory from where to load the graph.
    seconds: int,
        Number of seconds to wait after a successful execution.
    length: int = 100,
        Length of the random walks.
    iterations: int = 1,
        Number of iterations to execute.
    """
    validate_graph_and_library(library, graph_name, metadata_path)
    metadata = compress_json.load(metadata_path)
    if "disabled" in metadata:
        return
    data = metadata[graph_name]
    report = get_graph_report(data, root)
    walkers = libraries[library]["first_order_walk"]
    log_path = "results/{graph_name}/{library}/first_order_walk.csv".format(
        graph_name=graph_name,
        library=library
    )
    # If the library has already been tracked we skip it.
    # The same applies when it is known that the graph cannot be handled by this library.
    if os.path.exists(log_path) or not can_load(root, walkers["load_graph"], graph_name):
        return
    graph = load_graph(walkers["load_graph"], data, root, report)
    with Tracker(log_path):
        walkers["walk"](graph, length=length, iterations=iterations)
    wait_k_seconds(seconds)
def test_compress_json():
    D = random_string_dict(10, 10)
    key = sha256(D)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()
    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.dump(D, path)
        assert key == sha256(compress_json.load(path))
    shutil.rmtree("random_dirs")
    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.local_dump(D, path)
        assert key == sha256(compress_json.local_load(path))
    shutil.rmtree("tests/random_dirs")
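A short hedged note on the second loop above: local_dump and local_load resolve paths relative to the directory of the calling module rather than the current working directory, which is why the cleanup targets "tests/random_dirs" when the test lives in tests/. A minimal sketch, assuming the snippet below also lives in tests/:

import compress_json

# Written to tests/random_dirs/test.json.gz regardless of the working directory.
compress_json.local_dump({"a": 1}, "random_dirs/test.json.gz")
assert compress_json.local_load("random_dirs/test.json.gz") == {"a": 1}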
def get_data(self) -> Dict:
    """Returns metadata mined from the PheKnowLatorKG repository."""
    json_url = "https://storage.googleapis.com/pheknowlator/pheknowlator_builds.json"
    downloader = BaseDownloader(verbose=2, cache=False)
    metadata_report = downloader.download(json_url)
    all_metadata = compress_json.load(metadata_report.iloc[0].destination)
    graph_name = "PheKnowLator"
    stored_graph_name = graph_name
    mined_data = {
        stored_graph_name: {}
    }
    for version, version_data in all_metadata.items():
        if not isinstance(version_data, dict):
            continue
        for sub_version, url in version_data.items():
            if url is None:
                continue
            full_version_code = "{version}.{sub_version}".format(
                version=version,
                sub_version=sub_version
            )
            mined_data[stored_graph_name][full_version_code] = {
                "urls": [url],
                "paths": [
                    "edge_list.tsv"
                ],
                "arguments": {
                    "edge_path": "edge_list.tsv",
                    "name": graph_name,
                    "sources_column": "subject",
                    "destinations_column": "object",
                    "edge_list_edge_types_column": "predicate",
                    "node_list_is_correct": True,
                    "edge_list_is_correct": True,
                }
            }
    return mined_data
def retrieve_graphs(
    informations_path: str,
    root: str = "graphs"
):
    """Retrieve the graphs described in the given metadata file.

    Parameters
    ---------------------
    informations_path: str,
        Path from where to load the graph information.
    root: str = "graphs",
        Directory where to download the graphs.
    """
    # First we load the graphs metadata.
    graphs_data = compress_json.load(informations_path)
    # Then we download the graphs in multi-processing.
    # If there is a failure while downloading a graph, the library automatically
    # cleans up after itself and then raises the exception.
    download_graphs(graphs_data, root)
    # Finally we sanitize the downloaded files to remove elements such as:
    # - file headers (descriptors added at the top of the files that include licenses etc...)
    # - duplicated edges (some files contain duplicated edges)
    sanitize_graphs(graphs_data, root)
import os
from typing import List

import compress_json

input_dir: str = "mini-json"
output_dir: str = "decompressed-json"
log_file: str = "failed-decompression.txt"

files: List[str] = os.listdir(input_dir)

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

for file in files:
    print(f"Decompressing {file}")
    try:
        mini_json_file_path: str = os.path.join(input_dir, file)
        # Drop the trailing ".json" extension from the output file name.
        decompressed_file_path: str = os.path.join(output_dir, file[:-5])
        input_dict: dict = compress_json.load(mini_json_file_path)
        compress_json.dump(input_dict, decompressed_file_path)
    except Exception as e:  # Typically a JSONDecodeError.
        print(
            f"Failed to decompress {file}! Reason: {e}! Logging to {log_file}"
        )
        with open(file=log_file, mode="a+") as f:
            f.write(f"{file}; {e}\n")
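A hedged single-file sketch of the same idea (the file names are hypothetical): compress_json picks the codec from the target extension, so loading a compressed file and dumping it to a plain ".json" path writes an uncompressed copy.

import compress_json

payload = compress_json.load("mini-json/example.json.gz")      # hypothetical compressed input
compress_json.dump(payload, "decompressed-json/example.json")  # plain JSON output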
def retreive_temp_data(folder):
    # Build the path in an OS-independent way instead of hard-coding a Windows separator.
    path = os.path.join(folder, "temp.lzma")
    data = compress_json.load(path)
    # Delete the file, as it is only temporary.
    os.remove(path)
    return data
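A hedged usage sketch for the helper above (the folder name and payload are hypothetical): compress_json handles the ".lzma" suffix transparently, and retreive_temp_data removes the file after reading it.

import os
import compress_json

folder = "some_temp_folder"  # hypothetical
os.makedirs(folder, exist_ok=True)
compress_json.dump({"run_id": 42}, os.path.join(folder, "temp.lzma"))
print(retreive_temp_data(folder))  # {'run_id': 42}; temp.lzma is deleted afterwards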
def get_graph_report(data: Dict, root: str) -> Dict:
    """Load the graph report from the path built from the given metadata."""
    return compress_json.load(
        os.path.join(build_path_path(data, root), "report.json")
    )
def build_all(self):
    """Build graph retrieval methods."""
    target_directory_path = os.path.join(
        "../bindings/python/ensmallen/datasets",
        self.repository_package_name,
    )
    file_path = "{}.py".format(target_directory_path)

    # Paths of the compressed metadata files available for this repository.
    graph_data_paths = glob(os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "graph_repositories",
        self.get_formatted_repository_name(),
        "*.json.gz"
    ))

    imports = []
    for graph_data_path in tqdm(
        graph_data_paths,
        desc="Building graph retrieval methods for {}".format(self.name),
        leave=False
    ):
        graph_data = compress_json.load(graph_data_path)
        first_graph_version_data = list(graph_data.values())[0]
        graph_name = first_graph_version_data["graph_name"]
        packages_to_import = self.get_imports(
            graph_name,
            list(graph_data.keys())[-1]
        )
        if packages_to_import:
            imports.append(packages_to_import)
    imports = list(set(imports))

    first_references = list(compress_json.load(
        graph_data_paths[0]
    ).values())[0]["references"]
    has_unique_references = all(
        list(compress_json.load(path).values())[0]["references"] == first_references
        for path in graph_data_paths
    ) and first_references

    file = open(file_path, "w")
    file.write("\n".join([
        "\"\"\"Module providing graphs available from {repository_name}.{references}\"\"\"".format(
            repository_name=self.get_formatted_repository_name(),
            references="\n\n{}\n".format(self.format_references(
                first_references
            )) if has_unique_references else ""
        ),
        "from ensmallen import Graph # pylint: disable=import-error",
        self.get_graph_retrieval_import(),
        *imports,
        "",
        ""
    ]))

    graph_repository_metadata = {}
    for graph_data_path in tqdm(
        graph_data_paths,
        desc="Building graph retrieval methods for {}".format(self.name),
        leave=False,
        dynamic_ncols=True
    ):
        graph_data = compress_json.load(graph_data_path)
        first_graph_version_data = list(graph_data.values())[0]
        graph_name = first_graph_version_data["graph_name"]
        graph_method_name = first_graph_version_data["graph_method_name"]
        graph_retrieval_file = self.format_graph_retrieval_file(
            graph_name=graph_name,
            graph_method_name=graph_method_name,
            references=first_graph_version_data["references"],
            versions=list(graph_data.keys()),
            has_unique_references=has_unique_references
        )
        for value in graph_data.values():
            value.pop("references")
        graph_repository_metadata[graph_method_name] = graph_data
        file.write(graph_retrieval_file)
    file.close()

    compress_json.dump(
        graph_repository_metadata,
        "{}.json.gz".format(target_directory_path)
    )
@app.route('/', methods=['GET', 'POST'])
def home():
    global result
    global count
    global search_word
    if request.method == 'POST':
        search_word = request.form["search bar"]
        result = search(search_word)
        count = 1
        return redirect(url_for("result_found"))
    else:
        return render_template("search.html")


if __name__ == '__main__':
    if not (os.path.isfile('comp_data.json.gz') and os.access('comp_data.json.gz', os.R_OK)):
        if not (os.path.isfile('data.txt') and os.access('data.txt', os.R_OK)):
            with open("data.txt", "w", encoding='utf8') as jsonfile:
                open_files_and_collect_data()
                collect_yap_data()
                json.dump(data, jsonfile, ensure_ascii=False)
        with open('data.txt', encoding='utf8') as data_json:
            data_load = json.load(data_json)
        compress_json.dump(data_load, "comp_data.json.gz")
    data = compress_json.load("comp_data.json.gz")
    app.run()
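A minimal caching sketch of the pattern used above (build_data is a hypothetical stand-in for the data-collection steps): build the data once, store it compressed, and reload the compressed cache on every later run.

import os
import compress_json

def load_cached_data(cache_path: str = "comp_data.json.gz") -> dict:
    """Return the cached data, building and compressing it on the first run."""
    if not os.path.isfile(cache_path):
        data = build_data()  # hypothetical: assemble the dictionary to cache
        compress_json.dump(data, cache_path)
    return compress_json.load(cache_path)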
def get_graph_names(metadata_path: str) -> List[str]:
    """Return the names of the available graphs."""
    return list(compress_json.load(metadata_path).keys())
def bench_second_order_walks(
    library: str,
    graph_name: str,
    metadata_path: str,
    root: str,
    seconds: int,
    length: int = 100,
    iterations: int = 1,
    p: float = 2.0,
    q: float = 2.0
):
    """Benchmarks executing second-order random walks on the given graph using the given library.

    Parameters
    -----------------------
    library: str,
        Library to use for the benchmark.
    graph_name: str,
        Graph to use for the benchmark.
    metadata_path: str,
        Path from where to load the graph metadata.
    root: str,
        Directory from where to load the graph.
    seconds: int,
        Number of seconds to wait after a successful execution.
    length: int = 100,
        Length of the random walks.
    iterations: int = 1,
        Number of iterations to execute.
    p: float = 2.0,
        Inverse of the return weight.
    q: float = 2.0,
        Inverse of the explore weight.
    """
    validate_graph_and_library(library, graph_name, metadata_path)
    metadata = compress_json.load(metadata_path)
    if "disabled" in metadata:
        return
    data = metadata[graph_name]
    report = get_graph_report(data, root)
    walkers = libraries[library]["second_order_walk"]
    if p == 2.0 and q == 1.0:
        task_name = "second_order_walk_only_p"
    elif q == 2.0 and p == 1.0:
        task_name = "second_order_walk_only_q"
    else:
        task_name = "second_order_walk"
    log_path = "results/{graph_name}/{library}/{task_name}.csv".format(
        graph_name=graph_name,
        library=library,
        task_name=task_name
    )
    # If the library has already been tracked we skip it.
    # The same applies when it is known that the graph cannot be handled by this library.
    if os.path.exists(log_path) or not can_load(root, walkers["load_graph"], graph_name):
        return
    graph = load_graph(walkers["load_graph"], data, root, report, p=p, q=q)
    with Tracker(log_path):
        walkers["walk"](
            graph,
            length=length,
            iterations=iterations,
            max_degree=int(report["max_degree"]),
            nodes_number=int(report["nodes_number"]),
            p=p,
            q=q,
        )
    wait_k_seconds(seconds)