Example no. 1
def bench_load_graph(library: str, graph_name: str, metadata_path: str,
                     root: str, seconds: int):
    """Benches loading the given graph using given library.

    Parameters
    -----------------------
    library: str,
        Library to use for the benchmark.
    graph_name: str,
        Graph to use for the benchmark.
    metadata_path: str,
        Path from where to load the graph metadata.
    root: str,
        Directory from where to load the graph.
    seconds: int,
        Number of seconds to wait after a successful execution.
    """
    validate_graph_and_library(library, graph_name, metadata_path)
    metadata = compress_json.load(metadata_path)
    if "disabled" in metadata:
        return
    data = metadata[graph_name]
    report = get_graph_report(data, root)

    log_path = "results/{graph}/{library}/load_graph.csv".format(
        root=root, graph=graph_name, library=library)

    if os.path.exists(log_path):
        return

    with Tracker(log_path):
        load_graph(library, data, root, report)

    wait_k_seconds(seconds)
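`Tracker` is a project-specific context manager; the snippet only shows that it takes the CSV log path and wraps the timed call. A minimal sketch of what a comparable tracker could look like (the class name and CSV schema are assumptions, not the project's implementation):

import csv
import os
import time


class TimeTracker:
    """Hypothetical stand-in for Tracker: logs wall-clock time to a CSV."""

    def __init__(self, log_path: str):
        self.log_path = log_path

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        elapsed = time.perf_counter() - self.start
        # Create the results directory lazily, then write a one-row report.
        os.makedirs(os.path.dirname(self.log_path) or ".", exist_ok=True)
        with open(self.log_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["seconds"])
            writer.writerow([elapsed])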
Example no. 2
    def get_preprocessed_metadata(self) -> Dict:
        """Return the stored metadata.

        Returns
        --------------------------------
        Dictionary with the metadata.
        """
        return compress_json.load(
            self.get_preprocessed_graph_metadata_path()
        )
Example no. 3
def test_plot_small_history():
    plot_history(pd.read_json("tests/small_history.json"),
                 path="plots/small_history.png")
    plt.close()
    assert os.path.exists("plots/small_history.png")
    plot_history("tests/small_history.json")
    plt.close()
    plot_history(["tests/small_history.json", "tests/small_history.json"])
    plt.close()
    plot_history(compress_json.load("tests/small_history.json"))
    plt.close()
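The test exercises four input types for `plot_history`: a `pandas.DataFrame`, a path, a list of paths, and an already-loaded dictionary. A minimal sketch of how such input normalization could be written (the helper name and dispatch rules are assumptions, not the library's actual code):

from typing import Dict, List, Union

import compress_json
import pandas as pd


def normalize_history(
    history: Union[pd.DataFrame, Dict, str, List[str]]
) -> List[pd.DataFrame]:
    """Coerce the supported plot_history inputs into a list of DataFrames."""
    if isinstance(history, pd.DataFrame):
        return [history]
    if isinstance(history, dict):
        return [pd.DataFrame(history)]
    if isinstance(history, str):
        # A single path: compress_json picks the codec from the extension.
        return [pd.DataFrame(compress_json.load(history))]
    # Otherwise assume an iterable of paths.
    return [pd.DataFrame(compress_json.load(path)) for path in history]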
Example no. 4
def bench_first_order_walks(
    library: str,
    graph_name: str,
    metadata_path: str,
    root: str,
    seconds: int,
    length: int = 100,
    iterations: int = 1,
):
    """Benches executing random walks the given graph using given library.

    Parameters
    -----------------------
    library: str,
        Library to use for the benchmark.
    graph_name: str,
        Graph to use for the benchmark.
    metadata_path: str,
        Path from where to load the graph metadata.
    root: str,
        Directory from where to load the graph.
    seconds: int,
        Number of seconds to wait after a successful execution.
    length: int = 100,
        Length of the random walks.
    iterations: int = 1,
        Number of iterations to execute.
    """
    validate_graph_and_library(library, graph_name, metadata_path)
    metadata = compress_json.load(metadata_path)
    if "disabled" in metadata:
        return
    data = metadata[graph_name]
    report = get_graph_report(data, root)

    walkers = libraries[library]["first_order_walk"]

    log_path = "results/{graph_name}/{library}/first_order_walk.csv".format(
        root=root, graph_name=graph_name, library=library)

    # If the library has already been tracked we skip it.
    # The same applies when it is known that this library cannot handle the graph.
    if os.path.exists(log_path) or not can_load(root, walkers["load_graph"],
                                                graph_name):
        return

    graph = load_graph(walkers["load_graph"], data, root, report)

    with Tracker(log_path):
        walkers["walk"](graph, length=length, iterations=iterations)
    wait_k_seconds(seconds)
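The snippet indexes a module-level `libraries` dictionary; the code implies each per-task entry exposes at least a `load_graph` identifier and a `walk` callable. A sketch of the assumed shape (the concrete key and callable below are illustrative, not the project's real registry):

def _example_walk(graph, length: int, iterations: int):
    """Placeholder: the real callables wrap each library's walk API."""
    raise NotImplementedError


libraries = {
    # Hypothetical entry; the project defines the real registry elsewhere.
    "some_library": {
        "first_order_walk": {
            # Loader key consumed by can_load() and load_graph() above.
            "load_graph": "some_library",
            # Invoked as walkers["walk"](graph, length=..., iterations=...).
            "walk": _example_walk,
        },
    },
}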
Example no. 5
def test_compress_json():
    D = random_string_dict(10, 10)
    key = sha256(D)
    extensions = compress_json.compress_json._DEFAULT_EXTENSION_MAP.keys()
    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.dump(D, path)
        assert key == sha256(compress_json.load(path))

    shutil.rmtree("random_dirs")

    for ext in extensions:
        path = f"random_dirs/test.json.{ext}"
        compress_json.local_dump(D, path)
        assert key == sha256(compress_json.local_load(path))

    shutil.rmtree("tests/random_dirs")
Example no. 6
    def get_data(self) -> Dict:
        """Returns metadata mined from the PheKnowLatorKG repository."""
        json_url = "https://storage.googleapis.com/pheknowlator/pheknowlator_builds.json"

        downloader = BaseDownloader(verbose=2, cache=False)
        metadata_report = downloader.download(json_url)
        all_metadata = compress_json.load(metadata_report.iloc[0].destination)

        graph_name = "PheKnowLator"
        stored_graph_name = graph_name
        mined_data = {
            stored_graph_name: {}
        }

        for version, version_data in all_metadata.items():
            if not isinstance(version_data, dict):
                continue
            for sub_version, url in version_data.items():
                if url is None:
                    continue
                full_version_code = "{version}.{sub_version}".format(
                    version=version,
                    sub_version=sub_version
                )
                mined_data[stored_graph_name][full_version_code] = {
                    "urls": [url],
                    "paths": [
                        "edge_list.tsv"
                    ],
                    "arguments": {
                        "edge_path": "edge_list.tsv",
                        "name": graph_name,
                        "sources_column": "subject",
                        "destinations_column": "object",
                        "edge_list_edge_types_column": "predicate",
                        "node_list_is_correct": True,
                        "edge_list_is_correct": True,
                    }
                }

        return mined_data
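The returned dictionary maps the stored graph name to one entry per `version.sub_version` code. Illustratively, the shape is as follows (the version code and URL here are made up):

mined_data = {
    "PheKnowLator": {
        "v2.0.full": {  # hypothetical full_version_code
            "urls": ["https://storage.googleapis.com/pheknowlator/..."],
            "paths": ["edge_list.tsv"],
            "arguments": {
                "edge_path": "edge_list.tsv",
                "name": "PheKnowLator",
                "sources_column": "subject",
                "destinations_column": "object",
                "edge_list_edge_types_column": "predicate",
                "node_list_is_correct": True,
                "edge_list_is_correct": True,
            },
        },
    },
}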
Example no. 7
def retrieve_graphs(
    informations_path: str,
    root: str = "graphs"
):
    """Retrieve graphs in given dataframe.

    Parameters
    ---------------------
    informations_path: str,
        Path from where to load the graph information.
    root: str = "graphs",
        Directory where to download the graphs.
    """
    # First we load the graphs' metadata
    graphs_data = compress_json.load(informations_path)
    # Then we proceed to download the graphs in multi-processing
    # If there is a failure while downloading a graph, the library automatically
    # cleans up after itself and afterwards raises the exception.
    download_graphs(graphs_data, root)
    # Secondly we sanitize the downloaded files to remove elements such as:
    # - file headers (descriptors added at the top of the files that include licenses etc...)
    # - duplicated edges (in some files there are duplicated edges)
    sanitize_graphs(graphs_data, root)
Example no. 8
import os
from typing import List

import compress_json

input_dir: str = "mini-json"
output_dir: str = "decompressed-json"
log_file: str = "failed-decompression.txt"

files: List[str] = os.listdir(input_dir)

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

for file in files:
    print(f"Decompressing {file}")

    try:
        mini_json_file_path: str = os.path.join(input_dir, file)
        decompressed_file_path: str = os.path.join(output_dir, f"{file[:-5]}")

        input_dict: dict = compress_json.load(mini_json_file_path)

        compress_json.dump(input_dict, decompressed_file_path)
    except Exception as e:  # JSONDecodeError
        print(
            f"Failed To Decompress {file}!!! Reason: {e}!!! Logging To {log_file}"
        )

        with open(file=log_file, mode="a") as f:
            f.write(f"{file}; {e}\n")
Example no. 9
def retreive_temp_data(folder):
    path = os.path.join(folder, 'temp.lzma')
    data = compress_json.load(path)
    os.remove(path)  # delete it, as it's only temporary
    return data
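A matching writer for this read-and-delete pattern might look like the following sketch (the function name is an assumption; the `.lzma` extension makes `compress_json` use LZMA compression):

import os

import compress_json


def store_temp_data(folder: str, data) -> str:
    """Hypothetical counterpart: persist data to the temporary LZMA file."""
    path = os.path.join(folder, "temp.lzma")
    compress_json.dump(data, path)
    return path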
Example no. 10
def get_graph_report(data: Dict, root: str) -> Dict:
    """Build path to edge file from given metadata."""
    return compress_json.load(
        os.path.join(build_path_path(data, root), "report.json"))
Example no. 11
    def build_all(self):
        """Build graph retrieval methods."""
        target_directory_path = os.path.join(
            "../bindings/python/ensmallen/datasets",
            self.repository_package_name,
        )
        file_path = "{}.py".format(target_directory_path)

        imports = []

        for graph_data_path in tqdm(
            glob(os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "graph_repositories",
                self.get_formatted_repository_name(),
                "*.json.gz"
            )),
            desc="Building graph retrieval methods for {}".format(self.name),
            leave=False
        ):
            graph_data = compress_json.load(graph_data_path)
            first_graph_version_data = list(graph_data.values())[0]
            graph_name = first_graph_version_data["graph_name"]
            packages_to_import = self.get_imports(
                graph_name, list(graph_data.keys())[-1])
            if packages_to_import:
                imports.append(packages_to_import)

        imports = list(set(imports))

        first_references = list(compress_json.load(glob(os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "graph_repositories",
            self.get_formatted_repository_name(),
            "*.json.gz"
        ))[0]).values())[0]["references"]

        has_unique_references = all(
            list(compress_json.load(path).values())[
                0]["references"] == first_references
            for path in glob(os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "graph_repositories",
                self.get_formatted_repository_name(),
                "*.json.gz"
            ))
        ) and first_references

        # Write the module docstring, the shared imports and the per-graph imports.
        file = open(file_path, "w")
        file.write("\n".join([
            "\"\"\"Module providing graphs available from {repository_name}.{references}\"\"\"".format(
                repository_name=self.get_formatted_repository_name(),
                references="\n\n{}\n".format(self.format_references(
                    first_references)) if has_unique_references else ""
            ),
            "from ensmallen import Graph  # pylint: disable=import-error",
            self.get_graph_retrieval_import(),
            *imports,
            "",
            ""
        ]))
        graph_repository_metadata = {}
        for graph_data_path in tqdm(
            glob(os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "graph_repositories",
                self.get_formatted_repository_name(),
                "*.json.gz"
            )),
            desc="Building graph retrieval methods for {}".format(self.name),
            leave=False,
            dynamic_ncols=True
        ):
            graph_data = compress_json.load(graph_data_path)
            first_graph_version_data = list(graph_data.values())[0]
            graph_name = first_graph_version_data["graph_name"]
            graph_method_name = first_graph_version_data["graph_method_name"]
            graph_retrieval_file = self.format_graph_retrieval_file(
                graph_name=graph_name,
                graph_method_name=graph_method_name,
                references=first_graph_version_data["references"],
                versions=list(graph_data.keys()),
                has_unique_references=has_unique_references
            )
            for value in graph_data.values():
                value.pop("references")
            graph_repository_metadata[graph_method_name] = graph_data
            file.write(graph_retrieval_file)

        file.close()
        compress_json.dump(
            graph_repository_metadata,
            "{}.json.gz".format(target_directory_path)
        )
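The same glob expression is assembled three times in `build_all`; a small helper method would remove the duplication. A sketch, assuming it is added to the same class (`_graph_data_paths` is not part of the original code):

import os
from glob import glob
from typing import List


def _graph_data_paths(self) -> List[str]:
    """Hypothetical helper returning every cached repository metadata file."""
    return glob(os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "graph_repositories",
        self.get_formatted_repository_name(),
        "*.json.gz"
    ))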
Example no. 12
@app.route('/', methods=['GET', 'POST'])
def home():
    global result
    global count
    global search_word
    if request.method == 'POST':
        search_word = request.form["search bar"]
        result = search(search_word)
        count = 1
        return redirect(url_for("result_found"))
    else:
        return render_template("search.html")


if __name__ == '__main__':
    if not (os.path.isfile('comp_data.json.gz')
            and os.access('comp_data.json.gz', os.R_OK)):
        if not (os.path.isfile('data.txt') and os.access('data.txt', os.R_OK)):
            with open("data.txt", "w", encoding='utf8') as jsonfile:
                open_files_and_collect_data()
                collect_yap_data()
                json.dump(data, jsonfile, ensure_ascii=False)

        with open('data.txt', encoding='utf8') as data_json:
            data_load = json.load(data_json)
            compress_json.dump(data_load, "comp_data.json.gz")

    data = compress_json.load("comp_data.json.gz")
    app.run()
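The `__main__` block implements a build-once cache: the expensive data collection runs only when `comp_data.json.gz` is missing, and every later start loads the compressed file directly. The same pattern in isolation (the function names here are placeholders):

import os

import compress_json


def load_or_build_cache(cache_path: str, build):
    """Return cached data, building and compressing it on first use."""
    if not (os.path.isfile(cache_path) and os.access(cache_path, os.R_OK)):
        compress_json.dump(build(), cache_path)
    return compress_json.load(cache_path)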
Example no. 13
def get_graph_names(metadata_path: str) -> List[str]:
    """Return name of available graphs."""
    return list(compress_json.load(metadata_path).keys())
Example no. 14
def bench_second_order_walks(library: str,
                             graph_name: str,
                             metadata_path: str,
                             root: str,
                             seconds: int,
                             length: int = 100,
                             iterations: int = 1,
                             p: float = 2.0,
                             q: float = 2.0):
    """Benches executing random walks the given graph using given library.

    Parameters
    -----------------------
    library: str,
        Library to use for the benchmark.
    graph_name: str,
        Graph to use for the benchmark.
    metadata_path: str,
        Path from where to load the graph metadata.
    root: str,
        Directory from where to load the graph.
    seconds: int,
        Number of seconds to wait after a successful execution.
    length: int = 100,
        Length of the random walks.
    iterations: int = 1,
        Number of iterations to execute.
    p: float = 2.0,
        Inverse of the return weight.
    q: float = 2.0,
        Inverse of the explore weight.
    """
    validate_graph_and_library(library, graph_name, metadata_path)
    metadata = compress_json.load(metadata_path)
    if "disabled" in metadata:
        return
    data = metadata[graph_name]
    report = get_graph_report(data, root)

    walkers = libraries[library]["second_order_walk"]

    if p == 2.0 and q == 1.0:
        task_name = "second_order_walk_only_p"
    elif q == 2.0 and p == 1.0:
        task_name = "second_order_walk_only_q"
    else:
        task_name = "second_order_walk"

    log_path = "results/{graph_name}/{library}/{task_name}.csv".format(
        root=root, graph_name=graph_name, library=library, task_name=task_name)

    # If the library has already been tracked we skip it.
    # The same applies when it is known that this library cannot handle the graph.
    if os.path.exists(log_path) or not can_load(root, walkers["load_graph"],
                                                graph_name):
        return

    graph = load_graph(walkers["load_graph"], data, root, report, p=p, q=q)

    with Tracker(log_path):
        walkers["walk"](
            graph,
            length=length,
            iterations=iterations,
            max_degree=int(report["max_degree"]),
            nodes_number=int(report["nodes_number"]),
            p=p,
            q=q,
        )
    wait_k_seconds(seconds)
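Given the `task_name` branching above, varying only one of the two Node2Vec parameters is logged under a dedicated task name. For instance (the argument values below are illustrative, not from the source):

# Logged as "second_order_walk_only_p": p is varied while q stays at 1.0.
bench_second_order_walks(
    library="some_library",      # hypothetical library key
    graph_name="some_graph",     # hypothetical graph name
    metadata_path="metadata.json",
    root="graphs",
    seconds=10,
    p=2.0,
    q=1.0,
)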