Example #1
def getSplitProject(featureJob: signac.Project.Job):
    try:
        splitProject = signac.get_project(root=featureJob.workspace(),
                                          search=False)
    except LookupError:
        splitProject = signac.init_project("SyntheticExperimentSplits",
                                           root=featureJob.workspace(),
                                           workspace="splits")
    return splitProject
Example #2
def getFeatureProject(graphJob: signac.Project.Job):
    try:
        featureProject = signac.get_project(root=graphJob.workspace(),
                                            search=False)
    except LookupError:
        featureProject = signac.init_project("SyntheticExperimentFeatures",
                                             root=graphJob.workspace(),
                                             workspace="features")
    return featureProject
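
A minimal usage sketch of the two helpers above, assuming the legacy signac 1.x API used throughout these examples; the top-level project name, state points, and paths here are illustrative only.

import signac

# Hypothetical top-level project of graph jobs.
graphProject = signac.init_project("SyntheticExperiments", root="experiments")
graphJob = graphProject.open_job({"graphName": "demo", "numClass": 2}).init()

# Nested feature project rooted inside the graph job's workspace.
featureProject = getFeatureProject(graphJob)
featureJob = featureProject.open_job({"feature_type": "unmodified"}).init()

# Nested split project rooted inside the feature job's workspace.
splitProject = getSplitProject(featureJob)
splitJob = splitProject.open_job({"split_config": "20c__1000"}).init()
print(splitJob.workspace())
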
Example #3
def clear_workspace(job: signac.Project.Job):
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(
            job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".
                  format(job.get_id(), exp_regex, feature_graph_name))
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(
                featureJob).find_jobs(task_args.split_filter,
                                      task_args.split_doc_filter):
            print("[run_model@{}] Filter {} not matching; skip on dataset {}".
                  format(job.get_id(),
                         (task_args.split_filter, task_args.split_doc_filter),
                         feature_graph_name))
            continue
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)
        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
        if workspaceDirObj.exists():
            assert workspaceDirObj.is_dir()
            if task_args.model_args:
                try:
                    modelProject = signac.get_project(
                        root=str(workspaceDirObj), search=False)
                    md5_str = "_".join(
                        map(lambda x: calculate_md5(splitJob.fn(x)),
                            feature_graph_files))
                    for args in task_args.model_args:
                        if task_args.arg_regex is not None and re.search(
                                task_args.arg_regex, args) is None:
                            print(
                                "[run_model@{}] Regex {} not matching; skip on args {}"
                                .format(job.get_id(), task_args.arg_regex,
                                        args))
                            continue
                        run_id = "{}@{}".format(args, md5_str)
                        for model_job in modelProject.find_jobs(
                                filter={"run_id": run_id}):
                            print("Removing folder {}".format(
                                model_job.workspace()))
                            shutil.rmtree(model_job.workspace())
                except LookupError:
                    pass
            else:
                print("Removing folder {}".format(workspaceDirObj))
                shutil.rmtree(str(workspaceDirObj))
Example #4
def feature_file_iter(job: signac.Project.Job):
    for featureJob in utils.signac_tools.feature_iter(job):
        try:
            feature_type = featureJob.sp.feature_type
        except:
            print(job.get_id(), featureJob.get_id())
            raise
        if feature_type == "naive":
            yield featureJob.fn("{}-{}-{}.allx".format(job.sp.graphName, feature_type, featureJob.sp.var_factor))
        elif feature_type == "naive_npz":
            yield featureJob.fn("{}-{}-{}.allx.npz".format(job.sp.graphName, feature_type, featureJob.sp.var_factor))
        elif feature_type == "sample":
            type_str = featureJob.sp.sample_type
            if type_str == "cora_row":
                yield featureJob.fn(f"{job.sp.graphName}-{feature_type}-{type_str}.allx.npz")
            elif type_str in ["ogbn"]:
                pass
            else:
                raise NotImplementedError(
                    f"{job.sp.graphName}-{feature_type}-{type_str}")
        elif feature_type in ["unmodified"]:
            if job.sp.method == "planetoid":
                pass
            elif job.sp.method in ["GeomGCN", "SparseGraph"]:
                yield featureJob.fn(f"{job.sp.graphName}-{feature_type}.allx.npz")
            else:
                raise NotImplementedError(
                    f"{job.sp.graphName}-{job.sp.method}-{feature_type}")
        else:
            raise ValueError("Unknown feature type {}".format(feature_type))
Example #5
def reset_random_state(job: signac.Project.Job, seed=None):
    global random_state
    if seed is None:
        seed = job.get_id()
    np_seed = random.Random(seed).randrange(0, 2**32)
    random_state = np.random.RandomState(np_seed)
    print(f"[{job.get_id()}] Random seed is set to {np_seed}")
    return random_state
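
A short illustration of the seeding idiom above: a signac job id (a hex string) is hashed deterministically by random.Random, so the derived 32-bit NumPy seed, and hence the generated splits and features, are reproducible per job. The job id below is made up.

import random
import numpy as np

np_seed = random.Random("9bfd29df07674bc4aa960de661b8c823").randrange(0, 2**32)
random_state = np.random.RandomState(np_seed)
print(random_state.randint(10, size=3))  # same output on every run for the same id
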
Example #6
def graphsage_result_parser(job_m: signac.Project.Job, args):
    with open(job_m.fn("results.json"), "r") as json_in:
        exp_metrics = json.load(json_in)
    args.csv_data_dict["Train Epoch"] = exp_metrics["epoch"]
    args.csv_data_dict["Test Acc"] = exp_metrics["test_accuracy"]
    args.csv_data_dict["Val Acc"] = exp_metrics["val_acc"]
    args.csv_data_dict["Train Acc"] = exp_metrics["train_acc"]
    args.csv_data_dict["Train Loss"] = exp_metrics["train_loss"]
Example #7
def calculate_statistics(job: signac.Project.Job):
    G = nx.read_gpickle(
        job.fn(job.sp.graphName + ".gpickle.gz"))  # type: nx.Graph
    ally = pickle.load(
        open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
    for stat_key, (stat_func, addToDoc, addToData) in graph_stats.stats_dict.items():
        if (addToData and (stat_key not in job.data)) or (addToDoc and (stat_key not in job.doc)) or (addToData > 1) or (addToDoc > 1):
            print(
                f"[calculate_statistics@{job.get_id()}] Calculating {stat_key}...")
            resultDict = stat_func(G=G, ally=ally, job=job)
            assert stat_key in resultDict
            for key, value in resultDict.items():
                addToDocResult = graph_stats.stats_dict[key][1]
                addToDataResult = graph_stats.stats_dict[key][2]
                if addToDocResult:
                    job.doc[key] = value
                if addToDataResult:
                    job.data[key] = value
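
A hedged sketch of the stats_dict layout implied by the loop above: each entry maps a statistic key to (stat_func, addToDoc, addToData), and stat_func(G=..., ally=..., job=...) returns a dict containing at least its own key. The function body and flags below are illustrative, not the project's actual definitions.

import networkx as nx

def degree_stats(G: nx.Graph, ally=None, job=None):
    degrees = [d for _, d in G.degree()]
    return {
        "avg_degree": sum(degrees) / len(degrees),
        "min_degree": min(degrees),
        "max_degree": max(degrees),
    }

stats_dict = {
    "avg_degree": (degree_stats, True, False),  # written to job.doc only
    "min_degree": (degree_stats, True, False),
    "max_degree": (degree_stats, True, False),
}

print(degree_stats(nx.path_graph(4)))
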
Example #8
def generate_csv(job: signac.Project.Job, args):
    textBuffer = StringIO()
    args.csv_data_dict["numClass"] = job.sp.numClass
    h = job.sp.get("h", None)
    if h is not None:
        args.csv_data_dict["h"] = "{:.2f}".format(h)
    else:
        args.csv_data_dict["h"] = job.sp.source_name
    args.csv_data_dict["numNodes"] = job.doc["numNodes"]
    args.csv_data_dict["numEdges"] = job.doc["numEdges"]
    args.csv_data_dict["Graph ID"] = job.get_id()
    args.csv_data_dict["method"] = job.sp.method
    args.csv_data_dict["Graph Name"] = job.sp.graphName
    args.csv_data_dict["Clustering Coefficient"] = job.doc[
        "avgClusteringCoeff"]
    args.csv_data_dict["Min Degree"] = job.doc["min_degree"]
    args.csv_data_dict["Max Degree"] = job.doc["max_degree"]
    args.csv_data_dict["Average Degree"] = job.doc["avg_degree"]
    args.csv_data_dict["AverageSPLength"] = job.doc["avgSPLength"]
    args.csv_data_dict["numTriangles"] = job.doc["numTotalTriangles"]
    args.csv_data_dict["homoEdgeRatio"] = job.doc["homoEdgeRatio"]

    G = nx.read_gpickle(job.fn(job.sp.graphName +
                               ".gpickle.gz"))  # type: nx.Graph
    args.csv_data_dict["numComponent"] = job.doc["numComponents"]
    componentSize = np.array(
        list(map(lambda S: len(S.nodes), nx.connected_component_subgraphs(G))))
    args.csv_data_dict["maxComponentSize"] = np.max(componentSize)
    args.csv_data_dict["meanComponentSize"] = np.mean(componentSize)

    assert len(args.csv_data_dict) == len(
        args.csv_header_list), args.csv_data_dict.keys()
    # Write to text buffer
    textBuffer.write(",".join(map(str, args.csv_data_dict.values())) + "\n")

    # Write to the result file
    if not args.csv_file_generated:
        with open(args.output, "w") as csv_out:
            csv_out.write(",".join(args.csv_header_list) + "\n")
            csv_out.write(textBuffer.getvalue())
            args.csv_file_generated = True
    else:
        with open(args.output, "a") as csv_out:
            csv_out.write(textBuffer.getvalue())
Example #9
def clear_job(job: signac.Project.Job):
    workspaceDirObj = Path(job.workspace())
    for child in workspaceDirObj.iterdir():
        if child.name not in ["signac_statepoint.json", "signac_job_document.json"]:
            if child.is_dir():
                print(f"Deleting directory {child}")
                shutil.rmtree(str(child))
            else:
                print(f"Deleting {child}")
                child.unlink()
Example #10
def mixhop_result_parser(job_m: signac.Project.Job, args):
    resultDirObj = Path(job_m.fn("results"))
    result_json_name = None
    for item in resultDirObj.iterdir():
        if item.match("*.json"):
            assert result_json_name is None
            result_json_name = str(item)
    with open(result_json_name, "r") as json_in:
        exp_metrics = json.load(json_in)
    args.csv_data_dict["Train Epoch"] = exp_metrics["at_best_validate"][2]
    args.csv_data_dict["Test Acc"] = exp_metrics["at_best_validate"][1]
    args.csv_data_dict["Val Acc"] = exp_metrics["at_best_validate"][0]
Example #11
def clean_workspace(job: signac.Project.Job):
    for _, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(
            job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".
                  format(job.get_id(), exp_regex, feature_graph_name))
            continue
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)
        if all(map(splitJob.isfile, feature_graph_files)):
            md5_str = "_".join(
                map(lambda x: calculate_md5(splitJob.fn(x)),
                    feature_graph_files))
        else:
            md5_str = None
            print(
                f"[clean_workspace@{job.get_id()}] Missing files for split {feature_graph_name}"
            )

        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
        if workspaceDirObj.exists():
            assert workspaceDirObj.is_dir()
            try:
                modelProject = signac.get_project(root=str(workspaceDirObj),
                                                  search=False)
                for model_job in modelProject:
                    if not model_job.doc.get("succeeded", False):
                        target_dir = model_job.workspace()
                        print(
                            f"[clean_workspace@{job.get_id()}] Experiment not succeeded: removing folder {target_dir}"
                        )
                        shutil.rmtree(target_dir)
                    elif (md5_str is not None) and (
                            not model_job.sp.run_id.endswith(md5_str)):
                        target_dir = model_job.workspace()
                        print(
                            f"[clean_workspace@{job.get_id()}] Experiment not matching current data: removing folder {target_dir}"
                        )
                        shutil.rmtree(target_dir)
            except LookupError:
                pass
Example #12
def generate_csv(job: signac.Project.Job, args):
    textBuffer = StringIO()
    textList = []
    args.csv_data_dict["numClass"] = job.sp.numClass
    try:
        args.csv_data_dict["h"] = "{:.2f}".format(job.sp.h)
    except AttributeError:
        args.csv_data_dict["h"] = job.sp.HName
    args.csv_data_dict["Graph ID"] = job.get_id()
    args.csv_data_dict["Clustering Coefficient"] = job.doc.get(
        "avgClusteringCoeff")
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(
            job):
        feature_file = featureJob.doc.get("feature_file")
        if featureJob.doc.get("feature_name"):
            args.csv_data_dict["Feature"] = featureJob.doc["feature_name"]
        else:
            args.csv_data_dict["Feature"] = Path(
                feature_file.replace(job.sp.graphName + "-", "")).stem
        args.csv_data_dict["Graph Name"] = feature_graph_name
        args.csv_data_dict["Split Config"] = splitJob.sp.split_config
        md5_str = "_".join(
            map(lambda x: calculate_md5(splitJob.fn(x)), feature_graph_files))
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)

        # Workspace path
        workspaceDirObj = datasetDirObj / args.workspaceRoot
        try:
            gcnProject = signac.get_project(root=str(workspaceDirObj),
                                            search=False)
        except LookupError as e:
            print(e, file=sys.stderr)
            continue

        if args.exp_args is not None:
            exp_arg_list = args.exp_args
        elif args.add_args:
            exp_arg_list = list(
                set(splitJob.doc.get(args.exp_type, default=[]))
                | set(args.add_args))
        else:
            exp_arg_list = splitJob.doc.get(args.exp_type, default=[])

        for exp_args in exp_arg_list:
            args.csv_data_dict["Model Args"] = '"{}"'.format(exp_args)
            run_id = "{}@{}".format(exp_args, md5_str)
            job_iter = gcnProject.find_jobs(filter={"run_id": run_id})
            if any(
                    map(lambda job_i: job_i.doc.get("succeeded", False),
                        job_iter)):
                assert len(job_iter) == 1, (args.csv_data_dict, run_id)
                # Parse experiment results
                for job_m in job_iter:
                    args.csv_data_dict["Experiment ID"] = job_m.get_id()
                    args.result_parser(job_m, args)
                    if args.path_only:
                        path = [
                            job.get_id(),
                            featureJob.get_id(),
                            splitJob.get_id(), "/", args.workspaceRoot,
                            job_m.get_id()
                        ]
                        args.csv_data_dict["Job Path"] = json.dumps(path)
                assert len(args.csv_data_dict) == len(args.csv_header_list)

                # Write to text buffer
                textBuffer.write(
                    ",".join(map(str, args.csv_data_dict.values())) + "\n")
                textList.append(list(map(str, args.csv_data_dict.values())))

    if not args.path_only:
        # Write to the result file
        if not args.csv_file_generated:
            print(f"CSV will be saved to {args.output}")
            with open(args.output, "w") as csv_out:
                csv_out.write(",".join(args.csv_header_list) + "\n")
                csv_out.write(textBuffer.getvalue())
                args.csv_file_generated = True
        else:
            with open(args.output, "a") as csv_out:
                csv_out.write(textBuffer.getvalue())
    else:
        # Write to the result file
        csv_writer = csv.writer(sys.stdout)
        if not args.csv_file_generated:
            csv_writer.writerow(args.csv_header_list)
            csv_writer.writerows(textList)
            args.csv_file_generated = True
        else:
            csv_writer.writerows(textList)
Example #13
def generate_split(job: signac.Project.Job):
    graph = pickle.load(
        open(job.fn(job.sp.graphName + ".graph"), "rb"), encoding="bytes")
    ally = pickle.load(
        open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
    G = nx.read_gpickle(
        job.fn(job.sp.graphName + ".gpickle.gz"))  # type: nx.Graph
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        if splitJob.sp.get("split_index", None) is None:
            feature_generation.random_state = reset_random_state(
                job, (job.get_id(), feature_graph_name))
        else:
            feature_generation.random_state = reset_random_state(
                job, (splitJob.get_id(), feature_graph_name))
        if all(map(splitJob.isfile, feature_graph_files)):
            print("[generate_split@{}] Skipping {}".format(
                job.get_id(), feature_graph_name))
        else:
            print("[generate_split@{}] Generating split for {}".format(
                job.get_id(), feature_graph_name))
            if featureJob.doc.get("feature_file"):
                feature_file = featureJob.fn(featureJob.doc["feature_file"])
                if splitJob.sp.get("split_source"):
                    if splitJob.sp.source_format == "GeomGCN":
                        with splitJob:
                            with np.load(splitJob.sp.split_source) as splits_file:
                                train_mask = splits_file['train_mask']
                                val_mask = splits_file['val_mask']
                                test_mask = splits_file['test_mask']
                        train_indices = np.where(train_mask)[0]
                        val_indices = np.where(val_mask)[0]
                        test_indices = np.where(test_mask)[0]
                        feature_generation.generate_split(
                            job, graph, ally, G, feature_file,
                            splitJob, feature_graph_name, feature_graph_files,
                            train_indices=train_indices,
                            validation_indices=val_indices,
                            test_indices=test_indices)
                elif job.sp.method == "planetoid":
                    # Need to merge the code which copies the files (elif -> if)
                    raise NotImplementedError()
                else:
                    feature_generation.generate_split(
                        job, graph, ally, G, feature_file, splitJob, feature_graph_name, feature_graph_files)

            elif featureJob.sp.feature_type == "unmodified":
                if job.sp.method == "planetoid":
                    for source_file, dest_file in [
                        (job.fn(f"data_source/{job.sp.datasetName}.{ext}"),
                         splitJob.fn(f"{feature_graph_name}.{ext}"))
                        for ext in ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph', 'test.index']
                    ]:
                        shutil.copy2(source_file, dest_file)
                    assert all(map(splitJob.isfile, feature_graph_files))
                    splitJob.doc["succeeded"] = True
                    splitJob.doc["split_name"] = feature_graph_name
                else:
                    raise NotImplementedError()

            elif featureJob.sp.feature_type == "sample" and featureJob.sp.sample_type == "ogbn":
                feature_generation.ogbn_generate_split(
                    job, splitJob, feature_graph_name, feature_graph_files)
            else:
                raise ValueError()
Example #14
def generate_graph(job: signac.Project.Job):
    print("Generating graph for job {}".format(job.get_id()))
    graphgen.random_state = reset_random_state(job)
    if job.sp.method == "mixhop":
        generator = graphgen.MixhopGraphGenerator(
            job.sp.classRatio, job.sp.heteroClsWeight, heteroWeightsExponent=job.sp.heteroWeightsExponent)
        G = generator(job.sp.numNode, job.sp.m, job.sp.m0, job.sp.h)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)

    elif job.sp.method == "planetoid":
        with job:
            dataset = utils.PlanetoidData(job.sp.datasetName, "data_source")
        G = dataset.getNXGraph()
        generator = graphgen.GraphGenerator(job.sp.numClass)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)

        featureProject = utils.signac_tools.getFeatureProject(job)
        featureJob = featureProject.open_job({
            "feature_type": "unmodified"
        }).init()

        splitProject = utils.signac_tools.getSplitProject(featureJob)
        trainSetSize = dataset.y_all[dataset.train_mask].sum(0)
        if len(np.unique(trainSetSize)) == 1:
            trainSetSize = "{}c".format(int(trainSetSize[0]))
        else:
            trainSetSize = int(dataset.train_mask.sum())
        splitJob = splitProject.open_job({
            "split_config": "{}__{}".format(trainSetSize, int(dataset.test_mask.sum()))
        }).init()
    elif job.sp.method == "GeomGCN":
        with job:
            dataset = utils.GeomGCNData(job.sp.datasetName, "data_source")
        G = dataset.getNXGraph()
        generator = graphgen.GraphGenerator(job.sp.numClass)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)

        featureProject = utils.signac_tools.getFeatureProject(job)
        featureJob = featureProject.open_job({
            "feature_type": "unmodified"
        }).init()

        output_name = f"{job.sp.graphName}-unmodified.allx.npz"
        allx = dataset.features
        allx = scipy.sparse.csr_matrix(allx)
        scipy.sparse.save_npz(featureJob.fn(output_name), allx)

        featureJob.doc["feature_file"] = output_name
        featureJob.doc["feature_name"] = f"{job.sp.datasetName}-unmodified"
        featureJob.doc["succeeded"] = True
    elif job.sp.method == "SparseGraph":
        with job:
            spgraph = sparsegraph.io.load_dataset(
                str(Path("data_source")/job.sp.datasetName))

        for command in job.sp.get("preprocess", []):
            exec(command)

        G = spgraph.getNXGraph()
        generator = graphgen.GraphGenerator(job.sp.numClass)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)

        featureProject = utils.signac_tools.getFeatureProject(job)
        featureJob = featureProject.open_job({
            "feature_type": "unmodified"
        }).init()

        if spgraph.attr_matrix is not None:
            # Generate features
            output_name = f"{job.sp.graphName}-unmodified.allx.npz"
            allx = spgraph.attr_matrix
            allx = scipy.sparse.csr_matrix(allx)
            scipy.sparse.save_npz(featureJob.fn(output_name), allx)

            featureJob.doc["feature_file"] = output_name
            featureJob.doc["feature_name"] = f"{job.sp.datasetName}-unmodified"
            featureJob.doc["succeeded"] = True

    elif job.sp.method == "copy":
        graph_path, ally_path, ty_path, test_index_path = map(
            lambda x: job.fn("source_graph/{}{}".format(job.sp.source_name, x)), (".graph", ".ally", ".ty", ".test.index"))
        graph = pickle.load(open(graph_path, "rb"))
        G = nx.from_dict_of_lists(graph)
        ally = np.load(ally_path, allow_pickle=True)
        ty = np.load(ty_path, allow_pickle=True)

        attrs = dict()
        for i in range(ally.shape[0]):
            color = np.nonzero(ally[i, :])[0] + 1
            assert len(color) == 1, print(i, color)
            color = color[0]
            attrs[i] = {"color": color}

        for i, line in enumerate(open(test_index_path, "r")):
            node_id = int(line.strip())
            color = np.nonzero(ty[i, :])[0] + 1
            assert len(color) == 1, print(i, color)
            color = color[0]
            attrs[node_id] = {"color": color}

        assert i == ty.shape[0] - 1
        assert len(attrs) == len(G.node)
        nx.set_node_attributes(G, attrs)

        generator = graphgen.GraphGenerator(job.sp.numClass)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)
    else:
        raise ValueError("Unknown generation method {}".format(job.sp.method))
Example #15
def generate_feature(job: signac.Project.Job):
    for featureJob in utils.signac_tools.feature_iter(job):
        feature_type = featureJob.sp.feature_type
        if feature_type == "naive":
            type_str = featureJob.sp.var_factor
            output_name = "{}-{}-{}.allx".format(
                job.sp.graphName, feature_type, type_str)
            if featureJob.isfile(output_name):
                print("[generate_feature@{}] {} already exists. Skipping...".format(
                    job.get_id(), output_name))
                continue
            print("[generate_feature@{}] Generating features to {}".format(
                job.get_id(), output_name))
            ally = pickle.load(
                open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
            if type_str == "all":
                allx = ally
            else:
                raise NotImplementedError()
            np.save(open(featureJob.fn(output_name), "wb"), allx)

            featureJob.doc["feature_file"] = output_name
            featureJob.doc["feature_name"] = f"{feature_type}-{type_str}"
            featureJob.doc["succeeded"] = True

        elif feature_type == "naive_npz":
            type_str = featureJob.sp.var_factor
            output_name = "{}-{}-{}.allx.npz".format(
                job.sp.graphName, feature_type, type_str)
            if featureJob.isfile(output_name):
                print("[generate_feature@{}] {} already exists. Skipping...".format(
                    job.get_id(), output_name))
                continue
            print("[generate_feature@{}] Generating features to {}".format(
                job.get_id(), output_name))
            ally = pickle.load(
                open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
            if type_str == "all":
                allx = ally
            elif type_str == "identity":
                allx = np.eye(ally.shape[0])
            else:
                raise NotImplementedError()
            allx = scipy.sparse.csr_matrix(allx)
            scipy.sparse.save_npz(featureJob.fn(output_name), allx)

            featureJob.doc["feature_file"] = output_name
            featureJob.doc["feature_name"] = f"{feature_type}-{type_str}"
            featureJob.doc["succeeded"] = True

        elif feature_type == "sample":
            type_str = featureJob.sp.sample_type
            if type_str == "cora_row":
                output_name = "{}-{}-{}.allx.npz".format(
                    job.sp.graphName, feature_type, type_str)
                if featureJob.isfile(output_name):
                    print("[generate_feature@{}] {} already exists. Skipping...".format(
                        job.get_id(), output_name))
                    continue
                ally = pickle.load(
                    open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
                cora = utils.get_cora()
                classSize = np.sum(ally, axis=0)
                if cora.feature_sample_eligible(classSize):
                    print("[generate_feature@{}] Generating features to {} by row-based cora feature sampling".format(
                        job.get_id(), output_name))
                    feature_generation.random_state = reset_random_state(
                        job, (job.get_id(), output_name))
                    allx = feature_generation.row_sample(ally, cora)
                    allx = scipy.sparse.csr_matrix(allx)
                    scipy.sparse.save_npz(featureJob.fn(output_name), allx)

                    featureJob.doc["feature_file"] = output_name
                    featureJob.doc["succeeded"] = True
                else:
                    featureJob.doc["disabled"] = True
                    featureJob.doc["disable_reason"] = f"[generate_feature@{job.get_id()}] {job.sp.graphName} is ineligible for row-based cora feature sampling"
                    print(featureJob.doc["disable_reason"])
            elif type_str in ["ogbn"]:
                if not featureJob.doc["succeeded"]:
                    raise ValueError(
                        f"[generate_feature@{job.get_id()}] {type_str} feature is not marked as succeeded for job {featureJob.get_id()}")
            else:
                raise NotImplementedError()

        elif feature_type == "unmodified":
            if job.sp.method == "planetoid":

                # This block is incompatible with what the current structure shows.
                output_name = f"{job.sp.datasetName}-{feature_type}.allx.npz"
                with job:
                    dataset = utils.PlanetoidData(
                        job.sp.datasetName, "data_source")
                allx = dataset.features
                allx = scipy.sparse.csr_matrix(allx)
                scipy.sparse.save_npz(featureJob.fn(output_name), allx)

                featureJob.doc["feature_file"] = output_name
                ###

                featureJob.doc["feature_name"] = f"{job.sp.datasetName}-{feature_type}"
                featureJob.doc["succeeded"] = True

            elif job.sp.method == "GeomGCN":
                output_name = f"{job.sp.graphName}-{feature_type}.allx.npz"
                if featureJob.isfile(output_name) and featureJob.doc.get("succeeded", False):
                    print("[generate_feature@{}] {} already exists. Skipping...".format(
                        job.get_id(), output_name))
                    continue
                print("[generate_feature@{}] Write Geom-GCN features to {}".format(
                    job.get_id(), output_name))
                with job:
                    dataset = utils.GeomGCNData(
                        job.sp.datasetName, "data_source")
                allx = dataset.features
                allx = scipy.sparse.csr_matrix(allx)
                scipy.sparse.save_npz(featureJob.fn(output_name), allx)

                featureJob.doc["feature_file"] = output_name
                featureJob.doc["feature_name"] = f"{job.sp.datasetName}-unmodified"
                featureJob.doc["succeeded"] = True
            else:
                raise NotImplementedError(
                    f"{job.sp.graphName}-{job.sp.method}-{feature_type}")
        else:
            raise ValueError("Unknown feature type {}".format(feature_type))
Example #16
def getModelProject(splitJob: signac.Project.Job, modelRoot: str):
    projectRoot = Path(splitJob.workspace()) / modelRoot
    modelProject = signac.get_project(root=str(projectRoot), search=False)
    return modelProject
Example #17
def ogbn_generate_split(job: signac.Project.Job, splitJob: signac.Project.Job,
                        feature_graph_name, feature_graph_files):
    import constraint
    with utils.chdir(splitJob.sp.ogbn_path):
        from ogb.nodeproppred import NodePropPredDataset
        d_name = splitJob.sp.ogbn_name

        lock = ogbnLockDict.setdefault(splitJob.sp.ogbn_path, threading.Lock())
        if not os.path.exists("dataset"):  # In case dataset is not downloaded
            lock.acquire()
            ogbnDataset = NodePropPredDataset(name=d_name)
            lock.release()
        else:
            ogbnDataset = NodePropPredDataset(name=d_name)

        split_idx = ogbnDataset.get_idx_split()
        train_idx, valid_idx, test_idx = split_idx["train"], split_idx[
            "valid"], split_idx["test"]
        graph, label = ogbnDataset[0]

    with job:
        splitJobSrc = utils.signac_tools.access_proj_job(
            job, splitJob.sp.feature_source, splitJob.sp.split_source)
        splitSrcName = splitJobSrc.doc["split_name"]
        # Copy not changing files
        for source_file, dest_file in [
            (splitJobSrc.fn(f"{splitSrcName}.{ext}"),
             splitJob.fn(f"{feature_graph_name}.{ext}"))
                for ext in ('y', 'ty', 'ally', 'graph', 'test.index')
        ]:
            shutil.copy2(source_file, dest_file)

        with splitJobSrc:
            datasetSrc = utils.PlanetoidData(splitJobSrc.doc.split_name,
                                             ".",
                                             val_size=None)

        ogbnLabelCount = np.zeros((3, ogbnDataset.num_classes))
        ogbnLabelCount[0, :] = (label[train_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[1, :] = (label[valid_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)
        ogbnLabelCount[2, :] = (label[test_idx] == np.arange(
            ogbnDataset.num_classes)).sum(0)

        srcLabelCount = np.zeros((3, job.sp.numClass))
        srcLabelCount[0, :] = datasetSrc.y_all[datasetSrc.train_mask, :].sum(0)
        srcLabelCount[1, :] = datasetSrc.y_all[datasetSrc.val_mask, :].sum(0)
        srcLabelCount[2, :] = datasetSrc.y_all[datasetSrc.test_mask, :].sum(0)

        problem = constraint.Problem()
        problem.addVariables(range(job.sp.numClass),
                             range(ogbnDataset.num_classes))
        problem.addConstraint(constraint.AllDifferentConstraint())
        for i in range(job.sp.numClass):
            problem.addConstraint(
                # Bind i via a default argument: python-constraint evaluates the
                # constraint lazily in getSolution(), so a plain closure would
                # only see the final value of i.
                lambda x, i=i: np.all(ogbnLabelCount[:, x] >= srcLabelCount[:, i]),
                (i, ))
        solution = problem.getSolution()

        for srcClass, dstClass in solution.items():
            assert np.all(
                ogbnLabelCount[:, dstClass] >= srcLabelCount[:, srcClass])

        newFeatures = np.zeros(
            (datasetSrc.num_samples, graph["node_feat"].shape[1]))
        for scope, idx in (("train", train_idx), ("val", valid_idx),
                           ("test", test_idx)):
            scope_mask = getattr(datasetSrc, f"{scope}_mask")
            for srcClass, dstClass in solution.items():
                srcOpMask = np.logical_and(scope_mask,
                                           datasetSrc.labels == srcClass)
                dstSampleSet = list(
                    set(idx).intersection(np.where(label == dstClass)[0]))
                sampleInds = random_state.choice(dstSampleSet,
                                                 srcOpMask.sum(),
                                                 replace=False)
                newFeatures[srcOpMask, :] = graph["node_feat"][sampleInds, :]

        x_mask = datasetSrc.train_mask
        allx_mask = (datasetSrc.train_mask + datasetSrc.val_mask)
        test_mask = datasetSrc.test_mask

        x = newFeatures[x_mask]
        allx = newFeatures[allx_mask]
        tx = newFeatures[test_mask]

        # .x; .tx; .allx
        pickle.dump(scipy.sparse.csr_matrix(x),
                    open(splitJob.fn(f"{feature_graph_name}.x"), "wb"))
        pickle.dump(scipy.sparse.csr_matrix(allx),
                    open(splitJob.fn(f"{feature_graph_name}.allx"), "wb"))
        pickle.dump(scipy.sparse.csr_matrix(tx),
                    open(splitJob.fn(f"{feature_graph_name}.tx"), "wb"))

        assert all(map(splitJob.isfile, feature_graph_files))
        splitJob.doc["succeeded"] = True
        splitJob.doc["split_name"] = feature_graph_name
        splitJob.doc.val_size = splitJobSrc.doc.val_size
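
A self-contained sketch of the class-matching step above, using the python-constraint API: each source class must be assigned a distinct ogbn class whose per-split label counts are at least as large. The count matrices here are made up for illustration.

import constraint
import numpy as np

srcLabelCount = np.array([[5, 3], [2, 2], [10, 8]])               # (split, source class)
ogbnLabelCount = np.array([[50, 4, 9], [20, 1, 3], [90, 7, 12]])  # (split, ogbn class)

problem = constraint.Problem()
problem.addVariables(range(srcLabelCount.shape[1]), range(ogbnLabelCount.shape[1]))
problem.addConstraint(constraint.AllDifferentConstraint())
for i in range(srcLabelCount.shape[1]):
    # Bind i via a default argument; constraints are evaluated lazily.
    problem.addConstraint(
        lambda x, i=i: np.all(ogbnLabelCount[:, x] >= srcLabelCount[:, i]), (i,))

print(problem.getSolution())  # e.g. {0: 0, 1: 2}
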
Example #18
def run_model(job: signac.Project.Job):
    logger = logging.getLogger('run_model@{}'.format(job.get_id()))
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    chFormatter = logging.Formatter(
        '[{asctime} {name} {levelname:>8}] {message}', '%m-%d %H:%M:%S', '{')
    ch.setFormatter(chFormatter)
    logger.addHandler(ch)

    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".format(
                job.get_id(), exp_regex, feature_graph_name))
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(featureJob).find_jobs(
                task_args.split_filter, task_args.split_doc_filter):
            print("[run_model@{}] Filter {} not matching; skip on dataset {}".format(
                job.get_id(),
                (task_args.split_filter, task_args.split_doc_filter),
                feature_graph_name))
            continue
        elif is_tuning() and (splitJob.sp.get("split_index", None) not in {None, 0}):
            print("[run_model@{}] Split index is not 0 for tuning; skip on dataset {}".format(
                job.get_id(), feature_graph_name))
            continue
        md5_str = "_".join(map(lambda x: calculate_md5(
            splitJob.fn(x)), feature_graph_files))
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)

        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot
        workspaceDirObj.mkdir(exist_ok=True, parents=True)
        modelProject = signac.init_project(
            name=expProjectName, root=str(workspaceDirObj))

        fh = logging.FileHandler(
            str(workspaceDirObj / "terminal_output.log"), "a")
        fh.setLevel(logging.DEBUG)
        fhFormatter = logging.Formatter(
            '[{asctime} {levelname:>8}] {message}', '%m-%d %H:%M:%S', '{')
        fh.setFormatter(fhFormatter)
        logger.addHandler(fh)

        exp_args_list = task_args.model_args or splitJob.doc.get(
            expCode, default=[])
        if exp_args_list == [] and is_tuning():
            exp_args_list = [""]
        for args in exp_args_list:
            if task_args.arg_regex is not None and re.search(task_args.arg_regex, args) is None:
                print("[run_model@{}] Regex {} not matching; skip on args {}".format(
                    job.get_id(), task_args.arg_regex, args))
                continue
            run_id = "{}@{}".format(args, md5_str)
            if is_tuning():
                run_id += "[tuning]"
                logger.removeHandler(fh)
            if any(map(lambda job_i: job_i.doc.get("succeeded", False), modelProject.find_jobs(filter={"run_id": run_id}))):
                print("[run_model@{}] Already run; skip on dataset {} for parameter {}".format(
                    job.get_id(), feature_graph_name, args))
            else:
                # Construct arguments
                args_split = args.split()
                dataset_args = dataset_args_func(
                    dataset_dir=dataset_dir, feature_graph_name=feature_graph_name,
                    run_id=run_id, workspaceDirObj=workspaceDirObj, task_args=task_args,
                    featureJob=featureJob, args=args, args_split=args_split, splitJob=splitJob
                )
                if dataset_args is None:
                    raise ValueError(
                        "dataset_args_func is not properly configured.")
                elif dataset_args is False:
                    print("[run_model@{}] Skip on dataset {} for parameter {}".format(
                        job.get_id(), feature_graph_name, args))
                    continue
                arg_list = [get_python_path(), "-u", modelScript] + \
                    dataset_args + args_split

                # Run model code
                print("[run_model@{}] run on dataset {} for parameter {}".format(
                    job.get_id(), feature_graph_name, args))
                try:
                    logger.info(
                        "===============\n>>>>Executing command {}\n===============".format(arg_list))
                    if not(job.doc.get("exp_terminal", False) or flags.log_to_terminal):
                        ch.setLevel(logging.WARNING)
                        ch.setFormatter(chFormatter)
                    if task_args.interactive:
                        proc = subprocess.Popen(
                            arg_list, cwd=str(modelPathObj))
                    else:
                        proc = subprocess.Popen(arg_list, cwd=str(modelPathObj),
                                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='utf-8')
                    if proc.stdout is not None:
                        msgcount = 0
                        for line in iter(proc.stdout.readline, ''):
                            msgcount += 1
                            logger.info(line.strip())
                            if msgcount % 100 == 0:
                                logger.debug("running on dataset {} for parameter {}".format(
                                    feature_graph_name, args))
                                msgcount = 0
                    returncode = proc.wait()
                    if returncode != 0:
                        raise subprocess.CalledProcessError(
                            returncode, arg_list)
                    else:
                        logger.debug("Completed on dataset {} for parameter {}".format(
                            feature_graph_name, args))
                except subprocess.CalledProcessError:
                    logger.error("Check log at {}".format(
                        workspaceDirObj / "terminal_output.log"))
                    raise
                logger.info("===============")
                ch.setLevel(logging.INFO)

                # Tag job as succeeded (except when tuning)
                assert len(modelProject.find_jobs(
                    filter={"run_id": run_id})) == 1
                if not task_args.tuning:
                    for job_m in modelProject.find_jobs(filter={"run_id": run_id}):
                        job_m.doc["succeeded"] = True
                else:
                    print("[run_model@{}]Job will not be tagged successful in tuning mode.".format(
                        job.get_id()))
        logger.removeHandler(fh)