def getSplitProject(featureJob: signac.Project.Job):
    try:
        splitProject = signac.get_project(root=featureJob.workspace(), search=False)
    except LookupError:
        splitProject = signac.init_project(
            "SyntheticExperimentSplits", root=featureJob.workspace(), workspace="splits")
    return splitProject

def getFeatureProject(graphJob: signac.Project.Job):
    try:
        featureProject = signac.get_project(root=graphJob.workspace(), search=False)
    except LookupError:
        featureProject = signac.init_project(
            "SyntheticExperimentFeatures", root=graphJob.workspace(), workspace="features")
    return featureProject

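# Hedged usage sketch (not part of the original module): the two helpers above form a
# nested signac hierarchy, graph job -> feature project -> split project. The project
# root path below is an assumption for illustration only.
def _walk_experiment_hierarchy(graph_project_root="path/to/graph/project"):
    graphProject = signac.get_project(root=graph_project_root)
    for graphJob in graphProject:
        featureProject = getFeatureProject(graphJob)
        for featureJob in featureProject:
            splitProject = getSplitProject(featureJob)
            for splitJob in splitProject:
                print(graphJob.get_id(), featureJob.get_id(), splitJob.get_id())
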
def clear_workspace(job: signac.Project.Job):
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".format(
                job.get_id(), exp_regex, feature_graph_name))
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(featureJob).find_jobs(
                task_args.split_filter, task_args.split_doc_filter):
            print("[run_model@{}] Filter {} not matching; skip on dataset {}".format(
                job.get_id(), (task_args.split_filter, task_args.split_doc_filter),
                feature_graph_name))
            continue
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)
        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
        if workspaceDirObj.exists():
            assert workspaceDirObj.is_dir()
            if task_args.model_args:
                try:
                    modelProject = signac.get_project(root=str(workspaceDirObj), search=False)
                    md5_str = "_".join(
                        map(lambda x: calculate_md5(splitJob.fn(x)), feature_graph_files))
                    for args in task_args.model_args:
                        if task_args.arg_regex is not None and re.search(
                                task_args.arg_regex, args) is None:
                            print("[run_model@{}] Regex {} not matching; skip on args {}".format(
                                job.get_id(), task_args.arg_regex, args))
                            continue
                        run_id = "{}@{}".format(args, md5_str)
                        for model_job in modelProject.find_jobs(filter={"run_id": run_id}):
                            print("Removing folder {}".format(model_job.workspace()))
                            shutil.rmtree(model_job.workspace())
                except LookupError:
                    pass
            else:
                print("Removing folder {}".format(workspaceDirObj))
                shutil.rmtree(str(workspaceDirObj))

def feature_file_iter(job: signac.Project.Job):
    for featureJob in utils.signac_tools.feature_iter(job):
        try:
            feature_type = featureJob.sp.feature_type
        except:
            print(job.get_id(), featureJob.get_id())
            raise
        if feature_type == "naive":
            yield featureJob.fn("{}-{}-{}.allx".format(
                job.sp.graphName, feature_type, featureJob.sp.var_factor))
        elif feature_type == "naive_npz":
            yield featureJob.fn("{}-{}-{}.allx.npz".format(
                job.sp.graphName, feature_type, featureJob.sp.var_factor))
        elif feature_type == "sample":
            type_str = featureJob.sp.sample_type
            if type_str == "cora_row":
                yield featureJob.fn(f"{job.sp.graphName}-{feature_type}-{type_str}.allx.npz")
            elif type_str in ["ogbn"]:
                pass
            else:
                raise NotImplementedError(f"{job.sp.graphName}-{feature_type}-{type_str}")
        elif feature_type in ["unmodified"]:
            if job.sp.method == "planetoid":
                pass
            elif job.sp.method in ["GeomGCN", "SparseGraph"]:
                yield featureJob.fn(f"{job.sp.graphName}-{feature_type}.allx.npz")
            else:
                raise NotImplementedError(
                    f"{job.sp.graphName}-{job.sp.method}-{feature_type}")
        else:
            raise ValueError("Unknown feature type {}".format(feature_type))

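# Hedged usage sketch (assumptions: `job` is a graph-level signac job and `os` is
# imported in this module): report which expected feature files are still missing.
def _report_missing_feature_files(job: signac.Project.Job):
    for path in feature_file_iter(job):
        if not os.path.isfile(path):
            print("Missing feature file:", path)
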
def reset_random_state(job: signac.Project.Job, seed=None):
    global random_state
    if seed is None:
        seed = job.get_id()
    np_seed = random.Random(seed).randrange(0, 2**32)
    random_state = np.random.RandomState(np_seed)
    print(f"[{job.get_id()}] Random seed is set to {np_seed}")
    return random_state

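# Hedged sketch of the reproducibility contract of reset_random_state: the seed is
# derived deterministically from the job id, so repeated calls on the same job yield
# identical NumPy random streams. `job` is an assumed signac job handle.
def _check_seed_reproducibility(job: signac.Project.Job):
    rs_a = reset_random_state(job)
    draw_a = rs_a.permutation(10)
    rs_b = reset_random_state(job)
    assert np.array_equal(draw_a, rs_b.permutation(10))
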
def graphsage_result_parser(job_m: signac.Project.Job, args):
    with open(job_m.fn("results.json"), "r") as json_in:
        exp_metrics = json.load(json_in)
    args.csv_data_dict["Train Epoch"] = exp_metrics["epoch"]
    args.csv_data_dict["Test Acc"] = exp_metrics["test_accuracy"]
    args.csv_data_dict["Val Acc"] = exp_metrics["val_acc"]
    args.csv_data_dict["Train Acc"] = exp_metrics["train_acc"]
    args.csv_data_dict["Train Loss"] = exp_metrics["train_loss"]

def calculate_statistics(job: signac.Project.Job):
    G = nx.read_gpickle(
        job.fn(job.sp.graphName + ".gpickle.gz"))  # type: nx.Graph
    ally = pickle.load(
        open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
    for stat_key, (stat_func, addToDoc, addToData) in graph_stats.stats_dict.items():
        if ((addToData and (stat_key not in job.data))
                or (addToDoc and (stat_key not in job.doc))
                or (addToData > 1) or (addToDoc > 1)):
            print(f"[calculate_statistics@{job.get_id()}] Calculating {stat_key}...")
            resultDict = stat_func(G=G, ally=ally, job=job)
            assert stat_key in resultDict
            for key, value in resultDict.items():
                addToDocResult = graph_stats.stats_dict[key][1]
                addToDataResult = graph_stats.stats_dict[key][2]
                if addToDocResult:
                    job.doc[key] = value
                if addToDataResult:
                    job.data[key] = value

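# Illustrative sketch (an assumption, not the project's actual registry) of the shape
# calculate_statistics expects from graph_stats.stats_dict: each value is a
# (stat_func, addToDoc, addToData) tuple, and stat_func(G=..., ally=..., job=...)
# must return a dict containing at least its own stat_key.
def _example_num_nodes(G, ally, job):
    return {"numNodes": G.number_of_nodes()}

_example_stats_dict = {
    "numNodes": (_example_num_nodes, True, False),  # write to job.doc, skip job.data
}
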
def generate_csv(job: signac.Project.Job, args):
    textBuffer = StringIO()
    args.csv_data_dict["numClass"] = job.sp.numClass
    h = job.sp.get("h", None)
    if h is not None:
        args.csv_data_dict["h"] = "{:.2f}".format(h)
    else:
        args.csv_data_dict["h"] = job.sp.source_name
    args.csv_data_dict["numNodes"] = job.doc["numNodes"]
    args.csv_data_dict["numEdges"] = job.doc["numEdges"]
    args.csv_data_dict["Graph ID"] = job.get_id()
    args.csv_data_dict["method"] = job.sp.method
    args.csv_data_dict["Graph Name"] = job.sp.graphName
    args.csv_data_dict["Clustering Coefficient"] = job.doc["avgClusteringCoeff"]
    args.csv_data_dict["Min Degree"] = job.doc["min_degree"]
    args.csv_data_dict["Max Degree"] = job.doc["max_degree"]
    args.csv_data_dict["Average Degree"] = job.doc["avg_degree"]
    args.csv_data_dict["AverageSPLength"] = job.doc["avgSPLength"]
    args.csv_data_dict["numTriangles"] = job.doc["numTotalTriangles"]
    args.csv_data_dict["homoEdgeRatio"] = job.doc["homoEdgeRatio"]

    G = nx.read_gpickle(job.fn(job.sp.graphName + ".gpickle.gz"))  # type: nx.Graph
    args.csv_data_dict["numComponent"] = job.doc["numComponents"]
    componentSize = np.array(
        list(map(lambda S: len(S.nodes), nx.connected_component_subgraphs(G))))
    args.csv_data_dict["maxComponentSize"] = np.max(componentSize)
    args.csv_data_dict["meanComponentSize"] = np.mean(componentSize)

    assert len(args.csv_data_dict) == len(args.csv_header_list), args.csv_data_dict.keys()
    # Write to text buffer
    textBuffer.write(",".join(map(str, args.csv_data_dict.values())) + "\n")

    # Write to the result file
    if not args.csv_file_generated:
        with open(args.output, "w") as csv_out:
            csv_out.write(",".join(args.csv_header_list) + "\n")
            csv_out.write(textBuffer.getvalue())
        args.csv_file_generated = True
    else:
        with open(args.output, "a") as csv_out:
            csv_out.write(textBuffer.getvalue())

def clear_job(job: signac.Project.Job):
    workspaceDirObj = Path(job.workspace())
    for child in workspaceDirObj.iterdir():
        if child.name not in ["signac_statepoint.json", "signac_job_document.json"]:
            if child.is_dir():
                print(f"Deleting directory {child}")
                shutil.rmtree(str(child))
            else:
                print(f"Deleting {child}")
                child.unlink()

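# Hedged sketch of the post-condition of clear_job: only the signac metadata files
# should remain in the job workspace afterwards. `job` is an assumed signac job.
def _check_cleared(job: signac.Project.Job):
    clear_job(job)
    leftover = {p.name for p in Path(job.workspace()).iterdir()}
    assert leftover <= {"signac_statepoint.json", "signac_job_document.json"}
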
def mixhop_result_parser(job_m: signac.Project.Job, args):
    resultDirObj = Path(job_m.fn("results"))
    result_json_name = None
    for item in resultDirObj.iterdir():
        if item.match("*.json"):
            assert result_json_name is None
            result_json_name = str(item)
    with open(result_json_name, "r") as json_in:
        exp_metrics = json.load(json_in)
    args.csv_data_dict["Train Epoch"] = exp_metrics["at_best_validate"][2]
    args.csv_data_dict["Test Acc"] = exp_metrics["at_best_validate"][1]
    args.csv_data_dict["Val Acc"] = exp_metrics["at_best_validate"][0]

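# Hedged wiring sketch (assumption: the actual mapping is configured by the CLI code
# elsewhere in the repo): generate_csv only calls args.result_parser(job_m, args), so
# either parser above can be selected per experiment type.
_example_result_parsers = {
    "graphsage": graphsage_result_parser,
    "mixhop": mixhop_result_parser,
}
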
def clean_workspace(job: signac.Project.Job):
    for _, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".format(
                job.get_id(), exp_regex, feature_graph_name))
            continue
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)
        if all(map(splitJob.isfile, feature_graph_files)):
            md5_str = "_".join(
                map(lambda x: calculate_md5(splitJob.fn(x)), feature_graph_files))
        else:
            md5_str = None
            print(f"[clean_workspace@{job.get_id()}] Missing files for split {feature_graph_name}")
        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot  # type: Path
        if workspaceDirObj.exists():
            assert workspaceDirObj.is_dir()
            try:
                modelProject = signac.get_project(root=str(workspaceDirObj), search=False)
                for model_job in modelProject:
                    if not model_job.doc.get("succeeded", False):
                        target_dir = model_job.workspace()
                        print(f"[clean_workspace@{job.get_id()}] Experiment not succeeded: removing folder {target_dir}")
                        shutil.rmtree(target_dir)
                    elif (md5_str is not None) and (not model_job.sp.run_id.endswith(md5_str)):
                        target_dir = model_job.workspace()
                        print(f"[clean_workspace@{job.get_id()}] Experiment not matching current data: removing folder {target_dir}")
                        shutil.rmtree(target_dir)
            except LookupError:
                pass

def generate_csv(job: signac.Project.Job, args):
    textBuffer = StringIO()
    textList = []
    args.csv_data_dict["numClass"] = job.sp.numClass
    try:
        args.csv_data_dict["h"] = "{:.2f}".format(job.sp.h)
    except AttributeError:
        args.csv_data_dict["h"] = job.sp.HName
    args.csv_data_dict["Graph ID"] = job.get_id()
    args.csv_data_dict["Clustering Coefficient"] = job.doc.get("avgClusteringCoeff")

    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        feature_file = featureJob.doc.get("feature_file")
        if featureJob.doc.get("feature_name"):
            args.csv_data_dict["Feature"] = featureJob.doc["feature_name"]
        else:
            args.csv_data_dict["Feature"] = Path(
                feature_file.replace(job.sp.graphName + "-", "")).stem
        args.csv_data_dict["Graph Name"] = feature_graph_name
        args.csv_data_dict["Split Config"] = splitJob.sp.split_config
        md5_str = "_".join(
            map(lambda x: calculate_md5(splitJob.fn(x)), feature_graph_files))
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)

        # Workspace path
        workspaceDirObj = datasetDirObj / args.workspaceRoot
        try:
            gcnProject = signac.get_project(root=str(workspaceDirObj), search=False)
        except LookupError as e:
            print(e, file=sys.stderr)
            continue

        if args.exp_args is not None:
            exp_arg_list = args.exp_args
        elif args.add_args:
            exp_arg_list = list(
                set(splitJob.doc.get(args.exp_type, default=[])) | set(args.add_args))
        else:
            exp_arg_list = splitJob.doc.get(args.exp_type, default=[])

        for exp_args in exp_arg_list:
            args.csv_data_dict["Model Args"] = '"{}"'.format(exp_args)
            run_id = "{}@{}".format(exp_args, md5_str)
            job_iter = gcnProject.find_jobs(filter={"run_id": run_id})
            if any(map(lambda job_i: job_i.doc.get("succeeded", False), job_iter)):
                assert len(job_iter) == 1, (args.csv_data_dict, run_id)
                # Parse experiment results
                for job_m in job_iter:
                    args.csv_data_dict["Experiment ID"] = job_m.get_id()
                    args.result_parser(job_m, args)
                    if args.path_only:
                        path = [
                            job.get_id(), featureJob.get_id(), splitJob.get_id(),
                            "/", args.workspaceRoot, job_m.get_id()
                        ]
                        args.csv_data_dict["Job Path"] = json.dumps(path)
                    assert len(args.csv_data_dict) == len(args.csv_header_list)
                    # Write to text buffer
                    textBuffer.write(",".join(map(str, args.csv_data_dict.values())) + "\n")
                    textList.append(list(map(str, args.csv_data_dict.values())))

    if not args.path_only:
        # Write to the result file
        if not args.csv_file_generated:
            print(f"CSV will be saved to {args.output}")
            with open(args.output, "w") as csv_out:
                csv_out.write(",".join(args.csv_header_list) + "\n")
                csv_out.write(textBuffer.getvalue())
            args.csv_file_generated = True
        else:
            with open(args.output, "a") as csv_out:
                csv_out.write(textBuffer.getvalue())
    else:
        # Write to stdout
        csv_writer = csv.writer(sys.stdout)
        if not args.csv_file_generated:
            csv_writer.writerow(args.csv_header_list)
            csv_writer.writerows(textList)
            args.csv_file_generated = True
        else:
            csv_writer.writerows(textList)

def generate_split(job: signac.Project.Job):
    graph = pickle.load(
        open(job.fn(job.sp.graphName + ".graph"), "rb"), encoding="bytes")
    ally = pickle.load(
        open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
    G = nx.read_gpickle(
        job.fn(job.sp.graphName + ".gpickle.gz"))  # type: nx.Graph
    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        if splitJob.sp.get("split_index", None) is None:
            feature_generation.random_state = reset_random_state(
                job, (job.get_id(), feature_graph_name))
        else:
            feature_generation.random_state = reset_random_state(
                job, (splitJob.get_id(), feature_graph_name))
        if all(map(splitJob.isfile, feature_graph_files)):
            print("[generate_split@{}] Skipping {}".format(
                job.get_id(), feature_graph_name))
        else:
            print("[generate_split@{}] Generating split for {}".format(
                job.get_id(), feature_graph_name))
            if featureJob.doc.get("feature_file"):
                feature_file = featureJob.fn(featureJob.doc["feature_file"])
                if splitJob.sp.get("split_source"):
                    if splitJob.sp.source_format == "GeomGCN":
                        with splitJob:
                            with np.load(splitJob.sp.split_source) as splits_file:
                                train_mask = splits_file['train_mask']
                                val_mask = splits_file['val_mask']
                                test_mask = splits_file['test_mask']
                            train_indices = np.where(train_mask)[0]
                            val_indices = np.where(val_mask)[0]
                            test_indices = np.where(test_mask)[0]
                            feature_generation.generate_split(
                                job, graph, ally, G, feature_file, splitJob,
                                feature_graph_name, feature_graph_files,
                                train_indices=train_indices,
                                validation_indices=val_indices,
                                test_indices=test_indices)
                elif job.sp.method == "planetoid":
                    # Need to merge the code which copies the files (elif -> if)
                    raise NotImplementedError()
                else:
                    feature_generation.generate_split(
                        job, graph, ally, G, feature_file, splitJob,
                        feature_graph_name, feature_graph_files)
            elif featureJob.sp.feature_type == "unmodified":
                if job.sp.method == "planetoid":
                    for source_file, dest_file in [
                        (job.fn(f"data_source/{job.sp.datasetName}.{ext}"),
                         splitJob.fn(f"{feature_graph_name}.{ext}"))
                        for ext in ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph', 'test.index']
                    ]:
                        shutil.copy2(source_file, dest_file)
                    assert all(map(splitJob.isfile, feature_graph_files))
                    splitJob.doc["succeeded"] = True
                    splitJob.doc["split_name"] = feature_graph_name
                else:
                    raise NotImplementedError()
            elif featureJob.sp.feature_type == "sample" and featureJob.sp.sample_type == "ogbn":
                feature_generation.ogbn_generate_split(
                    job, splitJob, feature_graph_name, feature_graph_files)
            else:
                raise ValueError()

def generate_graph(job: signac.Project.Job):
    print("Generating graph for job {}".format(job.get_id()))
    graphgen.random_state = reset_random_state(job)
    if job.sp.method == "mixhop":
        generator = graphgen.MixhopGraphGenerator(
            job.sp.classRatio, job.sp.heteroClsWeight,
            heteroWeightsExponent=job.sp.heteroWeightsExponent)
        G = generator(job.sp.numNode, job.sp.m, job.sp.m0, job.sp.h)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)
    elif job.sp.method == "planetoid":
        with job:
            dataset = utils.PlanetoidData(job.sp.datasetName, "data_source")
            G = dataset.getNXGraph()
            generator = graphgen.GraphGenerator(job.sp.numClass)
            generator.save_graph(G, job.workspace(), job.sp.graphName)
            generator.save_y(G, job.workspace(), job.sp.graphName)
            generator.save_nx_graph(G, job.workspace(), job.sp.graphName)
            featureProject = utils.signac_tools.getFeatureProject(job)
            featureJob = featureProject.open_job({
                "feature_type": "unmodified"
            }).init()
            splitProject = utils.signac_tools.getSplitProject(featureJob)
            trainSetSize = dataset.y_all[dataset.train_mask].sum(0)
            if len(np.unique(trainSetSize)) == 1:
                trainSetSize = "{}c".format(int(trainSetSize[0]))
            else:
                trainSetSize = int(dataset.train_mask.sum())
            splitJob = splitProject.open_job({
                "split_config": "{}__{}".format(trainSetSize, int(dataset.test_mask.sum()))
            }).init()
    elif job.sp.method == "GeomGCN":
        with job:
            dataset = utils.GeomGCNData(job.sp.datasetName, "data_source")
            G = dataset.getNXGraph()
            generator = graphgen.GraphGenerator(job.sp.numClass)
            generator.save_graph(G, job.workspace(), job.sp.graphName)
            generator.save_y(G, job.workspace(), job.sp.graphName)
            generator.save_nx_graph(G, job.workspace(), job.sp.graphName)
            featureProject = utils.signac_tools.getFeatureProject(job)
            featureJob = featureProject.open_job({
                "feature_type": "unmodified"
            }).init()
            output_name = f"{job.sp.graphName}-unmodified.allx.npz"
            allx = dataset.features
            allx = scipy.sparse.csr_matrix(allx)
            scipy.sparse.save_npz(featureJob.fn(output_name), allx)
            featureJob.doc["feature_file"] = output_name
            featureJob.doc["feature_name"] = f"{job.sp.datasetName}-unmodified"
            featureJob.doc["succeeded"] = True
    elif job.sp.method == "SparseGraph":
        with job:
            spgraph = sparsegraph.io.load_dataset(
                str(Path("data_source") / job.sp.datasetName))
            for command in job.sp.get("preprocess", []):
                exec(command)
            G = spgraph.getNXGraph()
            generator = graphgen.GraphGenerator(job.sp.numClass)
            generator.save_graph(G, job.workspace(), job.sp.graphName)
            generator.save_y(G, job.workspace(), job.sp.graphName)
            generator.save_nx_graph(G, job.workspace(), job.sp.graphName)
            featureProject = utils.signac_tools.getFeatureProject(job)
            featureJob = featureProject.open_job({
                "feature_type": "unmodified"
            }).init()
            if spgraph.attr_matrix is not None:
                # Generate features
                output_name = f"{job.sp.graphName}-unmodified.allx.npz"
                allx = spgraph.attr_matrix
                allx = scipy.sparse.csr_matrix(allx)
                scipy.sparse.save_npz(featureJob.fn(output_name), allx)
                featureJob.doc["feature_file"] = output_name
                featureJob.doc["feature_name"] = f"{job.sp.datasetName}-unmodified"
                featureJob.doc["succeeded"] = True
    elif job.sp.method == "copy":
        graph_path, ally_path, ty_path, test_index_path = map(
            lambda x: job.fn("source_graph/{}{}".format(job.sp.source_name, x)),
            (".graph", ".ally", ".ty", ".test.index"))
        graph = pickle.load(open(graph_path, "rb"))
        G = nx.from_dict_of_lists(graph)
        ally = np.load(ally_path, allow_pickle=True)
        ty = np.load(ty_path, allow_pickle=True)
        attrs = dict()
        for i in range(ally.shape[0]):
            color = np.nonzero(ally[i, :])[0] + 1
            assert len(color) == 1, print(i, color)
            color = color[0]
            attrs[i] = {"color": color}
        for i, line in enumerate(open(test_index_path, "r")):
            node_id = int(line.strip())
            color = np.nonzero(ty[i, :])[0] + 1
            assert len(color) == 1, print(i, color)
            color = color[0]
            attrs[node_id] = {"color": color}
        assert i == ty.shape[0] - 1
        assert len(attrs) == len(G.node)
        nx.set_node_attributes(G, attrs)
        generator = graphgen.GraphGenerator(job.sp.numClass)
        generator.save_graph(G, job.workspace(), job.sp.graphName)
        generator.save_y(G, job.workspace(), job.sp.graphName)
        generator.save_nx_graph(G, job.workspace(), job.sp.graphName)
    else:
        raise ValueError("Unknown generation method {}".format(job.sp.method))

def generate_feature(job: signac.Project.Job):
    for featureJob in utils.signac_tools.feature_iter(job):
        feature_type = featureJob.sp.feature_type
        if feature_type == "naive":
            type_str = featureJob.sp.var_factor
            output_name = "{}-{}-{}.allx".format(job.sp.graphName, feature_type, type_str)
            if featureJob.isfile(output_name):
                print("[generate_feature@{}] {} already exists. Skipping...".format(
                    job.get_id(), output_name))
                continue
            print("[generate_feature@{}] Generating features to {}".format(
                job.get_id(), output_name))
            ally = pickle.load(
                open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
            if type_str == "all":
                allx = ally
            else:
                raise NotImplementedError()
            np.save(open(featureJob.fn(output_name), "wb"), allx)
            featureJob.doc["feature_file"] = output_name
            featureJob.doc["feature_name"] = f"{feature_type}-{type_str}"
            featureJob.doc["succeeded"] = True
        elif feature_type == "naive_npz":
            type_str = featureJob.sp.var_factor
            output_name = "{}-{}-{}.allx.npz".format(job.sp.graphName, feature_type, type_str)
            if featureJob.isfile(output_name):
                print("[generate_feature@{}] {} already exists. Skipping...".format(
                    job.get_id(), output_name))
                continue
            print("[generate_feature@{}] Generating features to {}".format(
                job.get_id(), output_name))
            ally = pickle.load(
                open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
            if type_str == "all":
                allx = ally
            elif type_str == "identity":
                allx = np.eye(ally.shape[0])
            else:
                raise NotImplementedError()
            allx = scipy.sparse.csr_matrix(allx)
            scipy.sparse.save_npz(featureJob.fn(output_name), allx)
            featureJob.doc["feature_file"] = output_name
            featureJob.doc["feature_name"] = f"{feature_type}-{type_str}"
            featureJob.doc["succeeded"] = True
        elif feature_type == "sample":
            type_str = featureJob.sp.sample_type
            if type_str == "cora_row":
                output_name = "{}-{}-{}.allx.npz".format(job.sp.graphName, feature_type, type_str)
                if featureJob.isfile(output_name):
                    print("[generate_feature@{}] {} already exists. Skipping...".format(
                        job.get_id(), output_name))
                    continue
                ally = pickle.load(
                    open(job.fn(job.sp.graphName + ".ally"), "rb"), encoding="bytes")
                cora = utils.get_cora()
                classSize = np.sum(ally, axis=0)
                if cora.feature_sample_eligible(classSize):
                    print("[generate_feature@{}] Generating features to {} by row-based cora feature sampling".format(
                        job.get_id(), output_name))
                    feature_generation.random_state = reset_random_state(
                        job, (job.get_id(), output_name))
                    allx = feature_generation.row_sample(ally, cora)
                    allx = scipy.sparse.csr_matrix(allx)
                    scipy.sparse.save_npz(featureJob.fn(output_name), allx)
                    featureJob.doc["feature_file"] = output_name
                    featureJob.doc["succeeded"] = True
                else:
                    featureJob.doc["disabled"] = True
                    featureJob.doc["disable_reason"] = f"[generate_feature@{job.get_id()}] {job.sp.graphName} is ineligible for row-based cora feature sampling"
                    print(featureJob.doc["disable_reason"])
            elif type_str in ["ogbn"]:
                if not featureJob.doc["succeeded"]:
                    raise ValueError(
                        f"[generate_feature@{job.get_id()}] {type_str} feature is not marked as succeeded for job {featureJob.get_id()}")
            else:
                raise NotImplementedError()
        elif feature_type == "unmodified":
            if job.sp.method == "planetoid":
                # This block is incompatible with what the current structure shows.
                output_name = f"{job.sp.datasetName}-{feature_type}.allx.npz"
                with job:
                    dataset = utils.PlanetoidData(job.sp.datasetName, "data_source")
                    allx = dataset.features
                    allx = scipy.sparse.csr_matrix(allx)
                    scipy.sparse.save_npz(featureJob.fn(output_name), allx)
                    featureJob.doc["feature_file"] = output_name
                    ### featureJob.doc["feature_name"] = f"{job.sp.datasetName}-{feature_type}"
                    featureJob.doc["succeeded"] = True
            elif job.sp.method == "GeomGCN":
                output_name = f"{job.sp.graphName}-{feature_type}.allx.npz"
                if featureJob.isfile(output_name) and featureJob.doc.get("succeeded", False):
                    print("[generate_feature@{}] {} already exists. Skipping...".format(
                        job.get_id(), output_name))
                    continue
                print("[generate_feature@{}] Write Geom-GCN features to {}".format(
                    job.get_id(), output_name))
                with job:
                    dataset = utils.GeomGCNData(job.sp.datasetName, "data_source")
                    allx = dataset.features
                    allx = scipy.sparse.csr_matrix(allx)
                    scipy.sparse.save_npz(featureJob.fn(output_name), allx)
                    featureJob.doc["feature_file"] = output_name
                    featureJob.doc["feature_name"] = f"{job.sp.datasetName}-unmodified"
                    featureJob.doc["succeeded"] = True
            else:
                raise NotImplementedError(
                    f"{job.sp.graphName}-{job.sp.method}-{feature_type}")
        else:
            raise ValueError("Unknown feature type {}".format(feature_type))

def getModelProject(splitJob: signac.Project.Job, modelRoot: str):
    projectRoot = Path(splitJob.workspace()) / modelRoot
    modelProject = signac.get_project(root=str(projectRoot), search=False)
    return modelProject

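# Hedged usage sketch: getModelProject assumes the model project under modelRoot was
# already initialized (run_model does this); otherwise signac.get_project raises
# LookupError. "gcn" is an assumed modelRoot used only for illustration.
def _list_model_runs(splitJob: signac.Project.Job, modelRoot: str = "gcn"):
    try:
        modelProject = getModelProject(splitJob, modelRoot)
    except LookupError:
        print("No model runs recorded under", modelRoot)
        return
    for model_job in modelProject:
        print(model_job.sp.run_id, model_job.doc.get("succeeded", False))
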
def ogbn_generate_split(job: signac.Project.Job, splitJob: signac.Project.Job,
                        feature_graph_name, feature_graph_files):
    import constraint
    with utils.chdir(splitJob.sp.ogbn_path):
        from ogb.nodeproppred import NodePropPredDataset
        d_name = splitJob.sp.ogbn_name
        lock = ogbnLockDict.setdefault(splitJob.sp.ogbn_path, threading.Lock())
        if not os.path.exists("dataset"):
            # In case dataset is not downloaded
            lock.acquire()
            ogbnDataset = NodePropPredDataset(name=d_name)
            lock.release()
        else:
            ogbnDataset = NodePropPredDataset(name=d_name)
        split_idx = ogbnDataset.get_idx_split()
        train_idx, valid_idx, test_idx = (
            split_idx["train"], split_idx["valid"], split_idx["test"])
        graph, label = ogbnDataset[0]

    with job:
        splitJobSrc = utils.signac_tools.access_proj_job(
            job, splitJob.sp.feature_source, splitJob.sp.split_source)
    splitSrcName = splitJobSrc.doc["split_name"]

    # Copy not changing files
    for source_file, dest_file in [
        (splitJobSrc.fn(f"{splitSrcName}.{ext}"),
         splitJob.fn(f"{feature_graph_name}.{ext}"))
        for ext in ('y', 'ty', 'ally', 'graph', 'test.index')
    ]:
        shutil.copy2(source_file, dest_file)

    with splitJobSrc:
        datasetSrc = utils.PlanetoidData(splitJobSrc.doc.split_name, ".", val_size=None)

    ogbnLabelCount = np.zeros((3, ogbnDataset.num_classes))
    ogbnLabelCount[0, :] = (label[train_idx] == np.arange(ogbnDataset.num_classes)).sum(0)
    ogbnLabelCount[1, :] = (label[valid_idx] == np.arange(ogbnDataset.num_classes)).sum(0)
    ogbnLabelCount[2, :] = (label[test_idx] == np.arange(ogbnDataset.num_classes)).sum(0)

    srcLabelCount = np.zeros((3, job.sp.numClass))
    srcLabelCount[0, :] = datasetSrc.y_all[datasetSrc.train_mask, :].sum(0)
    srcLabelCount[1, :] = datasetSrc.y_all[datasetSrc.val_mask, :].sum(0)
    srcLabelCount[2, :] = datasetSrc.y_all[datasetSrc.test_mask, :].sum(0)

    # Map each source class to a distinct ogbn class that has at least as many
    # train/val/test examples as the source class.
    problem = constraint.Problem()
    problem.addVariables(range(job.sp.numClass), range(ogbnDataset.num_classes))
    problem.addConstraint(constraint.AllDifferentConstraint())
    for i in range(job.sp.numClass):
        problem.addConstraint(
            # Bind i at definition time (default argument) to avoid the late-binding
            # closure pitfall, which would compare every class against the last i.
            lambda x, i=i: np.all(ogbnLabelCount[:, x] >= srcLabelCount[:, i]), (i,))
    solution = problem.getSolution()
    for srcClass, dstClass in solution.items():
        assert np.all(ogbnLabelCount[:, dstClass] >= srcLabelCount[:, srcClass])

    newFeatures = np.zeros((datasetSrc.num_samples, graph["node_feat"].shape[1]))
    for scope, idx in (("train", train_idx), ("val", valid_idx), ("test", test_idx)):
        scope_mask = getattr(datasetSrc, f"{scope}_mask")
        for srcClass, dstClass in solution.items():
            srcOpMask = np.logical_and(scope_mask, datasetSrc.labels == srcClass)
            dstSampleSet = list(set(idx).intersection(np.where(label == dstClass)[0]))
            sampleInds = random_state.choice(dstSampleSet, srcOpMask.sum(), replace=False)
            newFeatures[srcOpMask, :] = graph["node_feat"][sampleInds, :]

    x_mask = datasetSrc.train_mask
    allx_mask = (datasetSrc.train_mask + datasetSrc.val_mask)
    test_mask = datasetSrc.test_mask
    x = newFeatures[x_mask]
    allx = newFeatures[allx_mask]
    tx = newFeatures[test_mask]

    # .x; .tx; .allx
    pickle.dump(scipy.sparse.csr_matrix(x),
                open(splitJob.fn(f"{feature_graph_name}.x"), "wb"))
    pickle.dump(scipy.sparse.csr_matrix(allx),
                open(splitJob.fn(f"{feature_graph_name}.allx"), "wb"))
    pickle.dump(scipy.sparse.csr_matrix(tx),
                open(splitJob.fn(f"{feature_graph_name}.tx"), "wb"))

    assert all(map(splitJob.isfile, feature_graph_files))
    splitJob.doc["succeeded"] = True
    splitJob.doc["split_name"] = feature_graph_name
    splitJob.doc.val_size = splitJobSrc.doc.val_size

def run_model(job: signac.Project.Job):
    logger = logging.getLogger('run_model@{}'.format(job.get_id()))
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    chFormatter = logging.Formatter(
        '[{asctime} {name} {levelname:>8}] {message}', '%m-%d %H:%M:%S', '{')
    ch.setFormatter(chFormatter)
    logger.addHandler(ch)

    for featureJob, splitJob, feature_graph_name, feature_graph_files in feature_split_iter(job):
        exp_regex = get_exp_regex(job)
        if re.search(exp_regex, feature_graph_name) is None:
            print("[run_model@{}] Regex {} not matching; skip on dataset {}".format(
                job.get_id(), exp_regex, feature_graph_name))
            continue
        elif splitJob not in utils.signac_tools.getSplitProject(featureJob).find_jobs(
                task_args.split_filter, task_args.split_doc_filter):
            print("[run_model@{}] Filter {} not matching; skip on dataset {}".format(
                job.get_id(), (task_args.split_filter, task_args.split_doc_filter),
                feature_graph_name))
            continue
        elif is_tuning() and (splitJob.sp.get("split_index", None) not in {None, 0}):
            print("[run_model@{}] Split index is not 0 for tuning; skip on dataset {}".format(
                job.get_id(), feature_graph_name))
            continue

        md5_str = "_".join(map(lambda x: calculate_md5(splitJob.fn(x)), feature_graph_files))
        dataset_dir = splitJob.workspace()
        datasetDirObj = Path(dataset_dir)

        # Workspace path
        workspaceDirObj = datasetDirObj / workspaceRoot
        workspaceDirObj.mkdir(exist_ok=True, parents=True)
        modelProject = signac.init_project(name=expProjectName, root=str(workspaceDirObj))

        fh = logging.FileHandler(str(workspaceDirObj / "terminal_output.log"), "a")
        fh.setLevel(logging.DEBUG)
        fhFormatter = logging.Formatter(
            '[{asctime} {levelname:>8}] {message}', '%m-%d %H:%M:%S', '{')
        fh.setFormatter(fhFormatter)
        logger.addHandler(fh)

        exp_args_list = task_args.model_args or splitJob.doc.get(expCode, default=[])
        if exp_args_list == [] and is_tuning():
            exp_args_list = [""]

        for args in exp_args_list:
            if task_args.arg_regex is not None and re.search(task_args.arg_regex, args) is None:
                print("[run_model@{}] Regex {} not matching; skip on args {}".format(
                    job.get_id(), task_args.arg_regex, args))
                continue
            run_id = "{}@{}".format(args, md5_str)
            if is_tuning():
                run_id += "[tuning]"
                logger.removeHandler(fh)
            if any(map(lambda job_i: job_i.doc.get("succeeded", False),
                       modelProject.find_jobs(filter={"run_id": run_id}))):
                print("[run_model@{}] Already run; skip on dataset {} for parameter {}".format(
                    job.get_id(), feature_graph_name, args))
            else:
                # Construct arguments
                args_split = args.split()
                dataset_args = dataset_args_func(
                    dataset_dir=dataset_dir,
                    feature_graph_name=feature_graph_name,
                    run_id=run_id,
                    workspaceDirObj=workspaceDirObj,
                    task_args=task_args,
                    featureJob=featureJob,
                    args=args,
                    args_split=args_split,
                    splitJob=splitJob
                )
                if dataset_args is None:
                    raise ValueError("dataset_args_func is not properly configured.")
                elif dataset_args is False:
                    print("[run_model@{}] Skip on dataset {} for parameter {}".format(
                        job.get_id(), feature_graph_name, args))
                    continue
                arg_list = [get_python_path(), "-u", modelScript] + dataset_args + args_split

                # Run model code
                print("[run_model@{}] Run on dataset {} for parameter {}".format(
                    job.get_id(), feature_graph_name, args))
                try:
                    logger.info(
                        "===============\n>>>>Executing command {}\n===============".format(arg_list))
                    if not (job.doc.get("exp_terminal", False) or flags.log_to_terminal):
                        ch.setLevel(logging.WARNING)
                        ch.setFormatter(chFormatter)
                    if task_args.interactive:
                        proc = subprocess.Popen(arg_list, cwd=str(modelPathObj))
                    else:
                        proc = subprocess.Popen(
                            arg_list, cwd=str(modelPathObj),
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            encoding='utf-8')
                    if proc.stdout is not None:
                        msgcount = 0
                        for line in iter(proc.stdout.readline, ''):
                            msgcount += 1
                            logger.info(line.strip())
                            if msgcount % 100 == 0:
                                logger.debug("running on dataset {} for parameter {}".format(
                                    feature_graph_name, args))
                                msgcount = 0
                    returncode = proc.wait()
                    if returncode != 0:
                        raise subprocess.CalledProcessError(returncode, arg_list)
                    else:
                        logger.debug("Completed on dataset {} for parameter {}".format(
                            feature_graph_name, args))
                except subprocess.CalledProcessError:
                    logger.error("Check log at {}".format(
                        workspaceDirObj / "terminal_output.log"))
                    raise
                logger.info("===============")
                ch.setLevel(logging.INFO)

                # Tag job as succeeded (except when tuning)
                assert len(modelProject.find_jobs(filter={"run_id": run_id})) == 1
                if not task_args.tuning:
                    for job_m in modelProject.find_jobs(filter={"run_id": run_id}):
                        job_m.doc["succeeded"] = True
                else:
                    print("[run_model@{}] Job will not be tagged successful in tuning mode.".format(
                        job.get_id()))
        logger.removeHandler(fh)