def train_cv(sess, graph, config):
    """Run K-fold cross-validation, then save and plot the per-fold results.

    For each fold the model is fitted on the training split, evaluated on the
    validation and test splits, optionally exported, and a per-fold record is
    collected. After the last fold the mean/std of the test metric is printed
    and the optional result files and plots are written.

    Args:
        sess: session passed to ``CoreModel`` and used for the graph export.
        graph: graph whose ``as_graph_def()`` is exported when
            ``config["export_model"]`` is truthy.
        config: configuration mapping. Keys read here: ``dataset``,
            ``model.py``, ``stratified_kfold``, ``k-fold_num``,
            ``shuffle_data``, ``task``, ``validation_data_rate``,
            ``export_model``, ``plot_path``, ``make_plot`` and the optional
            ``save_info_cv``, ``save_edge_result_cv``, ``save_result_cv``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    # Build the fold splitter.
    stratified = config["stratified_kfold"]
    shuffle = config["shuffle_data"]
    # A fixed seed only has a meaning when shuffling; recent scikit-learn
    # releases raise ValueError if random_state is set while shuffle is off.
    random_state = 123 if shuffle else None
    if stratified:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=shuffle,
                             random_state=random_state)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=shuffle,
                   random_state=random_state)

    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if stratified:
        # The stratified splitter is given the arg-max index of each label row.
        split_base = np.argmax(split_base, axis=1)

    task = config["task"]
    if task == "regression":
        metric_name = "mse"
    elif task == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"
    is_regression = task in ("regression", "regression_gmfe")

    # Model outputs are only needed when the edge-result file will be written.
    edge_result_requested = ("save_edge_result_cv" in config
                             and config["save_edge_result_cv"] is not None)

    if stratified:
        fold_splits = kf.split(split_base, split_base)
    else:
        fold_splits = kf.split(split_base)

    fold_data_list = []
    output_data_list = []
    score_metrics = []
    for kf_count, (train_valid_list, test_list) in enumerate(fold_splits,
                                                             start=1):
        print(f"starting fold: {kf_count}")
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data,
            valid_data_rate=config["validation_data_rate"])

        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print(f"training time: {train_time}[sec]")

        # Evaluation on the validation split
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"infer time: {infer_time}[sec]\n")

        # Evaluation on the test split; these predictions are the ones kept.
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print(f"final cost = {test_cost}\n"
              f"{metric_name} = {test_metrics[metric_name]}\n")
        score_metrics.append(test_metrics[metric_name])
        print(f"infer time: {infer_time}[sec]")

        if config["export_model"]:
            # Best effort: a failed export must not abort the remaining folds.
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print(f"[SAVE] {filename}")
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception as err:
                print('[ERROR] output has been not found')
                print(f"[ERROR] {err!r}")

        if edge_result_requested:
            output_data_list.append(model.output(test_data))

        # Per-fold record
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if is_regression:
            # Both regression tasks store their curves under the *_mse
            # attributes; the metric key read is "training_mse" or
            # "training_gmfe" (resp. "validation_...").
            fold_data.training_mse = [
                el["training_" + metric_name]
                for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_" + metric_name]
                for el in model.validation_metrics_list
            ]
        else:
            fold_data.training_acc = [
                el["training_accuracy"]
                for el in model.training_metrics_list
            ]
            fold_data.validation_acc = [
                el["validation_accuracy"]
                for el in model.validation_metrics_list
            ]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)

    print(f"cv {metric_name}(mean) = {np.mean(score_metrics)}\n"
          f"cv {metric_name}(std.) = {np.std(score_metrics)}\n")

    # Per-fold summary
    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        # os.makedirs("") raises, so only create a directory when the path
        # actually has a directory component.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4,
                          cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)

    # Edge scores
    if edge_result_requested:
        result_cv = []
        for fold_data, output_data in zip(fold_data_list, output_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            # Each label row holds six values; the 2nd and 5th are not used
            # for the score lookup.
            score_list = [
                [pred_score[0, i1, j1], pred_score[0, i2, j2]]
                for i1, _, j1, i2, _, j2 in true_label[0]
            ]
            result_cv.append({
                "output": output_data[0],
                "score": np.array(score_list),
                "test_data_idx": fold_data.test_data_idx,
            })
        save_path = config["save_edge_result_cv"]
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)

    # Per-fold metrics
    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = [
            compute_metrics(config, info, fold_data.prediction_data,
                            fold_data.test_labels)
            for fold_data in fold_data_list
        ]
        save_path = config["save_result_cv"]
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)

    # Plots
    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        # The plot directory is created even when plotting is disabled.
        os.makedirs(result_path, exist_ok=True)
        if not config["make_plot"]:
            continue
        if is_regression:
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               fold_data.training_mse,
                               fold_data.validation_mse,
                               result_path,
                               prefix=prefix)
            pred_score = np.array(fold_data.prediction_data)
            plot_r2(config, fold_data.test_labels, pred_score, prefix=prefix)
        else:
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               fold_data.training_acc,
                               fold_data.validation_acc,
                               result_path,
                               prefix=prefix)
            # Link prediction gets the cost/accuracy curves only.
            if task != "link_prediction":
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config, fold_data.test_labels, pred_score,
                         prefix=prefix)
def infer(sess, graph, config):
    """Evaluate a restored model on a dataset and save the requested outputs.

    Loads the dataset (``config["dataset_test"]`` when present, otherwise
    ``config["dataset"]``), restores the checkpoint named by
    ``config["load_model"]``, runs ``model.pred_and_eval`` on all data, and
    then writes the optional info/result/edge/prediction files and plots.

    Args:
        sess: session passed to ``CoreModel`` and ``restore_ckpt``.
        graph: not used by this function.
        config: configuration mapping. Keys read here: ``dataset``, ``task``,
            ``model.py``, ``load_model``, ``save_info_test``,
            ``save_result_test``, ``make_plot``, ``prediction_data`` and the
            optional ``dataset_test``, ``test_label_list``,
            ``save_edge_result_test``. When ``test_label_list`` is present,
            ``config["label_list"]`` is overwritten with it.

    Returns:
        None.

    Raises:
        ValueError: while building edge scores, if a per-sample prediction is
            neither 2- nor 3-dimensional.
    """
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    if "test_label_list" in config:
        config["label_list"] = config["test_label_list"]
    all_data, info = load_data(config,
                               filename=dataset_filename,
                               prohibit_shuffle=True)

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)

    task = config["task"]
    is_regression = task in ("regression", "regression_gmfe")
    if task == "regression":
        metric_name = "mse"
    elif task == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"

    # Initialize session
    restore_ckpt(sess, config["load_model"])

    # Evaluation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print(f"final cost = {test_cost}\n"
          f"{metric_name} = {test_metrics[metric_name]}\n"
          f"infer time: {infer_time}[sec]\n")

    if config["save_info_test"] is not None:
        result = {
            "test_cost": test_cost,
            "test_accuracy": test_metrics,
            "infer_time": infer_time,
        }
        if task != "link_prediction":
            result["test_metrics"] = compute_metrics(
                config, info, prediction_data, all_data.labels)
        save_path = config["save_info_test"]
        # os.makedirs("") raises, so only create a directory when the path
        # actually has a directory component.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        save_prediction(config["save_result_test"], prediction_data)

    if config["make_plot"]:
        if is_regression:
            plot_r2(config, all_data.labels, np.array(prediction_data))
        elif task != "link_prediction":
            # Nothing is plotted for link prediction.
            plot_auc(config, all_data.labels, np.array(prediction_data))

    if ("save_edge_result_test" in config
            and config["save_edge_result_test"] is not None):
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        print(true_label.shape)
        score_list = []
        for pair in true_label[0]:
            rank = len(prediction_data[0].shape)
            if rank == 2:
                # 2-D prediction: indexed by the 1st/3rd and 4th/6th values.
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
            elif rank == 3:
                # 3-D prediction: the 2nd and 5th values select the leading
                # axis of the per-sample prediction.
                i1, r1, j1, i2, r2, j2 = pair
                s1 = pred_score[0, r1, i1, j1]
                s2 = pred_score[0, r2, i2, j2]
            else:
                # Previously this fell through with s1/s2 unbound.
                raise ValueError(
                    "prediction must be 2- or 3-dimensional per sample, "
                    f"got {rank} dimension(s)")
            score_list.append([s1, s2])
        fold = {
            "output": output_data[0],
            "score": np.array(score_list),
        }
        save_path = config["save_edge_result_test"]
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)

    if config["prediction_data"] is not None:
        obj = {
            "prediction_data": prediction_data,
            "labels": all_data.labels,
        }
        save_dir = os.path.dirname(config["prediction_data"])
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        joblib.dump(obj, config["prediction_data"], compress=True)
def train(sess, graph, config):
    """Train a model, evaluate it on validation data, and save the outputs.

    The validation data is either split off the training dataset (when
    ``config["validation_dataset"]`` is ``None``) or loaded from its own file,
    in which case ``info["graph_node_num"]`` and ``info["graph_num"]`` are
    merged with the validation dataset's values before the model is built.

    Args:
        sess: session passed to ``CoreModel`` and used for the graph export.
        graph: graph whose ``as_graph_def()`` is exported when
            ``config["export_model"]`` is truthy.
        config: configuration mapping. Keys read here: ``dataset``,
            ``validation_dataset``, ``validation_data_rate``, ``model.py``,
            ``task``, ``profile``, ``save_info_valid``, ``export_model``,
            ``save_result_valid``, ``make_plot``.

    Returns:
        None.
    """
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    task = config["task"]
    if task == "regression":
        metric_name = "mse"
    elif task == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"

    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time: {train_time}[sec]")

    # Predictions exist only when there is validation data to evaluate.
    has_validation = valid_data.num > 0
    prediction_data = None
    if has_validation:
        # Validation
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"validation time: {infer_time}[sec]\n")

        # Saving
        if config["save_info_valid"] is not None:
            result = {
                "validation_cost": valid_cost,
                "validation_accuracy": valid_metrics,
                "train_time": train_time,
                "infer_time": infer_time,
            }
            if task != "link_prediction":
                result["valid_metrics"] = compute_metrics(
                    config, info, prediction_data, valid_data.labels)
            save_path = config["save_info_valid"]
            # os.makedirs("") raises, so only create a directory when the
            # path actually has a directory component.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            print(f"[SAVE] {save_path}")
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["export_model"]:
        # Best effort: a failed export must not discard the training results.
        try:
            print(f"[SAVE] {config['export_model']}")
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"],
                                 as_text=False)
        except Exception as err:
            print('[ERROR] output has been not found')
            print(f"[ERROR] {err!r}")

    # Steps that consume predictions are skipped when no validation ran,
    # instead of failing on an unbound prediction_data.
    if has_validation and config["save_result_valid"] is not None:
        save_prediction(config["save_result_valid"], prediction_data)

    if config["make_plot"]:
        if task in ("regression", "regression_gmfe"):
            if has_validation:
                plot_r2(config, valid_data.labels,
                        np.array(prediction_data))
        elif task == "link_prediction":
            plot_cost(config, valid_data, model)
        else:
            plot_cost(config, valid_data, model)
            if has_validation:
                plot_auc(config, valid_data.labels,
                         np.array(prediction_data))