def reconstruct(sess, config):
    """Restore a trained model and dump its predictions for a dataset.

    The dataset is ``config["dataset_test"]`` when that key exists, otherwise
    ``config["dataset"]``. The network is built in inference mode from the
    module named by ``config["model.py"]``, the checkpoint
    ``config["load_model"]`` is restored, and ``pred_and_eval`` is run once
    over all loaded data.

    Args:
        sess: Session handed to ``CoreModel`` and ``tf.train.Saver.restore``.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``dataset_test`` (optional), ``model.py``, ``load_model`` and
            ``reconstruction_test`` (optional output path).

    Returns:
        None. When ``reconstruction_test`` is configured, the predictions are
        written to that path with ``joblib.dump``.
    """
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    # Every sample index is paired with itself.
    info.graph_index_list = [[i, i] for i in range(all_data.num)]
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print("pos_weight=", info.pos_weight)
    # The original printed pos_weight a second time under the "norm=" label.
    print("norm=", info.norm)

    model = CoreModel(sess, config, info,
                      construct_feed_callback=construct_feed)
    # Import the model definition once and hand it straight to build().
    model.build(importlib.import_module(config["model.py"]), is_train=False)
    for v in tf.trainable_variables():
        print(v)

    # Variables come from the checkpoint; no initializer is run.
    saver = tf.train.Saver()
    print("[load]", config["load_model"])
    saver.restore(sess, config["load_model"])

    _, _, recons_data = model.pred_and_eval(all_data)

    if "reconstruction_test" in config:
        filename = config["reconstruction_test"]
        # os.makedirs("") raises, so only create a directory when the path
        # actually has a directory component.
        dirname = os.path.dirname(filename)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(recons_data, filename)
def infer(sess, graph, config):
    """Evaluate a restored model on a dataset and save predictions.

    The dataset is ``config["dataset_test"]`` when that key exists, otherwise
    ``config["dataset"]``. The checkpoint ``config["load_model"]`` is restored
    into ``sess`` before ``pred_and_eval`` is run.

    Args:
        sess: Session handed to ``CoreModel`` and ``tf.train.Saver.restore``.
        graph: Not used by this function; kept for the caller's signature.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``dataset_test`` (optional), ``model.py``, ``load_model``,
            ``save_info_test`` and ``prediction_data``.

    Returns:
        None. Cost/metrics/timing are written as JSON to ``save_info_test``
        when it is not None, and predictions plus labels are dumped with
        joblib to ``prediction_data``.

    Raises:
        SystemExit: When ``config["prediction_data"]`` is None.
    """
    import sys

    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    model = CoreModel(sess, config, info)
    model.build(importlib.import_module(config["model.py"]), is_train=False)

    # Variables come from the checkpoint; no initializer is run.
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy =", test_metrics["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")

    if config["save_info_test"] is not None:
        result = {
            "test_cost": test_cost,
            "test_accuracy": test_metrics,
            "infer_time": infer_time,
        }
        save_path = config["save_info_test"]
        print("[SAVE] ", save_path)
        # The context manager guarantees the JSON is flushed and closed.
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["prediction_data"] is None:
        print("[ERROR] prediction_data is required")
        # quit() only exists when the site module is loaded; sys.exit() ends
        # the process the same way (exit status unchanged).
        sys.exit()

    obj = {
        "prediction_data": prediction_data,
        "labels": all_data.labels,
    }
    # os.makedirs("") raises, so skip it for a bare filename.
    dirname = os.path.dirname(config["prediction_data"])
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    joblib.dump(obj, config["prediction_data"])
def train(sess, graph, config):
    """Train a model, evaluate it on validation data and save the results.

    When ``config["validation_dataset"]`` is None the training file is split
    by ``load_and_split_data``; otherwise training and validation files are
    loaded separately and ``info`` is widened to cover both.

    Args:
        sess: Session handed to ``CoreModel`` and used for graph export.
        graph: Graph whose definition is frozen when ``export_model`` is set.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``validation_dataset``, ``validation_data_rate``, ``model.py``,
            ``profile``, ``save_info_valid``, ``export_model``,
            ``save_result_valid`` and ``make_plot``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    model.build(importlib.import_module(config["model.py"]))

    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
        # Constructed for its side effect of logging the graph under 'logs'.
        writer = tf.summary.FileWriter('logs', sess.graph)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("training time:{0}".format(train_time) + "[sec]")

    # Stays None when there is no validation data, so the later steps can
    # skip cleanly instead of failing on an unbound name.
    prediction_data = None
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        validation_cost, validation_metrics, prediction_data = \
            model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", validation_cost)
        print("accuracy =", validation_metrics["accuracy"])
        print("validation time:{0}".format(infer_time) + "[sec]")
        # Saving
        if config["save_info_valid"] is not None:
            result = {
                "validation_cost": validation_cost,
                "validation_accuracy": validation_metrics,
                "train_time": train_time,
                "infer_time": infer_time,
            }
            save_path = config["save_info_valid"]
            print("[SAVE] ", save_path)
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["export_model"]:
        try:
            print("[SAVE]", config["export_model"])
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"],
                                 as_text=False)
        except Exception as err:
            # Export stays best-effort, but KeyboardInterrupt/SystemExit are
            # no longer swallowed and the real cause is reported.
            print('[ERROR] output has been not found:', err)

    if config["save_result_valid"] is not None:
        if prediction_data is None:
            print("[WARN] no validation data: prediction result is not saved")
        else:
            save_prediction(config["save_result_valid"], prediction_data)

    if config["make_plot"]:
        plot_cost(config, valid_data, model)
        if prediction_data is None:
            print("[WARN] no validation data: AUC plot is skipped")
        else:
            plot_auc(config, valid_data.labels, np.array(prediction_data))
def reconstruct(sess, config):
    """Restore a trained model and dump its predictions for a dataset.

    The dataset is ``config["dataset_test"]`` when that key exists, otherwise
    ``config["dataset"]``. The model is set up through ``load_model_py`` in
    inference mode and its weights come from ``restore_ckpt``.

    Args:
        sess: Session handed to ``CoreModel`` and ``restore_ckpt``.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``dataset_test`` (optional), ``model.py``, ``load_model`` and
            ``reconstruction_test`` (optional output path).

    Returns:
        None. When ``reconstruction_test`` is configured, the predictions are
        written to that path with ``joblib.dump``.
    """
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    # Every sample index is paired with itself.
    info.graph_index_list = [[i, i] for i in range(all_data.num)]
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print(f"pos_weight={info.pos_weight}")
    print(f"norm={info.norm}")

    model = CoreModel(sess, config, info,
                      construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"], is_train=False)
    for v in tf.trainable_variables():
        print(v)

    # initialize session
    restore_ckpt(sess, config["load_model"])

    _, _, recons_data = model.pred_and_eval(all_data)

    if "reconstruction_test" in config:
        filename = config["reconstruction_test"]
        # os.makedirs("") raises, so only create a directory when the path
        # has one (same guard as generate()).
        dirname = os.path.dirname(filename)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(recons_data, filename)
def generate(sess, config):
    """Restore a trained model and dump the data it produces for a dataset.

    The dataset is ``config["dataset_test"]`` when that key exists, otherwise
    ``config["dataset"]``. The model is set up through ``load_model_py`` in
    inference mode and its weights come from ``restore_ckpt``.

    Args:
        sess: Session handed to ``CoreModel`` and ``restore_ckpt``.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``dataset_test`` (optional), ``model.py``, ``load_model`` and
            ``generation_test`` (optional output path).

    Returns:
        None. When ``generation_test`` is configured, the predictions are
        written to that path with ``joblib.dump``.
    """
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    # Every sample index is paired with itself.
    info.graph_index_list = [[i, i] for i in range(all_data.num)]
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print("pos_weight=", info.pos_weight)
    # The original printed pos_weight a second time under the "norm=" label.
    print("norm=", info.norm)

    model = CoreModel(sess, config, info,
                      construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"], is_train=False)

    # initialize session
    # The extra tf.train.Saver() that used to be created here was never used;
    # restoring is delegated to restore_ckpt, as in the sibling functions.
    restore_ckpt(sess, config["load_model"])

    _, _, generated_data = model.pred_and_eval(all_data)

    if "generation_test" in config:
        filename = config["generation_test"]
        dirname = os.path.dirname(filename)
        if dirname != "":
            os.makedirs(dirname, exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(generated_data, filename)
def _train(layers):
    """Fit a ``GCN(layers)`` model and return its validation accuracy.

    The configuration comes from ``_get_config()``; the dataset it names is
    split into training and validation parts, a fresh session is opened for
    the run, and the ``"accuracy"`` entry of the validation metrics is
    returned.
    """
    cfg = _get_config()
    _unused, fit_set, eval_set, data_info = load_and_split_data(
        cfg,
        filename=cfg["dataset"],
        valid_data_rate=cfg["validation_data_rate"],
    )

    with tf.Session() as session:
        net = CoreModel(session, cfg, data_info)
        net.build(GCN(layers), True, False, None)
        net.fit(fit_set, eval_set)
        _cost, eval_metrics, _preds = net.pred_and_eval(eval_set)

    target_metric = "accuracy"
    return eval_metrics[target_metric]
def load_model(model_name, config, class_num=None):
    """Load a trained expansion or rollout model.

    For the ``'ECFP'`` descriptor the model file is loaded with
    ``models.load_model``. For the ``'GCN'`` descriptor a dedicated graph and
    session are created, a ``CoreModel`` is built from the GCN config and its
    weights are restored from the checkpoint.

    Args:
        model_name (str): ``'expansion'`` or ``'rollout'``.
        config (dict): Settings. Keys read here: ``descriptor``,
            ``expansion_model``, ``rollout_model`` and, for GCN,
            ``gcn_expansion_config``, ``gcn_rollout_config`` and
            ``max_atom_num``.
        class_num (int): Passed through to
            ``MoleculeUtils.generate_gcn_descriptor`` (GCN only).

    Returns:
        Loaded model instance, or None when the descriptor is neither
        ``'ECFP'`` nor ``'GCN'``, or when ``model_name`` is unknown for
        ``'ECFP'``.

    Raises:
        ValueError: If ``model_name`` is unknown for the ``'GCN'`` descriptor.
    """
    model = None
    if config['descriptor'] == 'ECFP':
        if model_name == 'expansion':
            model = models.load_model(config['expansion_model'])
        elif model_name == 'rollout':
            model = models.load_model(config['rollout_model'])
    if config['descriptor'] == 'GCN':
        # (GCN config key, checkpoint path key) for each supported model.
        gcn_keys = {
            'expansion': ('gcn_expansion_config', 'expansion_model'),
            'rollout': ('gcn_rollout_config', 'rollout_model'),
        }
        # Validate before allocating a graph/session. The original fell
        # through to an unbound ``gcn_config`` with the session already open.
        if model_name not in gcn_keys:
            raise ValueError(
                "unsupported model_name for GCN descriptor: "
                "{!r} (expected 'expansion' or 'rollout')".format(model_name))
        config_key, model_key = gcn_keys[model_name]

        graph = tf.Graph()
        sess = tf.Session(graph=graph)
        with graph.as_default():
            from utils import MoleculeUtils
            # dummy setting: a fixed molecule is used only to derive ``info``
            mol = Chem.MolFromSmiles('C1=CC=CC=C1')
            gcn_config = get_config(config[config_key])
            input_data = MoleculeUtils.generate_gcn_descriptor(
                mol, config['max_atom_num'], class_num)
            trained_model_path = config[model_key]
            _, info = build_data(gcn_config, input_data, verbose=False)
            model = CoreModel(sess, gcn_config, info)
            load_model_py(model, gcn_config["model.py"], is_train=False)
            # Initialize session
            saver = tf.train.Saver()
            saver.restore(sess, trained_model_path)
    return model
def visualize(sess, config, args):
    """Run the integrated-gradients visualization on a restored model.

    Loads the test dataset (``dataset_test`` when configured, else
    ``dataset``) without shuffling, sets the model up with a batch size of
    one, restores ``config['load_model']`` and dispatches to the graph or
    knowledge-graph routine depending on ``config['visualize_type']``.
    """
    from kgcn.visualization import cal_feature_IG, cal_feature_IG_for_kg

    default_path = config["dataset"]
    data_path = (config["dataset_test"]
                 if "dataset_test" in config else default_path)
    all_data, info = load_data(config, filename=data_path,
                               prohibit_shuffle=True)

    net = CoreModel(sess, config, info)
    # input a molecule at a time
    load_model_py(net, config["model.py"], is_train=False,
                  feed_embedded_layer=True, batch_size=1)
    feed_slots = net.placeholders
    restore_ckpt(sess, config['load_model'])

    # calculate integrated gradients
    if config['visualize_type'] != 'graph':
        cal_feature_IG_for_kg(sess, all_data, feed_slots, info, config,
                              net.prediction,
                              logger=tf.logging, model=net.nn, args=args)
        return
    cal_feature_IG(sess, all_data, feed_slots, info, config,
                   net.prediction,
                   args.ig_modal_target, args.ig_label_target,
                   logger=tf.logging, model=net.nn, args=args)
def infer(sess, graph, config):
    """Evaluate a restored model on the test dataset and save the results.

    Args:
        sess: Session handed to ``CoreModel`` and ``restore_ckpt``.
        graph: Not used by this function; kept for the caller's signature.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``dataset_test`` / ``test_label_list`` (optional), ``model.py``,
            ``task``, ``load_model``, ``save_info_test``,
            ``save_result_test``, ``make_plot``, ``save_edge_result_test``
            (optional) and ``prediction_data``. ``config["label_list"]`` is
            overwritten in place when ``test_label_list`` is present.

    Returns:
        None. Results are printed and written to the configured paths.

    Raises:
        ValueError: When edge results are requested and the first prediction
            array is neither 2- nor 3-dimensional.
    """
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    if "test_label_list" in config:
        # Mutates the caller's config; presumably read back by load_data.
        config["label_list"] = config["test_label_list"]
    all_data, info = load_data(config, filename=dataset_filename,
                               prohibit_shuffle=True)
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)
    metric_name = ("mse" if config["task"] == "regression" else
                   "gmfe" if config["task"] == "regression_gmfe" else
                   "accuracy")

    # Initialize session
    restore_ckpt(sess, config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print(f"final cost = {test_cost}\n"
          f"{metric_name} = {test_metrics[metric_name]}\n"
          f"infer time: {infer_time}[sec]\n")

    if config["save_info_test"] is not None:
        result = {
            "test_cost": test_cost,
            "test_accuracy": test_metrics,
            "infer_time": infer_time,
        }
        if config["task"] != "link_prediction":
            result["test_metrics"] = compute_metrics(
                config, info, prediction_data, all_data.labels)
        save_path = config["save_info_test"]
        # os.makedirs("") raises, so skip it for a bare filename.
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        save_prediction(config["save_result_test"], prediction_data)

    if config["make_plot"]:
        task = config["task"]
        if task in ("regression", "regression_gmfe"):
            plot_r2(config, all_data.labels, np.array(prediction_data))
        elif task != "link_prediction":
            plot_auc(config, all_data.labels, np.array(prediction_data))

    if ("save_edge_result_test" in config
            and config["save_edge_result_test"] is not None):
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        print(true_label.shape)
        # The rank is the same for every pair, so compute it once.
        pred_rank = len(prediction_data[0].shape)
        score_list = []
        for pair in true_label[0]:
            if pred_rank == 2:
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
            elif pred_rank == 3:
                i1, r1, j1, i2, r2, j2 = pair
                s1 = pred_score[0, r1, i1, j1]
                s2 = pred_score[0, r2, i2, j2]
            else:
                # Previously this fell through to an unbound s1/s2.
                raise ValueError(
                    f"unsupported prediction rank {pred_rank}: "
                    "expected 2 or 3")
            score_list.append([s1, s2])
        fold = {
            "output": output_data[0],
            "score": np.array(score_list),
        }
        save_path = config["save_edge_result_test"]
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)

    if config["prediction_data"] is not None:
        obj = {
            "prediction_data": prediction_data,
            "labels": all_data.labels,
        }
        dirname = os.path.dirname(config["prediction_data"])
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        joblib.dump(obj, config["prediction_data"], compress=True)
def train(sess, graph, config):
    """Train a model, evaluate it on validation data and save the results.

    When ``config["validation_dataset"]`` is None the training file is split
    by ``load_and_split_data``; otherwise training and validation files are
    loaded separately and ``info`` is widened to cover both. After training,
    per-label metrics are computed from the validation predictions.

    Args:
        sess: Session handed to ``CoreModel`` and used for graph export.
        graph: Graph whose definition is frozen when ``export_model`` is set.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``validation_dataset``, ``validation_data_rate``, ``model.py``,
            ``task``, ``profile``, ``save_info_valid``, ``export_model``,
            ``save_result_valid`` and ``make_plot``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    # r2_score / mean_squared_error are imported by name: a
    # ``from sklearn.metrics import ...`` does not bind the name ``sklearn``,
    # which the regression branch used to rely on.
    from sklearn.metrics import (accuracy_score, auc, mean_squared_error,
                                 precision_recall_fscore_support, r2_score,
                                 roc_curve)

    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    # Same metric selection as the cross-validation path.
    task = config["task"]
    metric_name = ("mse" if task == "regression" else
                   "gmfe" if task == "regression_gmfe" else "accuracy")

    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
        # Constructed for its side effect of logging the graph under 'logs'.
        writer = tf.summary.FileWriter('logs', sess.graph)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("training time:{0}".format(train_time) + "[sec]")

    # Stays None when there is no validation data, so the later steps can
    # skip cleanly instead of failing on an unbound name.
    prediction_data = None
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        validation_cost, validation_metrics, prediction_data = \
            model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", validation_cost)
        print(metric_name, "=", validation_metrics[metric_name])
        print("validation time:{0}".format(infer_time) + "[sec]")
        # Saving
        if config["save_info_valid"] is not None:
            result = {
                "validation_cost": validation_cost,
                "validation_accuracy": validation_metrics,
                "train_time": train_time,
                "infer_time": infer_time,
            }
            pred_score = np.array(prediction_data)
            if len(pred_score.shape) == 3:
                # multi-label-multi-task: #data x #task x #class
                # => this program supports only 2 labels
                pred_score = pred_score[:, :, 1]
            true_label = np.array(valid_data.labels)
            # Promote 1-D arrays to one column so [:, i] indexing works.
            if len(pred_score.shape) == 1:
                pred_score = pred_score[:, np.newaxis]
            if len(true_label.shape) == 1:
                true_label = true_label[:, np.newaxis]

            is_classification = task not in ("regression", "regression_gmfe")
            if is_classification:
                # Thresholding does not depend on the label index.
                pred = np.zeros(pred_score.shape)
                pred[pred_score > 0.5] = 1

            per_label = []
            for i in range(info.label_dim):
                el = {}
                if task == "regression":
                    el["r2"] = r2_score(true_label[:, i], pred_score[:, i])
                    el["mse"] = mean_squared_error(true_label[:, i],
                                                   pred_score[:, i])
                elif task == "regression_gmfe":
                    el["gmfe"] = np.exp(
                        np.mean(np.log(true_label[:, i] / pred_score[:, i])))
                else:
                    fpr, tpr, _ = roc_curve(true_label[:, i],
                                            pred_score[:, i],
                                            pos_label=1)
                    scores = precision_recall_fscore_support(
                        true_label[:, i], pred[:, i], average='binary')
                    el["auc"] = auc(fpr, tpr)
                    el["acc"] = accuracy_score(true_label[:, i], pred[:, i])
                    el["pre"] = scores[0]
                    el["rec"] = scores[1]
                    el["f"] = scores[2]
                    el["sup"] = scores[3]
                per_label.append(el)
            # NOTE(review): only the LAST label's metrics are stored, as in
            # the original (the per-label list was built but never saved).
            # Kept to preserve the saved JSON layout; confirm whether the
            # full list was intended.
            if per_label:
                result["valid_metrics"] = per_label[-1]

            save_path = config["save_info_valid"]
            # os.makedirs("") raises, so skip it for a bare filename.
            dirname = os.path.dirname(save_path)
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            print("[SAVE] ", save_path)
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["export_model"]:
        try:
            print("[SAVE]", config["export_model"])
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"],
                                 as_text=False)
        except Exception as err:
            # Export stays best-effort, but KeyboardInterrupt/SystemExit are
            # no longer swallowed and the real cause is reported.
            print('[ERROR] output has been not found:', err)

    if config["save_result_valid"] is not None:
        if prediction_data is None:
            print("[WARN] no validation data: prediction result is not saved")
        else:
            save_prediction(config["save_result_valid"], prediction_data)

    if config["make_plot"]:
        plot_cost(config, valid_data, model)
        if prediction_data is None:
            print("[WARN] no validation data: AUC plot is skipped")
        else:
            plot_auc(config, valid_data.labels, np.array(prediction_data))
def train(sess, graph, config):
    """Train a model, evaluate it on validation data and save the results.

    When ``config["validation_dataset"]`` is None the training file is split
    by ``load_and_split_data``; otherwise training and validation files are
    loaded separately and ``info`` is widened to cover both.

    Args:
        sess: Session handed to ``CoreModel`` and used for graph export.
        graph: Graph whose definition is frozen when ``export_model`` is set.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``validation_dataset``, ``validation_data_rate``, ``model.py``,
            ``task``, ``profile``, ``save_info_valid``, ``export_model``,
            ``save_result_valid`` and ``make_plot``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    metric_name = ("mse" if config["task"] == "regression" else
                   "gmfe" if config["task"] == "regression_gmfe" else
                   "accuracy")

    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time: {train_time}[sec]")

    # Stays None when there is no validation data, so the later steps can
    # skip cleanly instead of failing on an unbound name.
    prediction_data = None
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"validation time: {infer_time}[sec]\n")
        # Saving
        if config["save_info_valid"] is not None:
            result = {
                "validation_cost": valid_cost,
                "validation_accuracy": valid_metrics,
                "train_time": train_time,
                "infer_time": infer_time,
            }
            if config["task"] != "link_prediction":
                result["valid_metrics"] = compute_metrics(
                    config, info, prediction_data, valid_data.labels)
            save_path = config["save_info_valid"]
            # os.makedirs("") raises, so skip it for a bare filename.
            dirname = os.path.dirname(save_path)
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            print(f"[SAVE] {save_path}")
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["export_model"]:
        try:
            print(f"[SAVE] {config['export_model']}")
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"],
                                 as_text=False)
        except Exception as err:
            # Export stays best-effort, but KeyboardInterrupt/SystemExit are
            # no longer swallowed and the real cause is reported.
            print(f'[ERROR] output has been not found: {err}')

    if config["save_result_valid"] is not None:
        if prediction_data is None:
            print("[WARN] no validation data: prediction result is not saved")
        else:
            save_prediction(config["save_result_valid"], prediction_data)

    if config["make_plot"]:
        task = config["task"]
        has_predictions = prediction_data is not None
        if task == "regression" or task == "regression_gmfe":
            if has_predictions:
                plot_r2(config, valid_data.labels, np.array(prediction_data))
        elif task == "link_prediction":
            plot_cost(config, valid_data, model)
        else:
            plot_cost(config, valid_data, model)
            if has_predictions:
                plot_auc(config, valid_data.labels,
                         np.array(prediction_data))
        if not has_predictions and task != "link_prediction":
            print("[WARN] no validation data: prediction plots are skipped")
def train_cv(sess, graph, config):
    """Run k-fold cross-validation and save per-fold results and plots.

    The whole dataset is loaded unshuffled and split by ``KFold`` or
    ``StratifiedKFold``. Each fold's non-test part is split again into
    training and validation data, the model is fitted, and it is evaluated
    on both the validation and the test part.

    Args:
        sess: Session handed to ``CoreModel`` and used for graph export.
        graph: Graph whose definition is frozen when ``export_model`` is set.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``model.py``, ``stratified_kfold``, ``k-fold_num``,
            ``shuffle_data``, ``task``, ``validation_data_rate``,
            ``export_model``, ``plot_path``, ``make_plot`` and the optional
            output paths ``save_info_cv``, ``save_edge_result_cv`` and
            ``save_result_cv``.

    Returns:
        None. Results are printed and written to the configured paths.
    """
    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    stratified = config["stratified_kfold"]
    shuffle = config["shuffle_data"]
    task = config["task"]
    # Recent scikit-learn raises ValueError when random_state is set while
    # shuffle is False, so the seed is only passed when it has an effect.
    seed = 123 if shuffle else None
    if stratified:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=shuffle,
                             random_state=seed)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=shuffle,
                   random_state=seed)

    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if stratified:
        split_base = np.argmax(split_base, axis=1)

    if task == "regression":
        metric_name = "mse"
    elif task == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"

    # Collect model outputs only when they will actually be saved.
    save_edge_result = ("save_edge_result_cv" in config
                        and config["save_edge_result_cv"] is not None)

    fold_data_list = []
    output_data_list = []
    score_metrics = []
    split_data_generator = (kf.split(split_base, split_base)
                            if stratified else kf.split(split_base))
    for kf_count, (train_valid_list, test_list) in enumerate(
            split_data_generator, start=1):
        print(f"starting fold: {kf_count}")
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])

        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print(f"training time: {train_time}[sec]")

        # Test
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, _ = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"infer time: {infer_time}[sec]\n")

        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print(f"final cost = {test_cost}\n"
              f"{metric_name} = {test_metrics[metric_name]}\n")
        score_metrics.append(test_metrics[metric_name])
        print(f"infer time: {infer_time}[sec]")

        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print(f"[SAVE] {filename}")
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception as err:
                # Export stays best-effort, but KeyboardInterrupt/SystemExit
                # are no longer swallowed and the real cause is reported.
                print(f'[ERROR] output has been not found: {err}')

        if save_edge_result:
            output_data_list.append(model.output(test_data))

        # save fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if task == "regression":
            fold_data.training_mse = [
                el["training_mse"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_mse"] for el in model.validation_metrics_list
            ]
        elif task == "regression_gmfe":
            # Stored under the *_mse attributes so plotting can treat both
            # regression tasks alike.
            fold_data.training_mse = [
                el["training_gmfe"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_gmfe"] for el in model.validation_metrics_list
            ]
        else:
            fold_data.training_acc = [
                el["training_accuracy"] for el in model.training_metrics_list
            ]
            fold_data.validation_acc = [
                el["validation_accuracy"]
                for el in model.validation_metrics_list
            ]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)

    print(f"cv {metric_name}(mean) = {np.mean(score_metrics)}\n"
          f"cv {metric_name}(std.) = {np.std(score_metrics)}\n")

    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        # os.makedirs("") raises, so skip it for a bare filename.
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4,
                          cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)

    if save_edge_result:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            score_list = [
                [pred_score[0, i1, j1], pred_score[0, i2, j2]]
                for i1, _, j1, i2, _, j2 in true_label[0]
            ]
            result_cv.append({
                "output": output_data_list[j][0],
                "score": np.array(score_list),
                "test_data_idx": fold_data.test_data_idx,
            })
        save_path = config["save_edge_result_cv"]
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)

    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = [
            compute_metrics(config, info, fold_data.prediction_data,
                            fold_data.test_labels)
            for fold_data in fold_data_list
        ]
        save_path = config["save_result_cv"]
        # Create the target directory like the other save paths do.
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)

    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        # The plot directory is created even when plotting is disabled,
        # as in the original.
        os.makedirs(result_path, exist_ok=True)
        if not config["make_plot"]:
            continue
        if task in ("regression", "regression_gmfe"):
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               fold_data.training_mse,
                               fold_data.validation_mse,
                               result_path,
                               prefix=prefix)
            plot_r2(config, fold_data.test_labels,
                    np.array(fold_data.prediction_data), prefix=prefix)
        else:
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               fold_data.training_acc,
                               fold_data.validation_acc,
                               result_path,
                               prefix=prefix)
            if task != "link_prediction":
                plot_auc(config, fold_data.test_labels,
                         np.array(fold_data.prediction_data), prefix=prefix)
def infer(sess, graph, config):
    """Evaluate a restored model on the test dataset and save the results.

    Args:
        sess: Session handed to ``CoreModel`` and ``tf.train.Saver.restore``.
        graph: Not used by this function; kept for the caller's signature.
        config: Mapping of run settings. Keys read here: ``dataset``,
            ``dataset_test`` (optional), ``model.py``, ``load_model``,
            ``save_info_test``, ``save_result_test``, ``make_plot`` and
            ``save_edge_result_test`` (optional).

    Returns:
        None. Results are printed and written to the configured paths.
    """
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)

    # Variables come from the checkpoint; no initializer is run.
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy =", test_metrics["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")

    if config["save_info_test"] is not None:
        result = {
            "test_cost": test_cost,
            "test_accuracy": test_metrics,
            "infer_time": infer_time,
        }
        save_path = config["save_info_test"]
        # os.makedirs("") raises, so skip it for a bare filename.
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print("[SAVE] ", save_path)
        # The context manager guarantees the JSON is flushed and closed.
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        save_prediction(config["save_result_test"], prediction_data)

    if config["make_plot"]:
        plot_auc(config, all_data.labels, np.array(prediction_data))

    if ("save_edge_result_test" in config
            and config["save_edge_result_test"] is not None):
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        print(true_label.shape)
        # Each label row holds two (i, _, j) triples; the middle entry of
        # each triple is not used for indexing here.
        score_list = [
            [pred_score[0, i1, j1], pred_score[0, i2, j2]]
            for i1, _, j1, i2, _, j2 in true_label[0]
        ]
        fold = {
            "output": output_data[0],
            "score": np.array(score_list),
        }
        save_path = config["save_edge_result_test"]
        dirname = os.path.dirname(save_path)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
def train_cv(sess, graph, config):
    """Run k-fold cross-validation: train, evaluate and report every fold.

    The dataset ``config["dataset"]`` is loaded without shuffling (the
    splitter does the shuffling), split with ``KFold`` or
    ``StratifiedKFold``, and for each fold the model is fitted on the
    train/validation part and evaluated on both the validation and the test
    part.  Afterwards the mean and standard deviation of the per-fold test
    metric are printed and, depending on the configuration, fold summaries,
    per-edge scores, per-label metrics and plots are written.

    Args:
        sess: TensorFlow session handed to ``CoreModel`` and used when
            exporting the frozen graph.
        graph: TensorFlow graph whose ``as_graph_def()`` is exported when
            ``config["export_model"]`` is set.
        config: Mapping of settings.  Keys read here: ``dataset``,
            ``model.py``, ``stratified_kfold``, ``k-fold_num``,
            ``shuffle_data``, ``task``, ``validation_data_rate``,
            ``export_model``, ``plot_path``, ``make_plot`` and the optional
            ``save_info_cv``, ``save_edge_result_cv``, ``save_result_cv``.
    """
    from sklearn.model_selection import KFold, StratifiedKFold
    from sklearn.metrics import (accuracy_score, auc, mean_squared_error,
                                 precision_recall_fscore_support, r2_score,
                                 roc_curve)
    from kgcn.make_plots import make_cost_acc_plot

    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    # Recent scikit-learn rejects a random_state when shuffle is disabled,
    # so the fixed seed is only passed when it has an effect.
    shuffle = config["shuffle_data"]
    random_state = 123 if shuffle else None
    if config["stratified_kfold"]:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"], shuffle=shuffle,
                             random_state=random_state)
    else:
        kf = KFold(n_splits=config["k-fold_num"], shuffle=shuffle,
                   random_state=random_state)

    kf_count = 1
    fold_data_list = []
    output_data_list = []
    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if config["stratified_kfold"]:
        # Stratification needs one class index per sample.
        split_base = np.argmax(split_base, axis=1)

    score_metrics = []
    if config["task"] == "regression":
        metric_name = "mse"
    elif config["task"] == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"

    if config["stratified_kfold"]:
        split_data_generator = kf.split(split_base, split_base)
    else:
        split_data_generator = kf.split(split_base)

    for train_valid_list, test_list in split_data_generator:
        print("starting fold:{0}".format(kf_count))
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)
        train_data, valid_data = split_data(
            train_valid_data,
            valid_data_rate=config["validation_data_rate"])

        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print("traing time:{0}".format(train_time) + "[sec]")

        # Test
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, _valid_prediction = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print("final cost =", valid_cost)
        print("%s =%f" % (metric_name, valid_metrics[metric_name]))
        print("infer time:{0}".format(infer_time) + "[sec]")

        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print("final cost =", test_cost)
        print("%s =%f" % (metric_name, test_metrics[metric_name]))
        score_metrics.append(test_metrics[metric_name])
        print("infer time:{0}".format(infer_time) + "[sec]")

        if config["export_model"]:
            # Export is best effort: a failure is reported and the
            # cross-validation continues.  Only ordinary exceptions are
            # caught so that Ctrl-C / SystemExit still propagate.
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print("[SAVE]", filename)
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception:
                print('[ERROR] output has been not found')

        if "save_edge_result_cv" in config:
            output_data = model.output(test_data)
            output_data_list.append(output_data)

        # save fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if config["task"] == "regression":
            fold_data.training_mse = [
                el["training_mse"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_mse"] for el in model.validation_metrics_list
            ]
        elif config["task"] == "regression_gmfe":
            # GMFE curves are stored under the *_mse names, which is what
            # the plotting code below reads for both regression tasks.
            fold_data.training_mse = [
                el["training_gmfe"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_gmfe"] for el in model.validation_metrics_list
            ]
        else:
            fold_data.training_acc = [
                el["training_accuracy"] for el in model.training_metrics_list
            ]
            fold_data.validation_acc = [
                el["validation_accuracy"]
                for el in model.validation_metrics_list
            ]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)
        kf_count += 1

    print("cv %s(mean) =%f" % (metric_name, np.mean(score_metrics)))
    print("cv %s(std.) =%f" % (metric_name, np.std(score_metrics)))

    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        # `or "."` keeps makedirs from failing on a bare filename, whose
        # dirname is the empty string.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4,
                          cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)

    if ("save_edge_result_cv" in config
            and config["save_edge_result_cv"] is not None):
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            # Each row of true_label[0] unpacks into six values; the 1st/3rd
            # and 4th/6th index the two scores looked up in pred_score[0].
            score_list = [
                [pred_score[0, i1, j1], pred_score[0, i2, j2]]
                for i1, _, j1, i2, _, j2 in true_label[0]
            ]
            result_cv.append({
                "output": output_data_list[j][0],
                "score": np.array(score_list),
                "test_data_idx": fold_data.test_data_idx,
            })
        save_path = config["save_edge_result_cv"]
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)

    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = []
        for fold_data in fold_data_list:
            pred_score = np.array(fold_data.prediction_data)
            if len(pred_score.shape) == 3:  # multi-label-multi-task
                # #data x # task x #class
                # => this program supports only 2 labels
                pred_score = pred_score[:, :, 1]
            true_label = np.array(fold_data.test_labels)
            # Promote 1-D arrays to a single column so every label can be
            # addressed as [:, i].
            if len(pred_score.shape) == 1:
                pred_score = pred_score[:, np.newaxis]
            if len(true_label.shape) == 1:
                true_label = true_label[:, np.newaxis]
            v = []
            for i in range(info.label_dim):
                y_true = true_label[:, i]
                y_score = pred_score[:, i]
                el = {}
                if config["task"] == "regression":
                    el["r2"] = r2_score(y_true, y_score)
                    el["mse"] = mean_squared_error(y_true, y_score)
                elif config["task"] == "regression_gmfe":
                    el["gmfe"] = np.exp(np.mean(np.log(y_true / y_score)))
                else:
                    # Scores above 0.5 count as the positive class.
                    y_pred = (y_score > 0.5).astype(float)
                    fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=1)
                    scores = precision_recall_fscore_support(
                        y_true, y_pred, average='binary')
                    el["auc"] = auc(fpr, tpr)
                    el["acc"] = accuracy_score(y_true, y_pred)
                    el["pre"] = scores[0]
                    el["rec"] = scores[1]
                    el["f"] = scores[2]
                    el["sup"] = scores[3]
                v.append(el)
            result_cv.append(v)
        save_path = config["save_result_cv"]
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)

    # The plot directory is created even when plotting is disabled.
    result_path = config["plot_path"]
    os.makedirs(result_path, exist_ok=True)
    for i, fold_data in enumerate(fold_data_list):
        if not config["make_plot"]:
            continue
        prefix = "fold" + str(i) + "_"
        if config["task"] in ("regression", "regression_gmfe"):
            # plot cost
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               fold_data.training_mse,
                               fold_data.validation_mse,
                               result_path + prefix)
            pred_score = np.array(fold_data.prediction_data)
            plot_r2(config, fold_data.test_labels, pred_score, prefix=prefix)
        else:
            # plot cost
            make_cost_acc_plot(fold_data.training_cost,
                               fold_data.validation_cost,
                               fold_data.training_acc,
                               fold_data.validation_acc,
                               result_path + prefix)
            # plot AUC
            pred_score = np.array(fold_data.prediction_data)
            plot_auc(config, fold_data.test_labels, pred_score, prefix=prefix)
def train(sess, config):
    """Train a model and report its cost/accuracy on train and validation data.

    When ``config["validation_dataset"]`` is ``None`` the dataset is split
    into training and validation parts by ``load_and_split_data``; otherwise
    the training and validation datasets are loaded separately and their
    ``graph_node_num`` / ``graph_num`` entries are combined in ``info``.
    After fitting, both parts are evaluated and, depending on the
    configuration, summaries and prediction data are written to disk.

    Args:
        sess: TensorFlow session handed to ``CoreModel``.
        config: Mapping of settings.  Keys read here: ``dataset``,
            ``validation_dataset``, ``validation_data_rate``, ``model.py``,
            ``save_info_valid``, ``save_info_train`` and the optional
            ``reconstruction_valid`` / ``reconstruction_train``.
    """
    if config["validation_dataset"] is None:
        _all_data, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        # One `info` has to describe both datasets: the larger node count
        # and the total number of graphs.
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    # train model
    info.graph_index_list = [[i, i] for i in range(info["graph_num"])]
    info.pos_weight = get_pos_weight(train_data)
    info.norm = get_norm(train_data)
    print(f"pos_weight={info.pos_weight}")
    print(f"norm={info.norm}")

    model = CoreModel(sess, config, info,
                      construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"])

    for v in tf.trainable_variables():
        print(v)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time:{train_time}[sec]")

    # Validation (infer_time covers both evaluation passes)
    start_t = time.time()
    validation_cost, validation_accuracy, validation_prediction_data = \
        model.pred_and_eval(valid_data)
    training_cost, training_accuracy, training_prediction_data = \
        model.pred_and_eval(train_data)
    infer_time = time.time() - start_t
    print(f"final cost(training ) = {training_cost}\n"
          f"accuracy (training ) = {training_accuracy['accuracy']}\n"
          f"final cost(validation) = {validation_cost}\n"
          f"accuracy (validation) = {validation_accuracy['accuracy']}\n"
          f"infer time:{infer_time}[sec]\n")

    # Saving
    if config["save_info_valid"] is not None:
        result = {
            "validation_cost": validation_cost,
            "validation_accuracy": validation_accuracy["accuracy"],
            "train_time": train_time,
            "infer_time": infer_time,
        }
        save_path = config["save_info_valid"]
        # `or "."` keeps makedirs from failing on a bare filename, whose
        # dirname is the empty string.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            # Same encoder as the training summary below, so both dumps
            # accept the same value types.
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_info_train"] is not None:
        # NOTE(review): the training results are stored under "test_*"
        # keys; kept as is because readers of this file may rely on them.
        result = {
            "test_cost": training_cost,
            "test_accuracy": training_accuracy["accuracy"],
            "train_time": train_time,
        }
        save_path = config["save_info_train"]
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if "reconstruction_valid" in config:
        filename = config["reconstruction_valid"]
        print(os.path.dirname(filename))
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(validation_prediction_data, filename)

    if "reconstruction_train" in config:
        filename = config["reconstruction_train"]
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(training_prediction_data, filename)
def train(sess, config):
    """Train a model and report its cost/accuracy on train and validation data.

    The dataset ``config["dataset"]`` is loaded and split by
    ``load_and_split_data``, the model module named by
    ``config["model.py"]`` is imported and built, and the model is fitted.
    Both parts are then evaluated and, depending on the configuration,
    summaries and prediction data are written to disk.

    Args:
        sess: TensorFlow session handed to ``CoreModel``.
        config: Mapping of settings.  Keys read here: ``dataset``,
            ``validation_data_rate``, ``model.py``, ``save_info_valid``,
            ``save_info_train`` and the optional ``reconstruction_valid`` /
            ``reconstruction_train``.
    """
    all_data, train_data, valid_data, info = load_and_split_data(
        config,
        filename=config["dataset"],
        valid_data_rate=config["validation_data_rate"])

    # train model
    info.graph_index_list = [[i, i] for i in range(all_data.num)]
    info.pos_weight = get_pos_weight(train_data)
    info.norm = get_norm(train_data)
    print("pos_weight=", info.pos_weight)
    # Report the norm itself; this line used to print pos_weight a second
    # time.
    print("norm=", info.norm)

    model = CoreModel(sess, config, info,
                      construct_feed_callback=construct_feed)
    model.build(importlib.import_module(config["model.py"]))

    for v in tf.trainable_variables():
        print(v)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("traing time:{0}".format(train_time) + "[sec]")

    # Validation (infer_time covers both evaluation passes)
    start_t = time.time()
    validation_cost, validation_accuracy, validation_prediction_data = \
        model.pred_and_eval(valid_data)
    training_cost, training_accuracy, training_prediction_data = \
        model.pred_and_eval(train_data)
    infer_time = time.time() - start_t
    print("final cost(training ) =", training_cost)
    print("accuracy (training ) =", training_accuracy["accuracy"])
    print("final cost(validation) =", validation_cost)
    print("accuracy (validation) =", validation_accuracy["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")

    # Saving
    if config["save_info_valid"] is not None:
        result = {
            "validation_cost": validation_cost,
            "validation_accuracy": validation_accuracy["accuracy"],
            "train_time": train_time,
            "infer_time": infer_time,
        }
        save_path = config["save_info_valid"]
        # `or "."` keeps makedirs from failing on a bare filename, whose
        # dirname is the empty string.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        with open(save_path, "w") as fp:
            # Same encoder as the training summary below, so both dumps
            # accept the same value types.
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_info_train"] is not None:
        # NOTE(review): the training results are stored under "test_*"
        # keys; kept as is because readers of this file may rely on them.
        result = {
            "test_cost": training_cost,
            "test_accuracy": training_accuracy["accuracy"],
            "train_time": train_time,
        }
        save_path = config["save_info_train"]
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if "reconstruction_valid" in config:
        filename = config["reconstruction_valid"]
        print(os.path.dirname(filename))
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(validation_prediction_data, filename)

    if "reconstruction_train" in config:
        filename = config["reconstruction_train"]
        os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(training_prediction_data, filename)
def infer(sess, graph, config):
    """Restore a saved model, evaluate it and report per-label test metrics.

    The dataset named by ``config["dataset_test"]`` (falling back to
    ``config["dataset"]``) is loaded, the checkpoint ``config["load_model"]``
    is restored into ``sess``, and ``model.pred_and_eval`` is run once over
    the whole dataset.  When ``config["save_info_test"]`` is set, metrics are
    computed for every label (r2/mse, gmfe, or auc/acc/precision/recall/f/
    support depending on ``config["task"]``) and written with the summary.
    Predictions, an AUC plot and per-edge scores are written when configured.

    Args:
        sess: TensorFlow session handed to ``CoreModel`` and used to restore
            the checkpoint.
        graph: Not used by this function; kept for interface compatibility.
        config: Mapping of settings.  Keys read here: ``dataset``,
            ``dataset_test`` (optional), ``model.py``, ``load_model``,
            ``task``, ``save_info_test``, ``save_result_test``,
            ``make_plot`` and ``save_edge_result_test`` (optional).
    """
    # Imported by name so the regression metrics do not depend on a
    # module-level ``import sklearn``.
    from sklearn.metrics import (accuracy_score, auc, mean_squared_error,
                                 precision_recall_fscore_support, r2_score,
                                 roc_curve)

    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)

    # Variables are taken from the checkpoint, so no global initializer is
    # run before restoring.
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])

    # Evaluation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy =", test_metrics["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")

    if config["save_info_test"] is not None:
        result = {
            "test_cost": test_cost,
            "test_accuracy": test_metrics,
            "infer_time": infer_time,
        }

        pred_score = np.array(prediction_data)
        if len(pred_score.shape) == 3:  # multi-label-multi-task
            # #data x # task x #class
            # => this program supports only 2 labels
            pred_score = pred_score[:, :, 1]
        true_label = np.array(all_data.labels)
        # Promote 1-D arrays to a single column so every label can be
        # addressed as [:, i].
        if len(pred_score.shape) == 1:
            pred_score = pred_score[:, np.newaxis]
        if len(true_label.shape) == 1:
            true_label = true_label[:, np.newaxis]

        per_label = []
        for i in range(info.label_dim):
            y_true = true_label[:, i]
            y_score = pred_score[:, i]
            el = {}
            if config["task"] == "regression":
                el["r2"] = r2_score(y_true, y_score)
                el["mse"] = mean_squared_error(y_true, y_score)
            elif config["task"] == "regression_gmfe":
                el["gmfe"] = np.exp(np.mean(np.log(y_true / y_score)))
            else:
                # Scores above 0.5 count as the positive class.
                y_pred = (y_score > 0.5).astype(float)
                fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=1)
                scores = precision_recall_fscore_support(
                    y_true, y_pred, average='binary')
                el["auc"] = auc(fpr, tpr)
                el["acc"] = accuracy_score(y_true, y_pred)
                el["pre"] = scores[0]
                el["rec"] = scores[1]
                el["f"] = scores[2]
                el["sup"] = scores[3]
            per_label.append(el)
        # NOTE(review): only the metrics of the LAST label are stored, which
        # keeps the existing output schema (a single dict).  For multi-label
        # data the other entries of `per_label` are dropped - confirm whether
        # the full list should be written instead.
        result["test_metrics"] = per_label[-1] if per_label else {}

        save_path = config["save_info_test"]
        # `or "."` keeps makedirs from failing on a bare filename, whose
        # dirname is the empty string.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)

    if config["make_plot"]:
        plot_auc(config, all_data.labels, np.array(prediction_data))

    if ("save_edge_result_test" in config
            and config["save_edge_result_test"] is not None):
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        print(true_label.shape)
        # Each row of true_label[0] unpacks into six values; the 1st/3rd and
        # 4th/6th index the two scores looked up in pred_score[0].
        score_list = [
            [pred_score[0, i1, j1], pred_score[0, i2, j2]]
            for i1, _, j1, i2, _, j2 in true_label[0]
        ]
        fold = {
            "output": output_data[0],
            "score": np.array(score_list),
        }
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)