Example 1
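All of the examples below are excerpts from kGCN's gcn.py and omit the file's module-level imports. As a rough sketch of what the snippets assume (the exact list in the repository may differ):

import importlib
import json
import os
import time

import joblib
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import graph_util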
def train(sess, graph, config):
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]

    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"], valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    model.build(importlib.import_module(config["model.py"]))
    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
        writer = tf.summary.FileWriter('logs', sess.graph)
    
    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("training time:{0}".format(train_time) + "[sec]")
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        validation_cost, validation_metrics, prediction_data = model.pred_and_eval(valid_data)
        infer_time = time.time() - start_t
        print("final cost =", validation_cost)
        print("accuracy   =", validation_metrics["accuracy"])
        print("validation time:{0}".format(infer_time) + "[sec]")
    # Saving
    if config["save_info_valid"] is not None:
        result={}
        result["validation_cost"]=validation_cost
        result["validation_accuracy"]=validation_metrics
        result["train_time"]=train_time
        result["infer_time"]=infer_time
        save_path=config["save_info_valid"]
        print("[SAVE] ",save_path)
        fp=open(save_path,"w")
        json.dump(result,fp, indent=4, cls=NumPyArangeEncoder)
    
    if config["export_model"]:
        try:
            print("[SAVE]",config["export_model"])
            graph_def = graph_util.convert_variables_to_constants(sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def, '.', config["export_model"], as_text=False)
        except Exception:
            print("[ERROR] 'output' node not found")
    if config["save_result_valid"] is not None:
        filename=config["save_result_valid"]
        save_prediction(filename,prediction_data)
    if config["make_plot"]:
        plot_cost(config,valid_data,model)
        plot_auc(config,valid_data.labels,np.array(prediction_data))
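The json.dump calls above pass cls=NumPyArangeEncoder, a helper defined elsewhere in kGCN. A minimal sketch of such an encoder, assuming its only job is to make NumPy values JSON-serializable:

import json
import numpy as np

class NumPyArangeEncoder(json.JSONEncoder):
    # converts NumPy scalars and arrays into plain Python types
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)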
Example 2
def reconstruct(sess, config):
    batch_size = config["batch_size"]
    model = importlib.import_module(config["model.py"])
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    graph_index_list = []
    for i in range(all_data.num):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print("pos_weight=", info.pos_weight)
    print("norm=", info.pos_weight)

    model = CoreModel(sess,
                      config,
                      info,
                      construct_feed_callback=construct_feed)
    model.build(importlib.import_module(config["model.py"]), is_train=False)

    vars_to_train = tf.trainable_variables()
    for v in vars_to_train:
        print(v)

    # initialize session
    saver = tf.train.Saver()
    #sess.run(tf.global_variables_initializer())
    print("[load]", config["load_model"])
    saver.restore(sess, config["load_model"])

    start_t = time.time()
    cost, acc, pred_data = model.pred_and_eval(all_data)
    recons_data = pred_data
    """
    recons_data=[]
    for i in range(3):
        print(i)
        cost,acc,pred_data=model.pred_and_eval(all_data)
        recons_data.append(pred_data)
    """
    if "reconstruction_test" in config:
        filename = config["reconstruction_test"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(recons_data, filename)
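get_pos_weight and get_norm are not shown in these excerpts. In GAE/VGAE-style reconstruction they are usually the negative-to-positive ratio and a normalizing constant computed from the adjacency matrices; a sketch under the assumption that data.adjs holds dense 0/1 adjacency matrices (the attribute name is hypothetical):

import numpy as np

def get_pos_weight(data):
    # ratio of zero entries to one entries; re-weights the positive class
    adj = np.asarray(data.adjs)
    n_pos = adj.sum()
    return float(adj.size - n_pos) / n_pos

def get_norm(data):
    # normalizing constant of the weighted reconstruction loss
    adj = np.asarray(data.adjs)
    n_pos = adj.sum()
    return adj.size / (2.0 * (adj.size - n_pos))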
Example 3
def infer(sess, graph, config):
    batch_size = config["batch_size"]
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)
    
    model = CoreModel(sess, config, info)
    model.build(importlib.import_module(config["model.py"]), is_train=False)
    
    # Initialize session
    saver = tf.train.Saver()
    #sess.run(tf.global_variables_initializer())
    print("[LOAD]",config["load_model"])
    saver.restore(sess,config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =",test_cost)
    print("accuracy   =",test_metrics["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")
    
    if config["save_info_test"] is not None:
        result={}
        result["test_cost"]=test_cost
        result["test_accuracy"]=test_metrics
        result["infer_time"]=infer_time
        save_path=config["save_info_test"]
        print("[SAVE] ",save_path)
        fp=open(save_path,"w")
        json.dump(result,fp, indent=4, cls=NumPyArangeEncoder)
    
    if config["prediction_data"] is None:
        print("[ERROR] prediction_data is required")
        quit()

    obj = {}
    obj["prediction_data"] = prediction_data
    obj["labels"] = all_data.labels

    os.makedirs(os.path.dirname(config["prediction_data"]), exist_ok=True)
    joblib.dump(obj, config["prediction_data"])
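The object dumped at the end of infer can be read back with joblib for downstream analysis, for example (the path is illustrative):

import joblib

obj = joblib.load("result/prediction.jbl")  # whatever config["prediction_data"] pointed to
print(len(obj["prediction_data"]), len(obj["labels"]))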
Example 4
def reconstruct(sess, config):
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    graph_index_list = []
    for i in range(all_data.num):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print(f"pos_weight={info.pos_weight}")
    print(f"norm={info.norm}")

    model = CoreModel(sess,
                      config,
                      info,
                      construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"], is_train=False)

    vars_to_train = tf.trainable_variables()
    for v in vars_to_train:
        print(v)

    # restore the trained parameters
    restore_ckpt(sess, config["load_model"])

    start_t = time.time()
    cost, acc, pred_data = model.pred_and_eval(all_data)
    recons_data = pred_data
    """
    recons_data=[]
    for i in range(3):
        print(i)
        cost,acc,pred_data=model.pred_and_eval(all_data)
        recons_data.append(pred_data)
    """
    if "reconstruction_test" in config:
        filename = config["reconstruction_test"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(recons_data, filename)
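Example 4 is a refactor of Example 2: the explicit tf.train.Saver and model.build calls are wrapped in restore_ckpt and load_model_py. A plausible minimal sketch of those wrappers (not the actual kGCN implementations):

import importlib
import tensorflow as tf

def restore_ckpt(sess, ckpt_path):
    # restore trained variables from a checkpoint
    saver = tf.train.Saver()
    print("[LOAD]", ckpt_path)
    saver.restore(sess, ckpt_path)

def load_model_py(model, model_py, is_train=True, **build_kwargs):
    # import the user-supplied model module and build the network with it
    module = importlib.import_module(model_py)
    model.build(module, is_train=is_train, **build_kwargs)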
Example 5
File: gcn.py Project: 0h-n0/kGCN
def visualize(sess, config, args):
    from tensorflow.python import debug as tf_debug
    from kgcn.visualization import cal_feature_IG
    # input one molecule at a time
    batch_size = 1
    # From the input data, obtain, in order, information on the whole dataset,
    # the training data, the validation data, and the graphs
    all_data, info = load_data(config,
                               filename=config["dataset"],
                               prohibit_shuffle=True)
    #all_data.labels = tf.one_hot(tf.cast(tf.squeeze(all_data.labels), tf.int32), depth=2)
    model = importlib.import_module(config["model.py"])
    placeholders = model.build_placeholders(info,
                                            config,
                                            batch_size=batch_size)
    try:
        _model, prediction, _, _, _ = model.build_model(
            placeholders,
            info,
            config,
            batch_size=batch_size,
            feed_embedded_layer=True)
    except Exception:
        _model, prediction, _, _, _ = model.build_model(placeholders,
                                                        info,
                                                        config,
                                                        batch_size=batch_size)
    #--- initialize the session
    saver = tf.train.Saver()
    print("[LOAD]", config["load_model"])

    saver.restore(sess, config["load_model"])
    #--- compute integrated gradients
    cal_feature_IG(sess,
                   all_data,
                   placeholders,
                   info,
                   prediction,
                   args.ig_modal_target,
                   args.ig_label_target,
                   logger=tf.logging,
                   model=_model)
Example 6
def visualize(sess, config, args):
    from kgcn.visualization import cal_feature_IG, cal_feature_IG_for_kg
    # input a molecule at a time
    batch_size = 1
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config,
                               filename=dataset_filename,
                               prohibit_shuffle=True)

    model = CoreModel(sess, config, info)
    load_model_py(model,
                  config["model.py"],
                  is_train=False,
                  feed_embedded_layer=True,
                  batch_size=batch_size)
    placeholders = model.placeholders
    restore_ckpt(sess, config['load_model'])
    # calculate integrated gradients
    if config['visualize_type'] == 'graph':
        cal_feature_IG(sess,
                       all_data,
                       placeholders,
                       info,
                       config,
                       model.prediction,
                       args.ig_modal_target,
                       args.ig_label_target,
                       logger=tf.logging,
                       model=model.nn,
                       args=args)
    else:
        cal_feature_IG_for_kg(sess,
                              all_data,
                              placeholders,
                              info,
                              config,
                              model.prediction,
                              logger=tf.logging,
                              model=model.nn,
                              args=args)
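Both visualize variants delegate to cal_feature_IG, i.e., integrated gradients. For reference, standard IG (not the kGCN implementation) attributes an input x against a baseline x0 by averaging gradients along the straight-line path between them:

import numpy as np

def integrated_gradients(grad_fn, x, baseline, steps=50):
    # grad_fn(z) must return d(target output)/dz evaluated at z
    alphas = np.linspace(0.0, 1.0, steps)
    grads = [grad_fn(baseline + a * (x - baseline)) for a in alphas]
    return (x - baseline) * np.mean(grads, axis=0)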
Example 7
def generate(sess, config):
    batch_size = config["batch_size"]
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    graph_index_list = []
    for i in range(all_data.num):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
    info.pos_weight = get_pos_weight(all_data)
    info.norm = get_norm(all_data)
    print("pos_weight=", info.pos_weight)
    print("norm=", info.pos_weight)

    model = CoreModel(sess,
                      config,
                      info,
                      construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"], is_train=False)
    # restore the trained parameters
    restore_ckpt(sess, config["load_model"])

    start_t = time.time()
    cost, acc, pred_data = model.pred_and_eval(all_data)
    generated_data = pred_data

    if "generation_test" in config:
        filename = config["generation_test"]
        dirname = os.path.dirname(filename)
        if dirname != "": os.makedirs(dirname, exist_ok=True)
        print("[SAVE]", filename)
        joblib.dump(generated_data, filename)
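For reference, the configuration keys that generate reads can be collected into a single dict; all paths below are illustrative:

config = {
    "batch_size": 32,
    "dataset": "data/dataset.jbl",
    "dataset_test": "data/dataset_test.jbl",    # optional override
    "model.py": "model",                         # module name passed to importlib
    "load_model": "model/model.ckpt",
    "generation_test": "result/generated.jbl",  # optional output path
}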
Example 8
def train_cv(sess, graph, config):
    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])
    # Training
    if config["stratified_kfold"]:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=config["shuffle_data"],
                             random_state=123)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=config["shuffle_data"],
                   random_state=123)

    kf_count = 1
    fold_data_list = []
    output_data_list = []
    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if config["stratified_kfold"]:
        split_base = np.argmax(split_base, axis=1)
    score_metrics = []
    if config["task"] == "regression":
        metric_name = "mse"
    elif config["task"] == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"
    split_data_generator = kf.split(
        split_base,
        split_base) if config["stratified_kfold"] else kf.split(split_base)
    for train_valid_list, test_list in split_data_generator:
        print(f"starting fold: {kf_count}")
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)

        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])
        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print(f"training time: {train_time}[sec]")
        # Test
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"infer time: {infer_time}[sec]\n")
        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print(f"final cost = {test_cost}\n"
              f"{metric_name} = {test_metrics[metric_name]}\n")
        score_metrics.append(test_metrics[metric_name])
        print(f"infer time: {infer_time}[sec]")

        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print(f"[SAVE] {filename}")
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception:
                print("[ERROR] 'output' node not found")
        if "save_edge_result_cv" in config:
            output_data = model.output(test_data)
            output_data_list.append(output_data)
        # save fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if config["task"] == "regression":
            fold_data.training_mse = [
                el["training_mse"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_mse"] for el in model.validation_metrics_list
            ]
        elif config["task"] == "regression_gmfe":
            fold_data.training_mse = [
                el["training_gmfe"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_gmfe"] for el in model.validation_metrics_list
            ]
        else:
            fold_data.training_acc = [
                el["training_accuracy"] for el in model.training_metrics_list
            ]
            fold_data.validation_acc = [
                el["validation_accuracy"]
                for el in model.validation_metrics_list
            ]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)
        kf_count += 1

    print(f"cv {metric_name}(mean) = {np.mean(score_metrics)}\n"
          f"cv {metric_name}(std.)   = {np.std(score_metrics)}\n")
    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold_data_list, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)
    #
    if "save_edge_result_cv" in config and config[
            "save_edge_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            test_idx = fold_data.test_data_idx
            score_list = []
            for pair in true_label[0]:
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
                score_list.append([s1, s2])
            fold = {}
            fold["output"] = output_data_list[j][0]
            fold["score"] = np.array(score_list)
            fold["test_data_idx"] = test_idx
            result_cv.append(fold)
        save_path = config["save_edge_result_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)
    #
    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            v = compute_metrics(config, info, fold_data.prediction_data,
                                fold_data.test_labels)
            result_cv.append(v)
        save_path = config["save_result_cv"]
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
    #
    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        os.makedirs(result_path, exist_ok=True)
        if config["make_plot"]:
            if config["task"] == "regression":
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_mse,
                                   fold_data.validation_mse,
                                   result_path,
                                   prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config,
                        fold_data.test_labels,
                        pred_score,
                        prefix=prefix)
            elif config["task"] == "regression_gmfe":
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_mse,
                                   fold_data.validation_mse,
                                   result_path,
                                   prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config,
                        fold_data.test_labels,
                        pred_score,
                        prefix=prefix)
            elif config["task"] == "link_prediction":
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_acc,
                                   fold_data.validation_acc,
                                   result_path,
                                   prefix=prefix)
            else:
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_acc,
                                   fold_data.validation_acc,
                                   result_path,
                                   prefix=prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config,
                         fold_data.test_labels,
                         pred_score,
                         prefix=prefix)
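train_cv stores per-fold results in a dotdict, a dict subclass with attribute access that kGCN defines elsewhere. The usual minimal implementation, assumed here as a sketch:

class dotdict(dict):
    # dict whose keys are also readable/writable as attributes (fold_data.test_cost)
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__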
Example 9
def train(sess, graph, config):
    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    metric_name = ("mse" if config["task"] == "regression" else "gmfe"
                   if config["task"] == "regression_gmfe" else "accuracy")

    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time: {train_time}[sec]")
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print(f"final cost = {valid_cost}\n"
              f"{metric_name} = {valid_metrics[metric_name]}\n"
              f"validation time: {infer_time}[sec]\n")
        # Saving
        if config["save_info_valid"] is not None:
            result = {}
            result["validation_cost"] = valid_cost
            result["validation_accuracy"] = valid_metrics
            result["train_time"] = train_time
            result["infer_time"] = infer_time
            if config["task"] != "link_prediction":
                result["valid_metrics"] = compute_metrics(
                    config, info, prediction_data, valid_data.labels)
            ##
            save_path = config["save_info_valid"]
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            print(f"[SAVE] {save_path}")
            with open(save_path, "w") as fp:
                json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["export_model"]:
        try:
            print(f"[SAVE] {config['export_model']}")
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def,
                                 '.',
                                 config["export_model"],
                                 as_text=False)
        except Exception:
            print("[ERROR] 'output' node not found")
    if config["save_result_valid"] is not None:
        filename = config["save_result_valid"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        if config["task"] == "regression" or config[
                "task"] == "regression_gmfe":
            # plot_cost(config, valid_data, model)
            plot_r2(config, valid_data.labels, np.array(prediction_data))
        elif config["task"] == "link_prediction":
            plot_cost(config, valid_data, model)
        else:
            plot_cost(config, valid_data, model)
            plot_auc(config, valid_data.labels, np.array(prediction_data))
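Example 9 delegates per-task metrics to compute_metrics, whose body is not shown here; Examples 12 and 13 below inline the equivalent per-label loop. A trimmed sketch of what the helper presumably factored out (binary classification and regression only):

import numpy as np
import sklearn.metrics

def compute_metrics(config, info, prediction_data, labels):
    pred_score = np.array(prediction_data)
    if pred_score.ndim == 3:          # data x task x class, 2-class only
        pred_score = pred_score[:, :, 1]
    true_label = np.array(labels)
    if pred_score.ndim == 1:
        pred_score = pred_score[:, np.newaxis]
    if true_label.ndim == 1:
        true_label = true_label[:, np.newaxis]
    v = []
    for i in range(info.label_dim):
        el = {}
        if config["task"] == "regression":
            el["r2"] = sklearn.metrics.r2_score(true_label[:, i], pred_score[:, i])
            el["mse"] = sklearn.metrics.mean_squared_error(true_label[:, i], pred_score[:, i])
        else:
            pred = (pred_score[:, i] > 0.5).astype(int)
            fpr, tpr, _ = sklearn.metrics.roc_curve(true_label[:, i], pred_score[:, i], pos_label=1)
            el["auc"] = sklearn.metrics.auc(fpr, tpr)
            el["acc"] = sklearn.metrics.accuracy_score(true_label[:, i], pred)
        v.append(el)
    return v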
Example 10
def train(sess, config):
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]
    model = importlib.import_module(config["model.py"])
    all_data, info = load_data(config, filename=config["dataset"])
    placeholders = model.build_placeholders(
        info, batch_size=batch_size, adj_channel_num=info.adj_channel_num)
    _, prediction, cost, cost_sum, metrics = model.build_model(
        placeholders,
        info,
        batch_size=batch_size,
        adj_channel_num=info.adj_channel_num,
        embedding_dim=config["embedding_dim"])
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)

    #train_step = tf.train.MomentumOptimizer(learning_rate,0.01).minimize(cost)
    # Initialize session
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    # Train model
    all_list_num = len(info.graph_index_list)
    print("#graph_index list = ", all_list_num)
    print("#data            = ", all_data.num)

    data_idx = list(range(len(info.graph_index_list)))
    n = int(all_list_num * 0.8)
    train_idx = data_idx[:n]
    train_num = len(train_idx)
    valid_idx = data_idx[n:]
    valid_num = len(valid_idx)
    early_stopping = EarlyStopping(config)
    start_t = time.time()
    for epoch in range(config["epoch"]):  #[range(FLAGS.epochs):
        np.random.shuffle(train_idx)
        # training
        itr_num = int(np.ceil(train_num / batch_size))
        training_cost = 0
        training_correct_count = 0
        for itr in range(itr_num):
            offset_b = itr * batch_size
            batch_idx = train_idx[offset_b:offset_b + batch_size]
            feed_dict = construct_feed(batch_idx,
                                       placeholders,
                                       all_data,
                                       info.graph_index_list,
                                       batch_size=batch_size,
                                       dropout_rate=0.5)
            # run one parameter-update step with TensorFlow
            _, out_cost_sum, out_metrics = sess.run(
                [train_step, cost_sum, metrics], feed_dict=feed_dict)
            training_cost += out_cost_sum
            training_correct_count += out_metrics["correct_count"]
            #print(out_metrics["correct_count"])
            #print(batch_size)
        training_cost /= train_num
        training_accuracy = training_correct_count / train_num

        # validation
        itr_num = int(np.ceil(valid_num / batch_size))
        validation_cost = 0
        validation_correct_count = 0
        for itr in range(itr_num):
            offset_b = itr * batch_size
            batch_idx = valid_idx[offset_b:offset_b + batch_size]
            feed_dict = construct_feed(batch_idx,
                                       placeholders,
                                       all_data,
                                       info.graph_index_list,
                                       batch_size=batch_size)
            out_cost_sum, out_metrics = sess.run([cost_sum, metrics],
                                                 feed_dict=feed_dict)
            validation_cost += out_cost_sum
            validation_correct_count += out_metrics["correct_count"]
        validation_cost /= valid_num
        validation_accuracy = validation_correct_count / valid_num

        # check point
        save_path = None
        if (epoch) % config["save_interval"] == 0:
            # save
            save_path = config["save_model_path"] + "/model.%05d.ckpt" % (
                epoch)
            saver.save(sess, save_path)
        # early stopping and printing information
        if early_stopping.evaluate_validation(
                validation_cost, {
                    "epoch": epoch,
                    "validation_accuracy": validation_accuracy,
                    "validation_cost": validation_cost,
                    "training_accuracy": training_accuracy,
                    "training_cost": training_cost,
                    "save_path": save_path
                }):
            break

    train_time = time.time() - start_t
    print("traing time:{0}".format(train_time) + "[sec]")

    # saving last model
    #save_path =  config["save_model_path"]+"/model.last.ckpt"
    if "save_model" in config and config["save_model"] is not None:
        save_path = config["save_model"]
        print("[SAVE] ", save_path)
        saver.save(sess, save_path)
    # validation
    start_t = time.time()
    data_idx = list(range(all_data.num))
    itr_num = int(np.ceil(all_data.num / batch_size))
    validation_cost = 0
    validation_correct_count = 0
    prediction_data = []
    for itr in range(itr_num):
        offset_b = itr * batch_size
        batch_idx = data_idx[offset_b:offset_b + batch_size]
        feed_dict = construct_feed(batch_idx,
                                   placeholders,
                                   all_data,
                                   batch_size=batch_size)
        out_cost_sum, out_metrics, out_prediction = sess.run(
            [cost_sum, metrics, prediction], feed_dict=feed_dict)
        validation_cost += out_cost_sum
        validation_correct_count += out_metrics["correct_count"]
        prediction_data.append(out_prediction)

    validation_cost /= all_data.num
    validation_accuracy = validation_correct_count / all_data.num
    print("final cost =", validation_cost)
    print("accuracy   =", validation_accuracy)
    infer_time = time.time() - start_t
    print("infer time:{0}".format(infer_time) + "[sec]")
    if "save_result_train" in config:
        filename = config["save_result_train"]
        save_prediction(filename, prediction_data)
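EarlyStopping is another helper not shown in these excerpts; its evaluate_validation(cost, info) is expected to log progress and return True when training should stop. A minimal patience-based sketch (the real kGCN class may differ):

class EarlyStopping:
    def __init__(self, config):
        self.patience = config.get("patience", 0)  # 0 disables early stopping
        self.best_cost = None
        self.wait = 0

    def evaluate_validation(self, validation_cost, info):
        print("epoch %(epoch)d: training cost=%(training_cost)f, "
              "validation cost=%(validation_cost)f" % info)
        if self.patience == 0:
            return False
        if self.best_cost is None or validation_cost < self.best_cost:
            self.best_cost = validation_cost
            self.wait = 0
            return False
        self.wait += 1
        return self.wait >= self.patience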
Example 11
File: gcn.py Project: 0h-n0/kGCN
def infer(sess, graph, config):
    batch_size = config["batch_size"]
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)

    # Initialize session
    saver = tf.train.Saver()
    #sess.run(tf.global_variables_initializer())
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy   =", test_metrics["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")

    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        save_path = config["save_info_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE] ", save_path)
        fp = open(save_path, "w")
        json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_auc(config, all_data.labels, np.array(prediction_data))
    if "save_edge_result_test" in config and config[
            "save_edge_result_test"] is not None:
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        test_idx = all_data.test_data_idx
        score_list = []
        print(true_label.shape)
        for pair in true_label[0]:
            i1, _, j1, i2, _, j2 = pair
            s1 = pred_score[0, i1, j1]
            s2 = pred_score[0, i2, j2]
            score_list.append([s1, s2])
        fold = {}
        fold["output"] = output_data[0]
        fold["score"] = np.array(score_list)
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            fp = open(save_path, "w")
            json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
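The save_edge_result_test block unpacks each row of true_label[0] as two (node_i, relation, node_j) triples, i.e., an edge and its paired counter-example, and records the model's score for both. With a hypothetical row:

pair = [0, 0, 5, 0, 0, 7]          # (i1, r1, j1, i2, r2, j2), flattened
i1, _, j1, i2, _, j2 = pair
# pred_score[0, i1, j1] -> score of edge (0, 5)
# pred_score[0, i2, j2] -> score of edge (0, 7)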
Example 12
File: gcn.py Project: 0h-n0/kGCN
def train_cv(sess, graph, config):
    from sklearn.model_selection import KFold, StratifiedKFold
    from kgcn.make_plots import make_auc_plot, make_cost_acc_plot
    import sklearn
    from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
    from scipy import interp

    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]

    all_data, info = load_data(
        config, filename=config["dataset"],
        prohibit_shuffle=True)  # shuffle is done by KFold
    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    # Training
    if config["stratified_kfold"]:
        print("[INFO] use stratified K-fold")
        kf = StratifiedKFold(n_splits=config["k-fold_num"],
                             shuffle=config["shuffle_data"],
                             random_state=123)
    else:
        kf = KFold(n_splits=config["k-fold_num"],
                   shuffle=config["shuffle_data"],
                   random_state=123)

    kf_count = 1
    fold_data_list = []
    output_data_list = []
    if all_data["labels"] is not None:
        split_base = all_data["labels"]
    else:
        split_base = all_data["label_list"][0]
    if config["stratified_kfold"]:
        split_base = np.argmax(split_base, axis=1)
    score_metrics = []
    if config["task"] == "regression":
        metric_name = "mse"
    elif config["task"] == "regression_gmfe":
        metric_name = "gmfe"
    else:
        metric_name = "accuracy"
    split_data_generator = kf.split(
        split_base,
        split_base) if config["stratified_kfold"] else kf.split(split_base)
    for train_valid_list, test_list in split_data_generator:
        print("starting fold:{0}".format(kf_count))
        train_valid_data, test_data = split_data(
            all_data,
            indices_for_train_data=train_valid_list,
            indices_for_valid_data=test_list)

        train_data, valid_data = split_data(
            train_valid_data, valid_data_rate=config["validation_data_rate"])
        # Training
        print(train_valid_list)
        print(test_list)
        start_t = time.time()
        model.fit(train_data, valid_data, k_fold_num=kf_count)
        train_time = time.time() - start_t
        print("traing time:{0}".format(train_time) + "[sec]")
        # Test
        print("== valid data ==")
        start_t = time.time()
        valid_cost, valid_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print("final cost =", valid_cost)
        print("%s   =%f" % (metric_name, valid_metrics[metric_name]))
        print("infer time:{0}".format(infer_time) + "[sec]")

        print("== test data ==")
        start_t = time.time()
        test_cost, test_metrics, prediction_data = model.pred_and_eval(
            test_data)
        infer_time = time.time() - start_t
        print("final cost =", test_cost)
        print("%s   =%f" % (metric_name, test_metrics[metric_name]))
        score_metrics.append(test_metrics[metric_name])
        print("infer time:{0}".format(infer_time) + "[sec]")

        if config["export_model"]:
            try:
                name, ext = os.path.splitext(config["export_model"])
                filename = name + "." + str(kf_count) + ext
                print("[SAVE]", filename)
                graph_def = graph_util.convert_variables_to_constants(
                    sess, graph.as_graph_def(), ['output'])
                tf.train.write_graph(graph_def, '.', filename, as_text=False)
            except Exception:
                print("[ERROR] 'output' node not found")
        if "save_edge_result_cv" in config:
            output_data = model.output(test_data)
            output_data_list.append(output_data)
        # save fold data
        fold_data = dotdict({})
        fold_data.prediction_data = prediction_data
        if all_data["labels"] is not None:
            fold_data.test_labels = test_data.labels
        else:
            fold_data.test_labels = test_data.label_list
        fold_data.test_data_idx = test_list
        if config["task"] == "regression":
            fold_data.training_mse = [
                el["training_mse"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_mse"] for el in model.validation_metrics_list
            ]
        elif config["task"] == "regression_gmfe":
            fold_data.training_mse = [
                el["training_gmfe"] for el in model.training_metrics_list
            ]
            fold_data.validation_mse = [
                el["validation_gmfe"] for el in model.validation_metrics_list
            ]
        else:
            fold_data.training_acc = [
                el["training_accuracy"] for el in model.training_metrics_list
            ]
            fold_data.validation_acc = [
                el["validation_accuracy"]
                for el in model.validation_metrics_list
            ]
        fold_data.test_acc = test_metrics[metric_name]
        fold_data.training_cost = model.training_cost_list
        fold_data.validation_cost = model.validation_cost_list
        fold_data.test_cost = test_cost
        fold_data.train_time = train_time
        fold_data.infer_time = infer_time
        fold_data_list.append(fold_data)
        kf_count += 1

    print("cv %s(mean)   =%f" % (metric_name, np.mean(score_metrics)))
    print("cv %s(std.)   =%f" % (metric_name, np.std(score_metrics)))
    if "save_info_cv" in config and config["save_info_cv"] is not None:
        save_path = config["save_info_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            fp = open(save_path, "w")
            json.dump(fold_data_list, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold_data_list, save_path, compress=True)
    ##
    if "save_edge_result_cv" in config and config[
            "save_edge_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            true_label = np.array(fold_data.test_labels)
            test_idx = fold_data.test_data_idx
            score_list = []
            for pair in true_label[0]:
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
                score_list.append([s1, s2])
            fold = {}
            fold["output"] = output_data_list[j][0]
            fold["score"] = np.array(score_list)
            fold["test_data_idx"] = test_idx
            result_cv.append(fold)
        save_path = config["save_edge_result_cv"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            fp = open(save_path, "w")
            json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(result_cv, save_path, compress=True)
    #
    #
    if "save_result_cv" in config and config["save_result_cv"] is not None:
        result_cv = []
        for j, fold_data in enumerate(fold_data_list):
            pred_score = np.array(fold_data.prediction_data)
            if len(pred_score.shape) == 3:  # multi-label-multi-task
                # #data x # task x #class
                # => this program supports only 2 labels
                pred_score = pred_score[:, :, 1]
            true_label = np.array(fold_data.test_labels)
            # #data x # task x #class
            if len(pred_score.shape) == 1:
                pred_score = pred_score[:, np.newaxis]
            if len(true_label.shape) == 1:
                true_label = true_label[:, np.newaxis]
            v = []
            for i in range(info.label_dim):
                el = {}
                if config["task"] == "regression":
                    el["r2"] = sklearn.metrics.r2_score(
                        true_label[:, i], pred_score[:, i])
                    el["mse"] = sklearn.metrics.mean_squared_error(
                        true_label[:, i], pred_score[:, i])
                elif config["task"] == "regression_gmfe":
                    el["gmfe"] = np.exp(
                        np.mean(np.log(true_label[:, i] / pred_score[:, i])))
                else:
                    pred = np.zeros(pred_score.shape)
                    pred[pred_score > 0.5] = 1
                    fpr, tpr, _ = roc_curve(true_label[:, i],
                                            pred_score[:, i],
                                            pos_label=1)
                    roc_auc = auc(fpr, tpr)
                    acc = accuracy_score(true_label[:, i], pred[:, i])
                    scores = precision_recall_fscore_support(true_label[:, i],
                                                             pred[:, i],
                                                             average='binary')
                    el["auc"] = roc_auc
                    el["acc"] = acc
                    el["pre"] = scores[0]
                    el["rec"] = scores[1]
                    el["f"] = scores[2]
                    el["sup"] = scores[3]
                v.append(el)
            result_cv.append(v)
        save_path = config["save_result_cv"]
        print("[SAVE] ", save_path)
        fp = open(save_path, "w")
        json.dump(result_cv, fp, indent=4, cls=NumPyArangeEncoder)
    #
    for i, fold_data in enumerate(fold_data_list):
        prefix = "fold" + str(i) + "_"
        result_path = config["plot_path"]
        os.makedirs(result_path, exist_ok=True)
        if config["make_plot"]:
            if config["task"] == "regression":
                # plot cost
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_mse,
                                   fold_data.validation_mse,
                                   result_path + prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config,
                        fold_data.test_labels,
                        pred_score,
                        prefix=prefix)
            elif config["task"] == "regression_gmfe":
                # plot cost
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_mse,
                                   fold_data.validation_mse,
                                   result_path + prefix)
                pred_score = np.array(fold_data.prediction_data)
                plot_r2(config,
                        fold_data.test_labels,
                        pred_score,
                        prefix=prefix)
            else:
                # plot cost
                make_cost_acc_plot(fold_data.training_cost,
                                   fold_data.validation_cost,
                                   fold_data.training_acc,
                                   fold_data.validation_acc,
                                   result_path + prefix)
                # plot AUC
                pred_score = np.array(fold_data.prediction_data)
                plot_auc(config,
                         fold_data.test_labels,
                         pred_score,
                         prefix=prefix)
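The regression_gmfe branches above compute GMFE (geometric mean fold error) as exp(mean(log(y_true / y_pred))). A self-contained check of that formula:

import numpy as np

def gmfe(y_true, y_pred):
    # geometric mean fold error exactly as in the snippets above
    return np.exp(np.mean(np.log(y_true / y_pred)))

print(gmfe(np.array([1.0, 4.0]), np.array([2.0, 2.0])))  # exp((log .5 + log 2)/2) = 1.0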
Example 13
def infer(sess, graph, config):
    import sklearn.metrics
    from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
    batch_size = config["batch_size"]
    model = importlib.import_module(config["model.py"])
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    all_data, info = load_data(config, filename=dataset_filename)

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)

    # Initialize session
    saver = tf.train.Saver()
    #sess.run(tf.global_variables_initializer())
    print("[LOAD]", config["load_model"])
    saver.restore(sess, config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print("final cost =", test_cost)
    print("accuracy   =", test_metrics["accuracy"])
    print("infer time:{0}".format(infer_time) + "[sec]")

    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        ##
        pred_score = np.array(prediction_data)
        if len(pred_score.shape) == 3:  # multi-label-multi-task
            # #data x # task x #class
            # => this program supports only 2 labels
            pred_score = pred_score[:, :, 1]
        true_label = np.array(all_data.labels)
        # #data x # task x #class
        if len(pred_score.shape) == 1:
            pred_score = pred_score[:, np.newaxis]
        if len(true_label.shape) == 1:
            true_label = true_label[:, np.newaxis]
        v = []
        for i in range(info.label_dim):
            el = {}
            if config["task"] == "regression":
                el["r2"] = sklearn.metrics.r2_score(true_label[:, i],
                                                    pred_score[:, i])
                el["mse"] = sklearn.metrics.mean_squared_error(
                    true_label[:, i], pred_score[:, i])
            elif config["task"] == "regression_gmfe":
                el["gmfe"] = np.exp(
                    np.mean(np.log(true_label[:, i] / pred_score[:, i])))
            else:
                pred = np.zeros(pred_score.shape)
                pred[pred_score > 0.5] = 1
                fpr, tpr, _ = roc_curve(true_label[:, i],
                                        pred_score[:, i],
                                        pos_label=1)
                roc_auc = auc(fpr, tpr)
                acc = accuracy_score(true_label[:, i], pred[:, i])
                scores = precision_recall_fscore_support(true_label[:, i],
                                                         pred[:, i],
                                                         average='binary')
                el["auc"] = roc_auc
                el["acc"] = acc
                el["pre"] = scores[0]
                el["rec"] = scores[1]
                el["f"] = scores[2]
                el["sup"] = scores[3]
            v.append(el)
        result["test_metrics"] = el
        ##
        save_path = config["save_info_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE] ", save_path)
        fp = open(save_path, "w")
        json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_auc(config, all_data.labels, np.array(prediction_data))
    if "save_edge_result_test" in config and config[
            "save_edge_result_test"] is not None:
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        test_idx = all_data.test_data_idx
        score_list = []
        print(true_label.shape)
        for pair in true_label[0]:
            i1, _, j1, i2, _, j2 = pair
            s1 = pred_score[0, i1, j1]
            s2 = pred_score[0, i2, j2]
            score_list.append([s1, s2])
        fold = {}
        fold["output"] = output_data[0]
        fold["score"] = np.array(score_list)
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print("[SAVE] ", save_path)
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            fp = open(save_path, "w")
            json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
Example 14
def visualize(sess, config, args):
    from tensorflow.python import debug as tf_debug
    from kgcn.visualization import cal_feature_IG, cal_feature_IG_for_kg
    # input one molecule at a time
    batch_size = 1
    # From the input data, obtain, in order, information on the whole dataset,
    # the training data, the validation data, and the graphs
    all_data, info = load_data(config,
                               filename=config["dataset"],
                               prohibit_shuffle=True)

    model = importlib.import_module(config["model.py"])
    try:
        # Visualization of a model that uses an embedding layer; IG is
        # computed with respect to the output of the embedding layer.
        placeholders = model.build_placeholders(info,
                                                config,
                                                batch_size=batch_size,
                                                feed_embedded_layer=True)
    except Exception:
        placeholders = model.build_placeholders(info,
                                                config,
                                                batch_size=batch_size)
    try:
        # Visualization of a model that uses an embedding layer; IG is
        # computed with respect to the output of the embedding layer.
        _model, prediction, _, _, _ = model.build_model(
            placeholders,
            info,
            config,
            batch_size=batch_size,
            feed_embedded_layer=True)
    except Exception:
        _model, prediction, _, _, _ = model.build_model(placeholders,
                                                        info,
                                                        config,
                                                        batch_size=batch_size)
    #--- initialize the session
    saver = tf.train.Saver()
    #tf.compat.v1.logging.info("[LOAD]", config["load_model"])
    tf.logging.info("[LOAD]", config["load_model"])

    saver.restore(sess, config["load_model"])
    #--- compute integrated gradients
    if config['visualize_type'] == 'graph':
        cal_feature_IG(sess,
                       all_data,
                       placeholders,
                       info,
                       prediction,
                       args.ig_modal_target,
                       args.ig_label_target,
                       logger=tf.logging,
                       model=_model)
        #logger=tf.compat.v1.logging, model=_model)
    else:
        cal_feature_IG_for_kg(sess,
                              all_data,
                              placeholders,
                              info,
                              config,
                              prediction,
                              logger=tf.logging,
                              model=_model)
Example 15
def infer(sess, graph, config):
    dataset_filename = config["dataset"]
    if "dataset_test" in config:
        dataset_filename = config["dataset_test"]
    if "test_label_list" in config:
        config["label_list"] = config["test_label_list"]
    all_data, info = load_data(config,
                               filename=dataset_filename,
                               prohibit_shuffle=True)

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"], is_train=False)

    metric_name = ("mse" if config["task"] == "regression" else "gmfe"
                   if config["task"] == "regression_gmfe" else "accuracy")

    # Initialize session
    restore_ckpt(sess, config["load_model"])

    # Validation
    start_t = time.time()
    test_cost, test_metrics, prediction_data = model.pred_and_eval(all_data)
    infer_time = time.time() - start_t
    print(f"final cost = {test_cost}\n"
          f"{metric_name} = {test_metrics[metric_name]}\n"
          f"infer time: {infer_time}[sec]\n")

    if config["save_info_test"] is not None:
        result = {}
        result["test_cost"] = test_cost
        result["test_accuracy"] = test_metrics
        result["infer_time"] = infer_time
        if config["task"] != "link_prediction":
            result["test_metrics"] = compute_metrics(config, info,
                                                     prediction_data,
                                                     all_data.labels)
        save_path = config["save_info_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_result_test"] is not None:
        filename = config["save_result_test"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        if config["task"] == "regression":
            pred_score = np.array(prediction_data)
            plot_r2(config, all_data.labels, pred_score)
        elif config["task"] == "regression_gmfe":
            pred_score = np.array(prediction_data)
            plot_r2(config, all_data.labels, pred_score)
        elif config["task"] == "link_prediction":
            pass
        else:
            plot_auc(config, all_data.labels, np.array(prediction_data))

    if "save_edge_result_test" in config and config[
            "save_edge_result_test"] is not None:
        #output_left_pred = model.left_pred(all_data)
        #print(output_left_pred.shape)
        ##
        output_data = model.output(all_data)
        pred_score = np.array(prediction_data)
        true_label = np.array(all_data.label_list)
        score_list = []
        print(true_label.shape)
        for pair in true_label[0]:
            if len(prediction_data[0].shape) == 2:
                i1, _, j1, i2, _, j2 = pair
                s1 = pred_score[0, i1, j1]
                s2 = pred_score[0, i2, j2]
            elif len(prediction_data[0].shape) == 3:
                i1, r1, j1, i2, r2, j2 = pair
                s1 = pred_score[0, r1, i1, j1]
                s2 = pred_score[0, r2, i2, j2]
            score_list.append([s1, s2])
        fold = {}
        fold["output"] = output_data[0]
        fold["score"] = np.array(score_list)
        save_path = config["save_edge_result_test"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        _, ext = os.path.splitext(save_path)
        if ext == ".json":
            with open(save_path, "w") as fp:
                json.dump(fold, fp, indent=4, cls=NumPyArangeEncoder)
        else:
            joblib.dump(fold, save_path, compress=True)
    if config["prediction_data"] is not None:
        obj = {}
        obj["prediction_data"] = prediction_data
        obj["labels"] = all_data.labels

        os.makedirs(os.path.dirname(config["prediction_data"]), exist_ok=True)
        joblib.dump(obj, config["prediction_data"], compress=True)
Example 16
0
def train(sess, config):
    if config["validation_dataset"] is None:
        all_data, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]
    # train model
    graph_index_list = []
    for i in range(info["graph_num"]):
        graph_index_list.append([i, i])
    info.graph_index_list = graph_index_list
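    # pos_weight and norm look like the usual class-imbalance terms of a
    # weighted cross-entropy reconstruction loss (as in VGAE-style link
    # prediction); how they are applied depends on the model module.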
    info.pos_weight = get_pos_weight(train_data)
    info.norm = get_norm(train_data)
    print(f"pos_weight={info.pos_weight}")
    print(f"norm={info.norm}")

    model = CoreModel(sess,
                      config,
                      info,
                      construct_feed_callback=construct_feed)
    load_model_py(model, config["model.py"])

    vars_to_train = tf.trainable_variables()
    for v in vars_to_train:
        print(v)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print(f"training time:{train_time}[sec]")
    # Validation
    start_t = time.time()
    validation_cost, validation_accuracy, validation_prediction_data = model.pred_and_eval(
        valid_data)
    training_cost, training_accuracy, training_prediction_data = model.pred_and_eval(
        train_data)
    infer_time = time.time() - start_t
    print(f"final cost(training  ) = {training_cost}\n"
          f"accuracy  (training  ) = {training_accuracy['accuracy']}\n"
          f"final cost(validation) = {validation_cost}\n"
          f"accuracy  (validation) = {validation_accuracy['accuracy']}\n"
          f"infer time:{infer_time}[sec]\n")
    # Saving
    if config["save_info_valid"] is not None:
        result = {}
        result["validation_cost"] = validation_cost
        result["validation_accuracy"] = validation_accuracy["accuracy"]
        result["train_time"] = train_time
        result["infer_time"] = infer_time
        save_path = config["save_info_valid"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["save_info_train"] is not None:
        result = {}
        result["test_cost"] = training_cost
        result["test_accuracy"] = training_accuracy["accuracy"]
        result["train_time"] = train_time
        save_path = config["save_info_train"]
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        print(f"[SAVE] {save_path}")
        with open(save_path, "w") as fp:
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if "reconstruction_valid" in config:
        filename = config["reconstruction_valid"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(validation_prediction_data, filename)
    if "reconstruction_train" in config:
        filename = config["reconstruction_train"]
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        print(f"[SAVE] {filename}")
        joblib.dump(training_prediction_data, filename)
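
A minimal driver sketch for the train(sess, config) function above, assuming the TF 1.x API used throughout these examples; the config keys mirror the lookups in the code and the file paths are placeholders, not a documented schema:

import tensorflow as tf

config = {
    "dataset": "data/train.jbl",        # placeholder path
    "validation_dataset": None,         # None selects load_and_split_data
    "validation_data_rate": 0.2,
    "model.py": "model",                # module name passed to load_model_py
    "save_info_valid": "result/info_valid.json",
    "save_info_train": None,
}

with tf.Graph().as_default():
    with tf.Session() as sess:
        train(sess, config)
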
Example n. 17
0
def train(sess, graph, config):
    from sklearn.metrics import roc_curve, auc, accuracy_score, precision_recall_fscore_support
    batch_size = config["batch_size"]
    learning_rate = config["learning_rate"]

    if config["validation_dataset"] is None:
        _, train_data, valid_data, info = load_and_split_data(
            config,
            filename=config["dataset"],
            valid_data_rate=config["validation_data_rate"])
    else:
        print("[INFO] training")
        train_data, info = load_data(config, filename=config["dataset"])
        print("[INFO] validation")
        valid_data, valid_info = load_data(
            config, filename=config["validation_dataset"])
        info["graph_node_num"] = max(info["graph_node_num"],
                                     valid_info["graph_node_num"])
        info["graph_num"] = info["graph_num"] + valid_info["graph_num"]

    model = CoreModel(sess, config, info)
    load_model_py(model, config["model.py"])

    if config["profile"]:
        vars_to_train = tf.trainable_variables()
        print(vars_to_train)
        writer = tf.summary.FileWriter('logs', sess.graph)

    # Training
    start_t = time.time()
    model.fit(train_data, valid_data)
    train_time = time.time() - start_t
    print("traing time:{0}".format(train_time) + "[sec]")
    if valid_data.num > 0:
        # Validation
        start_t = time.time()
        validation_cost, validation_metrics, prediction_data = model.pred_and_eval(
            valid_data)
        infer_time = time.time() - start_t
        print("final cost =", validation_cost)
        print("accuracy   =", validation_metrics["accuracy"])
        print("validation time:{0}".format(infer_time) + "[sec]")
        # Saving
        if config["save_info_valid"] is not None:
            result = {}
            result["validation_cost"] = validation_cost
            result["validation_accuracy"] = validation_metrics
            result["train_time"] = train_time
            result["infer_time"] = infer_time
            # compute per-task validation metrics
            pred_score = np.array(prediction_data)
            if len(pred_score.shape) == 3:  # multi-task: (#data, #task, #class)
                # only two-class labels are supported; keep the positive-class score
                pred_score = pred_score[:, :, 1]
            true_label = np.array(valid_data.labels)
            # promote 1-D arrays to shape (#data, 1) so the per-task loop works
            if len(pred_score.shape) == 1:
                pred_score = pred_score[:, np.newaxis]
            if len(true_label.shape) == 1:
                true_label = true_label[:, np.newaxis]
            v = []
            for i in range(info.label_dim):
                el = {}
                if config["task"] == "regression":
                    el["r2"] = sklearn.metrics.r2_score(
                        true_label[:, i], pred_score[:, i])
                    el["mse"] = sklearn.metrics.mean_squared_error(
                        true_label[:, i], pred_score[:, i])
                elif config["task"] == "regression_gmfe":
                    el["gmfe"] = np.exp(
                        np.mean(np.log(true_label[:, i] / pred_score[:, i])))
                else:
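                    # binarize scores at 0.5 for accuracy/precision/recall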
                    pred = np.zeros(pred_score.shape)
                    pred[pred_score > 0.5] = 1
                    fpr, tpr, _ = roc_curve(true_label[:, i],
                                            pred_score[:, i],
                                            pos_label=1)
                    roc_auc = auc(fpr, tpr)
                    acc = accuracy_score(true_label[:, i], pred[:, i])
                    scores = precision_recall_fscore_support(true_label[:, i],
                                                             pred[:, i],
                                                             average='binary')
                    el["auc"] = roc_auc
                    el["acc"] = acc
                    el["pre"] = scores[0]
                    el["rec"] = scores[1]
                    el["f"] = scores[2]
                    el["sup"] = scores[3]
                v.append(el)
            result["valid_metrics"] = el
            ##
            save_path = config["save_info_valid"]
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            print("[SAVE] ", save_path)
            fp = open(save_path, "w")
            json.dump(result, fp, indent=4, cls=NumPyArangeEncoder)

    if config["export_model"]:
        try:
            print("[SAVE]", config["export_model"])
            graph_def = graph_util.convert_variables_to_constants(
                sess, graph.as_graph_def(), ['output'])
            tf.train.write_graph(graph_def,
                                 '.',
                                 config["export_model"],
                                 as_text=False)
        except Exception:
            print("[ERROR] 'output' node was not found")
    if config["save_result_valid"] is not None:
        filename = config["save_result_valid"]
        save_prediction(filename, prediction_data)
    if config["make_plot"]:
        plot_cost(config, valid_data, model)
        plot_auc(config, valid_data.labels, np.array(prediction_data))
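
The export_model branch above freezes the graph around the 'output' node. As a complement, here is a sketch of how such a frozen graph could be loaded back for inference (TF 1.x API; the input tensor names depend on the model module, which is not shown):

import tensorflow as tf

def load_frozen_graph(path):
    # Read the serialized GraphDef written by tf.train.write_graph(...)
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(path, "rb") as f:
        graph_def.ParseFromString(f.read())
    # Import it into a fresh graph; 'output' matches the node name used
    # in convert_variables_to_constants above.
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def, name="")
    return graph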