Esempio n. 1
0
def main(csv_path,
         target_name,
         task='classification',
         model_name='tabnet',
         tb_log_location='./tflog',
         categorical_features=None,
         val_frac=0.25,
         test_frac=0.25,
         emb_size=1,
         feature_dim=128,
         output_dim=64,
         batch_size=512,
         virtual_batch_size=512,
         batch_momentum=0.7,
         gamma=1.5,
         n_steps=6,
         max_steps=25,
         lr=0.02,
         decay_every=500,
         lambda_sparsity=0.0001):
    """Train a TabNet model on a CSV file and report validation performance.

    The CSV is split into train/validation/test subsets, a TabNet graph is
    built for the requested task, and the model is trained in a TF1 session
    with periodic validation, TensorBoard summaries, checkpointing and early
    stopping.

    Args:
        csv_path: Path of the CSV file read with ``pd.read_csv``.
        target_name: Name of the label column.
        task: ``'classification'`` selects the classification head; any other
            value selects the regression head.
        model_name: Used for the TensorBoard sub-directory and the checkpoint
            file name.
        tb_log_location: Root directory for TensorBoard logs.
        categorical_features: Forwarded to ``prepare_dataset``; ``None`` is
            treated as an empty list.
        val_frac: Fraction of the *whole* dataset used for validation.
        test_frac: Fraction of the whole dataset used for testing.
        emb_size: Forwarded to ``prepare_dataset`` as ``embedding_dim``.
        feature_dim, output_dim, virtual_batch_size, batch_momentum:
            Forwarded unchanged to ``tabnet_model.TabNet``.
        batch_size: Batch size of all three input pipelines.
        gamma: Forwarded to TabNet as ``relaxation_factor``.
        n_steps: Forwarded to TabNet as ``num_decision_steps``.
        max_steps: Maximum number of training steps.
        lr: Initial learning rate of the exponential decay schedule.
        decay_every: ``decay_steps`` of the learning-rate schedule.
        lambda_sparsity: Weight of the entropy (sparsity) term in the loss.
    """
    # Avoid sharing one mutable default list between calls.
    if categorical_features is None:
        categorical_features = []
    is_classification = task == 'classification'

    all_data = pd.read_csv(csv_path)
    # Stratifying on a continuous regression target makes scikit-learn raise
    # (classes with a single member), so only stratify for classification.
    trainval_df, test_df = train_test_split(
        all_data,
        test_size=test_frac,
        stratify=all_data[target_name] if is_classification else None)
    # Rescale so that val_frac is still relative to the full dataset.
    val_frac_after_test_split = val_frac / (1 - test_frac)
    train_df, val_df = train_test_split(trainval_df,
                                        test_size=val_frac_after_test_split)

    dataset_info = prepare_dataset(all_data,
                                   categorical_features,
                                   target_name,
                                   task,
                                   embedding_dim=emb_size)

    # TabNet model
    tabnet = tabnet_model.TabNet(columns=dataset_info['feature_columns'],
                                 num_features=dataset_info['num_features'],
                                 feature_dim=feature_dim,
                                 output_dim=output_dim,
                                 num_decision_steps=n_steps,
                                 relaxation_factor=gamma,
                                 batch_momentum=batch_momentum,
                                 virtual_batch_size=virtual_batch_size,
                                 num_classes=dataset_info['num_classes'])

    # Fixed training parameters
    display_step = 5
    val_step = 5
    save_step = 5
    decay_rate = 0.95
    gradient_thresh = 2000.0
    early_stop_steps = 25

    # Input sampling: the training set is shuffled, evaluation sets are not.
    train_batch = pandas_input_fn(train_df,
                                  target_name,
                                  dataset_info,
                                  num_epochs=100000,
                                  shuffle=True,
                                  batch_size=batch_size,
                                  n_buffer=1)
    val_batch = pandas_input_fn(val_df,
                                target_name,
                                dataset_info,
                                num_epochs=10000,
                                shuffle=False,
                                batch_size=batch_size,
                                n_buffer=1)
    test_batch = pandas_input_fn(test_df,
                                 target_name,
                                 dataset_info,
                                 num_epochs=10000,
                                 shuffle=False,
                                 batch_size=batch_size,
                                 n_buffer=1)

    train_iter = train_batch.make_initializable_iterator()
    val_iter = val_batch.make_initializable_iterator()
    test_iter = test_batch.make_initializable_iterator()

    feature_train_batch, label_train_batch = train_iter.get_next()
    feature_val_batch, label_val_batch = val_iter.get_next()
    feature_test_batch, label_test_batch = test_iter.get_next()

    # Define the model and losses
    encoded_train_batch, total_entropy = tabnet.encoder(feature_train_batch,
                                                        is_training=True)

    if is_classification:
        logits_orig_batch, _ = tabnet.classify(encoded_train_batch)
        task_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits_orig_batch, labels=label_train_batch))
    else:
        predictions = tabnet.regress(encoded_train_batch)
        task_loss = tf.reduce_mean(
            tf.square(tf.subtract(predictions, label_train_batch)))

    train_loss_op = task_loss + lambda_sparsity * total_entropy
    tf.compat.v1.summary.scalar("Total loss", train_loss_op)

    # Optimization step
    global_step = tf.compat.v1.train.get_or_create_global_step()
    learning_rate = tf.compat.v1.train.exponential_decay(
        lr,
        global_step=global_step,
        decay_steps=decay_every,
        decay_rate=decay_rate)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    # Run the collected update ops together with every optimizer step.
    with tf.control_dependencies(update_ops):
        gvs = optimizer.compute_gradients(train_loss_op)
        capped_gvs = [(tf.clip_by_value(grad, -gradient_thresh,
                                        gradient_thresh), var)
                      for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs,
                                             global_step=global_step)

    # Model evaluation

    # Validation performance
    # NOTE(review): the evaluation encoders are built with is_training=True;
    # confirm that this is intended rather than is_training=False.
    encoded_val_batch, _ = tabnet.encoder(feature_val_batch, is_training=True)

    if is_classification:
        _, prediction_val = tabnet.classify(encoded_val_batch)

        predicted_labels = tf.cast(tf.argmax(prediction_val, 1),
                                   dtype=tf.int32)
        val_eq_op = tf.equal(predicted_labels, label_val_batch)
        val_acc_op = tf.reduce_mean(tf.cast(val_eq_op, dtype=tf.float32))
        tf.compat.v1.summary.scalar("Val accuracy", val_acc_op)
        # NOTE(review): `result` is not defined inside this function and must
        # come from module scope; confirm that this expression really yields
        # a fetchable validation metric.
        val_op = result(
            tabnet.compile('sgd', loss='mse',
                           metrics=[tf.keras.metrics.AUC()]))
    else:
        predictions = tabnet.regress(encoded_val_batch)

        # Compare validation predictions with *validation* labels.
        val_loss_op = tf.reduce_mean(
            tf.square(tf.subtract(predictions, label_val_batch)))

        val_op = val_loss_op
        tf.compat.v1.summary.scalar("Validation loss", val_loss_op)

    # Test performance
    encoded_test_batch, _ = tabnet.encoder(feature_test_batch,
                                           is_training=True)

    if is_classification:
        _, prediction_test = tabnet.classify(encoded_test_batch)

        predicted_labels = tf.cast(tf.argmax(prediction_test, 1),
                                   dtype=tf.int32)
        # NOTE(review): same caveat as for the validation metric above.
        test_op = result(
            tabnet.compile('sgd', loss='mse',
                           metrics=[tf.keras.metrics.AUC()]))
    else:
        predictions = tabnet.regress(encoded_test_batch)

        test_loss_op = tf.reduce_mean(
            tf.square(tf.subtract(predictions, label_test_batch)))

        tf.compat.v1.summary.scalar("Test loss", test_loss_op)
        test_op = test_loss_op

    # Training setup
    init = tf.compat.v1.global_variables_initializer()
    init_local = tf.compat.v1.local_variables_initializer()
    init_table = tf.compat.v1.tables_initializer(name="Initialize_all_tables")
    saver = tf.compat.v1.train.Saver()
    summaries = tf.compat.v1.summary.merge_all()

    with tf.compat.v1.Session() as sess:
        summary_writer = tf.compat.v1.summary.FileWriter(
            f'{tb_log_location}/{model_name}', sess.graph)

        try:
            sess.run(init)
            sess.run(init_local)
            sess.run(init_table)
            sess.run(train_iter.initializer)
            sess.run(val_iter.initializer)
            sess.run(test_iter.initializer)

            # The classification metric is maximised, the regression loss is
            # minimised; `best_score` is oriented so that larger is better.
            best_val_acc = -1
            best_score = float('-inf')
            best_val_step = 0

            for step in range(1, max_steps + 1):
                if step % display_step == 0:
                    _, train_loss, merged_summary = sess.run(
                        [train_op, train_loss_op, summaries])
                    summary_writer.add_summary(merged_summary, step)
                    print(f"Step {step} , Training Loss = {train_loss:.4f}")
                else:
                    sess.run(train_op)

                if step % val_step == 0:
                    merged_summary, val_acc, _ = sess.run(
                        [summaries, val_op, test_op])
                    score = val_acc if is_classification else -val_acc
                    if score > best_score:
                        best_score = score
                        best_val_acc = val_acc
                        best_val_step = step
                    # Stop when validation has not improved for too long.
                    if (step - best_val_step) > early_stop_steps:
                        break

                    print(f"Step {step} , Val Metric = {val_acc:.4f}")
                    summary_writer.add_summary(merged_summary, step)

                if step % save_step == 0:
                    saver.save(sess, "./checkpoints/" + model_name + ".ckpt")

            print(f'Best validation accuracy: {best_val_acc}')
        finally:
            # Flush pending events and release the log file handle.
            summary_writer.close()
Esempio n. 2
0
def train_and_evaluate(params,
                       batch_size,
                       virtual_batch_size,
                       max_steps,
                       lr,
                       decay_every,
                       target_name,
                       dataset_info,
                       train_df,
                       val_df):
  """Train a TabNet classifier for one hyper-parameter setting.

  The default graph is reset, a fresh TabNet classifier is built from
  `params`, and it is trained for at most `max_steps` steps with validation
  every 5 steps and early stopping.

  Args:
    params: Mapping with the keys 'n_a', 'n_steps', 'gamma',
      'batch_momentum' and 'lambda'. 'n_a' is used for both the feature
      and the output dimension; 'lambda' weights the entropy loss term.
    batch_size: Batch size of the training and validation pipelines.
    virtual_batch_size: Forwarded to `tabnet_model.TabNet`.
    max_steps: Maximum number of training steps.
    lr: Initial learning rate of the exponential decay schedule.
    decay_every: `decay_steps` of the learning-rate schedule.
    target_name: Name of the label column.
    dataset_info: Mapping providing 'feature_columns', 'num_features' and
      'num_classes'; it is also forwarded to `pandas_input_fn`.
    train_df: Training data passed to `pandas_input_fn`.
    val_df: Validation data passed to `pandas_input_fn`.

  Returns:
    The negated best validation accuracy (presumably so that a minimising
    optimiser can use it as an objective; confirm against the caller). If
    no validation step ran, the result is 1.
  """
  tf.compat.v1.reset_default_graph()
  print(params)

  # TabNet model
  tabnet = tabnet_model.TabNet(
      columns=dataset_info['feature_columns'],
      num_features=dataset_info['num_features'],
      feature_dim=int(params['n_a']),
      output_dim=int(params['n_a']),  # Same dims for feature and output
      num_decision_steps=int(params['n_steps']),
      relaxation_factor=params['gamma'],
      batch_momentum=params['batch_momentum'],
      virtual_batch_size=virtual_batch_size,
      num_classes=dataset_info['num_classes'])

  # Fixed training parameters
  val_step = 5
  decay_rate = 0.95
  gradient_thresh = 2000.0
  early_stop_steps = 25
  sparsity_loss_weight = params['lambda']

  # Input sampling
  train_batch = pandas_input_fn(
      train_df,
      target_name,
      dataset_info,
      num_epochs=100000,
      shuffle=True,
      batch_size=batch_size,
      n_buffer=1)
  val_batch = pandas_input_fn(
      val_df,
      target_name,
      dataset_info,
      num_epochs=10000,
      shuffle=False,
      batch_size=batch_size,
      n_buffer=1)

  train_iter = train_batch.make_initializable_iterator()
  val_iter = val_batch.make_initializable_iterator()

  feature_train_batch, label_train_batch = train_iter.get_next()
  feature_val_batch, label_val_batch = val_iter.get_next()

  # Define the model and losses
  encoded_train_batch, total_entropy = tabnet.encoder(
      feature_train_batch, is_training=True)
  logits_orig_batch, _ = tabnet.classify(encoded_train_batch)

  softmax_orig_key_op = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits_orig_batch, labels=label_train_batch))
  train_loss_op = softmax_orig_key_op + sparsity_loss_weight * total_entropy

  # Optimization step
  global_step = tf.compat.v1.train.get_or_create_global_step()
  learning_rate = tf.compat.v1.train.exponential_decay(
      lr,
      global_step=global_step,
      decay_steps=decay_every,
      decay_rate=decay_rate)
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
  update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
  # Run the collected update ops together with every optimizer step.
  with tf.control_dependencies(update_ops):
    gvs = optimizer.compute_gradients(train_loss_op)
    capped_gvs = [(tf.clip_by_value(grad, -gradient_thresh,
                                    gradient_thresh), var) for grad, var in gvs]
    train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

  # Validation performance
  # NOTE(review): the validation encoder is built with is_training=True;
  # confirm that this is intended rather than is_training=False.
  encoded_val_batch, _ = tabnet.encoder(
      feature_val_batch, is_training=True)
  _, prediction_val = tabnet.classify(encoded_val_batch)
  predicted_labels = tf.cast(tf.argmax(prediction_val, 1), dtype=tf.int32)
  val_eq_op = tf.equal(predicted_labels, label_val_batch)
  val_op = tf.reduce_mean(tf.cast(val_eq_op, dtype=tf.float32))

  # Training setup
  init = tf.compat.v1.global_variables_initializer()
  init_local = tf.compat.v1.local_variables_initializer()
  init_table = tf.compat.v1.tables_initializer(name="Initialize_all_tables")

  with tf.compat.v1.Session() as sess:
    sess.run(init)
    sess.run(init_local)
    sess.run(init_table)
    sess.run(train_iter.initializer)
    sess.run(val_iter.initializer)

    best_val_acc = -1
    # Initialised up front so the early-stopping check below can never hit
    # an unbound name (e.g. when the first accuracy is NaN).
    best_val_step = 0

    for step in range(1, max_steps + 1):
      # No summary writer exists in this function, so merged summaries are
      # not fetched; only the training op and the validation metric run.
      sess.run(train_op)

      if step % val_step != 0:
        continue

      val_acc = sess.run(val_op)
      if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_val_step = step
      # Stop when validation has not improved for too long.
      if (step - best_val_step) > early_stop_steps:
        break

    print(f'Best validation accuracy: {best_val_acc}')
    return -1 * best_val_acc
def main(unused_argv):
    """Train and evaluate TabNet on the covertype CSV splits.

    Builds the TabNet classification graph with hard-coded hyper-parameters,
    reads "data/train.csv", "data/val.csv" and "data/test.csv" through
    `data_helper_covertype.input_fn`, trains for `max_steps` steps and
    periodically logs summaries, prints validation accuracy and saves a
    checkpoint.

    Args:
        unused_argv: Ignored; present so the function matches the signature
            expected by an argv-passing entry point.
    """

    # Load training and eval data.
    train_file = "data/train.csv"
    val_file = "data/val.csv"
    test_file = "data/test.csv"

    # TabNet model
    tabnet_forest_covertype = tabnet_model.TabNet(
        columns=data_helper_covertype.get_columns(),
        num_features=data_helper_covertype.num_features,
        feature_dim=128,
        output_dim=64,
        num_decision_steps=6,
        relaxation_factor=1.5,
        batch_momentum=0.7,
        virtual_batch_size=512,
        num_classes=data_helper_covertype.num_classes)

    column_names = sorted(data_helper_covertype.feature_columns)
    print(
        "Ordered column names, corresponding to the indexing in Tensorboard visualization"
    )
    for fi, column_name in enumerate(column_names):
        print(str(fi) + " : " + column_name)

    # Training parameters
    max_steps = 10
    display_step = 5
    val_step = 5
    save_step = 5
    init_learning_rate = 0.02
    decay_every = 500
    decay_rate = 0.95
    batch_size = 512
    sparsity_loss_weight = 0.0001
    gradient_thresh = 2000.0

    # Input sampling: the training set is shuffled, evaluation sets are not.
    train_batch = data_helper_covertype.input_fn(train_file,
                                                 num_epochs=100000,
                                                 shuffle=True,
                                                 batch_size=batch_size,
                                                 n_buffer=1,
                                                 n_parallel=1)
    val_batch = data_helper_covertype.input_fn(val_file,
                                               num_epochs=10000,
                                               shuffle=False,
                                               batch_size=batch_size,
                                               n_buffer=1,
                                               n_parallel=1)
    test_batch = data_helper_covertype.input_fn(test_file,
                                                num_epochs=10000,
                                                shuffle=False,
                                                batch_size=batch_size,
                                                n_buffer=1,
                                                n_parallel=1)

    train_iter = train_batch.make_initializable_iterator()
    val_iter = val_batch.make_initializable_iterator()
    test_iter = test_batch.make_initializable_iterator()

    feature_train_batch, label_train_batch = train_iter.get_next()
    feature_val_batch, label_val_batch = val_iter.get_next()
    feature_test_batch, label_test_batch = test_iter.get_next()

    # Define the model and losses
    encoded_train_batch, total_entropy = tabnet_forest_covertype.encoder(
        feature_train_batch, reuse=False, is_training=True)

    logits_orig_batch, _ = tabnet_forest_covertype.classify(
        encoded_train_batch, reuse=False)

    softmax_orig_key_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_orig_batch, labels=label_train_batch))

    train_loss_op = softmax_orig_key_op + sparsity_loss_weight * total_entropy
    tf.summary.scalar("Total loss", train_loss_op)

    # Optimization step
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(init_learning_rate,
                                               global_step=global_step,
                                               decay_steps=decay_every,
                                               decay_rate=decay_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # Run the collected update ops together with every optimizer step.
    with tf.control_dependencies(update_ops):
        gvs = optimizer.compute_gradients(train_loss_op)
        capped_gvs = [(tf.clip_by_value(grad, -gradient_thresh,
                                        gradient_thresh), var)
                      for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs,
                                             global_step=global_step)

    # Model evaluation

    # Validation performance (variables are shared via reuse=True).
    # NOTE(review): the evaluation encoders are built with is_training=True;
    # confirm that this is intended rather than is_training=False.
    encoded_val_batch, _ = tabnet_forest_covertype.encoder(feature_val_batch,
                                                           reuse=True,
                                                           is_training=True)

    _, prediction_val = tabnet_forest_covertype.classify(encoded_val_batch,
                                                         reuse=True)

    predicted_labels = tf.cast(tf.argmax(prediction_val, 1), dtype=tf.int32)
    val_eq_op = tf.equal(predicted_labels, label_val_batch)
    val_acc_op = tf.reduce_mean(tf.cast(val_eq_op, dtype=tf.float32))
    tf.summary.scalar("Val accuracy", val_acc_op)

    # Test performance
    encoded_test_batch, _ = tabnet_forest_covertype.encoder(feature_test_batch,
                                                            reuse=True,
                                                            is_training=True)

    _, prediction_test = tabnet_forest_covertype.classify(encoded_test_batch,
                                                          reuse=True)

    predicted_labels = tf.cast(tf.argmax(prediction_test, 1), dtype=tf.int32)
    test_eq_op = tf.equal(predicted_labels, label_test_batch)
    test_acc_op = tf.reduce_mean(tf.cast(test_eq_op, dtype=tf.float32))
    tf.summary.scalar("Test accuracy", test_acc_op)

    # Training setup
    model_name = "tabnet_forest_covertype_model"
    # global_variables_initializer replaces the deprecated
    # initialize_all_variables.
    init = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    init_table = tf.tables_initializer(name="Initialize_all_tables")
    saver = tf.train.Saver()
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter("./tflog/" + model_name,
                                               sess.graph)

        try:
            sess.run(init)
            sess.run(init_local)
            sess.run(init_table)
            sess.run(train_iter.initializer)
            sess.run(val_iter.initializer)
            sess.run(test_iter.initializer)

            for step in range(1, max_steps + 1):
                if step % display_step == 0:
                    _, train_loss, merged_summary = sess.run(
                        [train_op, train_loss_op, summaries])
                    summary_writer.add_summary(merged_summary, step)
                    print("Step " + str(step) + " , Training Loss = " +
                          "{:.4f}".format(train_loss))
                else:
                    sess.run(train_op)

                if step % val_step == 0:
                    # The test accuracy is fetched alongside but only
                    # reaches the log through the merged summaries.
                    merged_summary, val_acc, _ = sess.run(
                        [summaries, val_acc_op, test_acc_op])

                    print("Step " + str(step) + " , Val Accuracy = " +
                          "{:.4f}".format(val_acc))
                    summary_writer.add_summary(merged_summary, step)

                if step % save_step == 0:
                    saver.save(sess, "./checkpoints/" + model_name + ".ckpt")
        finally:
            # Flush pending events and release the log file handle.
            summary_writer.close()
def main(unused_argv):
    """Train and evaluate a small TabNet on the covertype data.

    Seeds TensorFlow and NumPy with `SEED`, builds a TabNet classification
    graph, trains it for `MAX_STEPS` steps on `TRAIN_FILE`, and periodically
    logs summaries, prints validation accuracy and saves a checkpoint. All
    upper-case names are constants defined outside this function.

    Args:
        unused_argv: Ignored; present so the function matches the signature
            expected by an argv-passing entry point.
    """

    # Fix random seeds
    tf.set_random_seed(SEED)
    np.random.seed(SEED)

    # Define the TabNet model
    tabnet_forest_covertype = tabnet_model.TabNet(
        columns=data_helper_covertype.get_columns(),
        num_features=data_helper_covertype.NUM_FEATURES,
        feature_dim=4,
        output_dim=2,
        num_decision_steps=6,
        relaxation_factor=1.5,
        batch_momentum=0.7,
        virtual_batch_size=4,
        num_classes=data_helper_covertype.NUM_CLASSES)

    column_names = sorted(data_helper_covertype.FEATURE_COLUMNS)
    print(
        "Ordered column names, corresponding to the indexing in Tensorboard visualization"
    )
    for fi, column_name in enumerate(column_names):
        print(str(fi) + " : " + column_name)

    # Input sampling: each evaluation batch is sized to N_VAL_SAMPLES /
    # N_TEST_SAMPLES respectively.
    train_batch = data_helper_covertype.input_fn(TRAIN_FILE,
                                                 num_epochs=100000,
                                                 shuffle=True,
                                                 batch_size=BATCH_SIZE)
    val_batch = data_helper_covertype.input_fn(
        VAL_FILE,
        num_epochs=10000,
        shuffle=False,
        batch_size=data_helper_covertype.N_VAL_SAMPLES)
    test_batch = data_helper_covertype.input_fn(
        TEST_FILE,
        num_epochs=10000,
        shuffle=False,
        batch_size=data_helper_covertype.N_TEST_SAMPLES)

    train_iter = train_batch.make_initializable_iterator()
    val_iter = val_batch.make_initializable_iterator()
    test_iter = test_batch.make_initializable_iterator()

    feature_train_batch, label_train_batch = train_iter.get_next()
    feature_val_batch, label_val_batch = val_iter.get_next()
    feature_test_batch, label_test_batch = test_iter.get_next()

    # Define the model and losses
    encoded_train_batch, total_entropy = tabnet_forest_covertype.encoder(
        feature_train_batch, reuse=False, is_training=True)

    logits_orig_batch, _ = tabnet_forest_covertype.classify(
        encoded_train_batch, reuse=False)

    softmax_orig_key_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_orig_batch, labels=label_train_batch))

    train_loss_op = softmax_orig_key_op + SPARSITY_LOSS_WEIGHT * total_entropy
    tf.summary.scalar("Total loss", train_loss_op)

    # Optimization step
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.exponential_decay(INIT_LEARNING_RATE,
                                               global_step=global_step,
                                               decay_steps=DECAY_EVERY,
                                               decay_rate=DECAY_RATE)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    # Run the collected update ops together with every optimizer step.
    with tf.control_dependencies(update_ops):
        gvs = optimizer.compute_gradients(train_loss_op)
        capped_gvs = [(tf.clip_by_value(grad, -GRADIENT_THRESH,
                                        GRADIENT_THRESH), var)
                      for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs,
                                             global_step=global_step)

    # Model evaluation

    # Validation performance (variables shared via reuse=True, built with
    # is_training=False).
    encoded_val_batch, _ = tabnet_forest_covertype.encoder(feature_val_batch,
                                                           reuse=True,
                                                           is_training=False)

    _, prediction_val = tabnet_forest_covertype.classify(encoded_val_batch,
                                                         reuse=True)

    predicted_labels = tf.cast(tf.argmax(prediction_val, 1), dtype=tf.int32)
    val_eq_op = tf.equal(predicted_labels, label_val_batch)
    val_acc_op = tf.reduce_mean(tf.cast(val_eq_op, dtype=tf.float32))
    tf.summary.scalar("Val accuracy", val_acc_op)

    # Test performance
    encoded_test_batch, _ = tabnet_forest_covertype.encoder(feature_test_batch,
                                                            reuse=True,
                                                            is_training=False)

    _, prediction_test = tabnet_forest_covertype.classify(encoded_test_batch,
                                                          reuse=True)

    predicted_labels = tf.cast(tf.argmax(prediction_test, 1), dtype=tf.int32)
    test_eq_op = tf.equal(predicted_labels, label_test_batch)
    test_acc_op = tf.reduce_mean(tf.cast(test_eq_op, dtype=tf.float32))
    tf.summary.scalar("Test accuracy", test_acc_op)

    # Training setup
    model_name = "tabnet_forest_covertype_model"
    # global_variables_initializer replaces the deprecated
    # initialize_all_variables.
    init = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    init_table = tf.tables_initializer(name="Initialize_all_tables")
    saver = tf.train.Saver()
    summaries = tf.summary.merge_all()

    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter("./tflog/" + model_name,
                                               sess.graph)

        try:
            sess.run(init)
            sess.run(init_local)
            sess.run(init_table)
            sess.run(train_iter.initializer)
            sess.run(val_iter.initializer)
            sess.run(test_iter.initializer)

            for step in range(1, MAX_STEPS + 1):
                if step % DISPLAY_STEP == 0:
                    _, train_loss, merged_summary = sess.run(
                        [train_op, train_loss_op, summaries])
                    summary_writer.add_summary(merged_summary, step)
                    print("Step " + str(step) + " , Training Loss = " +
                          "{:.4f}".format(train_loss))
                else:
                    sess.run(train_op)

                if step % VAL_STEP == 0:
                    # The test accuracy is fetched alongside but only
                    # reaches the log through the merged summaries.
                    merged_summary, val_acc, _ = sess.run(
                        [summaries, val_acc_op, test_acc_op])

                    print("Step " + str(step) + " , Val Accuracy = " +
                          "{:.4f}".format(val_acc))
                    summary_writer.add_summary(merged_summary, step)

                if step % SAVE_STEP == 0:
                    saver.save(sess, "./checkpoints/" + model_name + ".ckpt")
        finally:
            # Flush pending events and release the log file handle.
            summary_writer.close()
Esempio n. 5
0
def main(csv_path, target_name, task='classification', model_name='tabnet', tb_log_location='./tflog',
         categorical_suffix=None, val_frac=0.25, test_frac=0.25,
         emb_size=1, feature_dim=128, output_dim=64,
         batch_size=512, virtual_batch_size=512, batch_momentum=0.7,
         gamma=1.5, n_steps=6, max_steps=25, lr=0.02, decay_every=500, lambda_sparsity=0.0001):
    """Train a TabNet model on a CSV file and evaluate it on a held-out test split.

    The data is split into train/validation/test subsets, which are written next
    to ``csv_path`` as ``train_set.csv``, ``val_set.csv`` and ``test_set.csv``.
    After training, test predictions and labels are written to ``test_pred.csv``
    and ``test_labels.csv`` in the same directory.

    Args:
        csv_path: Path of the CSV file holding features and target.
        target_name: Name of the target column.
        task: ``'classification'`` trains with softmax cross-entropy and reports
            accuracy; any other value is treated as regression (MSE loss).
        model_name: Used for the TensorBoard sub-directory and checkpoint file name.
        tb_log_location: Root directory for TensorBoard event files.
        categorical_suffix: Forwarded unchanged to ``prepare_dataset``
            (presumably marks categorical columns -- confirm against that helper).
        val_frac: Fraction of the *whole* data set used for validation.
        test_frac: Fraction of the whole data set used for testing.
        emb_size: Forwarded to ``prepare_dataset`` as ``embedding_dim``.
        feature_dim, output_dim, n_steps, gamma, batch_momentum,
        virtual_batch_size: TabNet hyper-parameters, forwarded to
            ``tabnet_model.TabNet``.
        batch_size: Batch size of all three input pipelines.
        max_steps: Maximum number of optimisation steps.
        lr: Initial learning rate of the exponential-decay schedule.
        decay_every: Number of steps between learning-rate decays.
        lambda_sparsity: Weight of the entropy (sparsity) term in the training loss.

    Note:
        Metrics are averaged over ``len(split) // batch_size`` batches (at least
        one), so a trailing partial batch of the split is not evaluated.
    """
    is_classification = task == 'classification'
    metric_name = 'accuracy' if is_classification else 'loss'

    def mean_squared_error(predicted, labels):
        # Mean squared error between predictions and labels.
        return tf.reduce_mean(tf.square(tf.subtract(predicted, labels)))

    all_data = pd.read_csv(csv_path)
    # Stratification needs discrete classes; a continuous regression target
    # would make train_test_split fail.
    stratify_by = all_data[target_name] if is_classification else None
    trainval_df, test_df = train_test_split(all_data, test_size=test_frac, stratify=stratify_by)
    # val_frac is relative to the full data set, so rescale it to the remainder.
    val_frac_after_test_split = val_frac / (1 - test_frac)
    train_df, val_df = train_test_split(trainval_df, test_size=val_frac_after_test_split)

    print('Datasets sizes:')
    print(f'Train: {train_df.shape}:')
    print(f'Val: {val_df.shape}:')
    print(f'Test: {test_df.shape}:')

    # save data sets
    data_dir = os.path.dirname(csv_path)
    for subset, label in zip((train_df, val_df, test_df), ('train', 'val', 'test')):
        subset.to_csv(os.path.join(data_dir, f'{label}_set.csv'), index=False)

    dataset_info = prepare_dataset(all_data, categorical_suffix, target_name, task, embedding_dim=emb_size)

    # TabNet model
    tabnet = tabnet_model.TabNet(
        columns=dataset_info['feature_columns'],
        num_features=dataset_info['num_features'],
        feature_dim=feature_dim,
        output_dim=output_dim,
        num_decision_steps=n_steps,
        relaxation_factor=gamma,
        batch_momentum=batch_momentum,
        virtual_batch_size=virtual_batch_size,
        num_classes=dataset_info['num_classes'])

    # Training parameters
    display_step = 5
    val_step = 100
    save_step = 100
    decay_rate = 0.95
    gradient_thresh = 2000.0
    early_stop_steps = 100

    # Input sampling
    train_batch = pandas_input_fn(
        train_df,
        target_name,
        dataset_info,
        num_epochs=100000,
        shuffle=True,
        batch_size=batch_size,
        n_buffer=1)
    val_batch = pandas_input_fn(
        val_df,
        target_name,
        dataset_info,
        num_epochs=10000,
        shuffle=False,
        batch_size=batch_size,
        n_buffer=1)
    test_batch = pandas_input_fn(
        test_df,
        target_name,
        dataset_info,
        num_epochs=10000,
        shuffle=False,
        batch_size=batch_size,
        n_buffer=1)

    train_iter = train_batch.make_initializable_iterator()
    val_iter = val_batch.make_initializable_iterator()
    test_iter = test_batch.make_initializable_iterator()

    feature_train_batch, label_train_batch = train_iter.get_next()
    feature_val_batch, label_val_batch = val_iter.get_next()
    feature_test_batch, label_test_batch = test_iter.get_next()

    # Define the model and losses
    # for training
    encoded_train_batch, total_entropy = tabnet.encoder(
        feature_train_batch, is_training=True)

    if is_classification:
        logits_train, _ = tabnet.classify(encoded_train_batch)
        task_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits_train, labels=label_train_batch))
    else:
        task_loss = mean_squared_error(
            tabnet.regress(encoded_train_batch), label_train_batch)

    train_loss_op = task_loss + lambda_sparsity * total_entropy
    tf.compat.v1.summary.scalar("Total loss", train_loss_op)

    # Optimization step
    global_step = tf.compat.v1.train.get_or_create_global_step()
    learning_rate = tf.compat.v1.train.exponential_decay(
        lr,
        global_step=global_step,
        decay_steps=decay_every,
        decay_rate=decay_rate)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        gvs = optimizer.compute_gradients(train_loss_op)
        capped_gvs = [(tf.clip_by_value(grad, -gradient_thresh,
                                        gradient_thresh), var) for grad, var in gvs]
        train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

    # Model evaluation
    # The evaluation graphs are built in inference mode (is_training=False).

    # Validation performance
    encoded_val_batch, _ = tabnet.encoder(
        feature_val_batch, is_training=False)

    if is_classification:
        _, predictions_val = tabnet.classify(encoded_val_batch)
        predicted_val_labels = tf.cast(tf.argmax(predictions_val, 1), dtype=tf.int32)
        val_op = tf.reduce_mean(
            tf.cast(tf.equal(predicted_val_labels, label_val_batch), dtype=tf.float32))
        tf.compat.v1.summary.scalar("Val accuracy", val_op)
    else:
        predictions_val = tabnet.regress(encoded_val_batch)
        # Compare against the validation labels (not the training labels).
        val_op = mean_squared_error(predictions_val, label_val_batch)
        tf.compat.v1.summary.scalar("Validation loss", val_op)

    # Test performance
    encoded_test_batch, _ = tabnet.encoder(
        feature_test_batch, is_training=False)

    if is_classification:
        _, predictions_test = tabnet.classify(encoded_test_batch)
        predicted_test_labels = tf.cast(tf.argmax(predictions_test, 1), dtype=tf.int32)
        # Real accuracy, so that the "Test accuracy" summary and the final
        # report show what their labels say.
        test_op = tf.reduce_mean(
            tf.cast(tf.equal(predicted_test_labels, label_test_batch), dtype=tf.float32))
        tf.compat.v1.summary.scalar("Test accuracy", test_op)
    else:
        predictions_test = tabnet.regress(encoded_test_batch)
        test_op = mean_squared_error(predictions_test, label_test_batch)
        tf.compat.v1.summary.scalar("Test loss", test_op)

    # Training setup
    init = tf.compat.v1.global_variables_initializer()
    init_local = tf.compat.v1.local_variables_initializer()
    init_table = tf.compat.v1.tables_initializer(name="Initialize_all_tables")
    saver = tf.compat.v1.train.Saver()
    summaries = tf.compat.v1.summary.merge_all()

    # Saver.save() does not create missing parent directories.
    os.makedirs("./checkpoints", exist_ok=True)
    checkpoint_path = "./checkpoints/" + model_name + ".ckpt"

    # Always evaluate at least one batch so the averages below are defined
    # even when a split is smaller than batch_size.
    n_val_batches = max(1, len(val_df) // batch_size)
    n_test_batches = max(1, len(test_df) // batch_size)

    with tf.compat.v1.Session() as sess:
        summary_writer = tf.compat.v1.summary.FileWriter(f'{tb_log_location}/{model_name}', sess.graph)

        sess.run(init)
        sess.run(init_local)
        sess.run(init_table)
        sess.run(train_iter.initializer)
        sess.run(val_iter.initializer)
        sess.run(test_iter.initializer)

        # Accuracy is maximised, loss is minimised.
        best_val_metric = float('-inf') if is_classification else float('inf')
        best_val_step = 0

        for step in range(1, max_steps + 1):
            if step % display_step == 0:
                _, train_loss, merged_summary = sess.run(
                    [train_op, train_loss_op, summaries])
                summary_writer.add_summary(merged_summary, step)
                print(f"Step {step} , Training Loss = {train_loss:.4f}")
            else:
                sess.run(train_op)

            if step % val_step == 0:
                merged_summary = sess.run(summaries)

                # Evaluating the summaries consumes validation batches, so
                # rewind before averaging the metric over the validation set.
                sess.run(val_iter.initializer)
                val_metric = sum(
                    sess.run(val_op) for _ in range(n_val_batches)) / n_val_batches

                if is_classification:
                    improved = val_metric > best_val_metric
                else:
                    improved = val_metric < best_val_metric
                if improved:
                    best_val_metric = val_metric
                    best_val_step = step
                if (step - best_val_step) > early_stop_steps:
                    break

                print(f"Step {step} , Val Metric = {val_metric:.4f}")
                summary_writer.add_summary(merged_summary, step)

            if step % save_step == 0:
                saver.save(sess, checkpoint_path)

        summary_writer.close()

        # validate on test and dump results
        print('Validating on test set ...')
        # Rewind so the dumped rows start at the first test sample; summary
        # evaluations during training advanced this iterator.
        sess.run(test_iter.initializer)
        predictions_test_list = []
        labels_test_list = []
        test_metric_list = []
        for _ in range(n_test_batches):
            # Fetch all three in one run so they refer to the same batch.
            pr_test, label_test, test_metric = sess.run(
                [predictions_test, label_test_batch, test_op])
            predictions_test_list.append(pr_test)
            labels_test_list.append(label_test)
            test_metric_list.append(test_metric)

        predictions_test_concat = np.concatenate(predictions_test_list)
        labels_test_concat = np.concatenate(labels_test_list)
        avg_metric = sum(test_metric_list) / len(test_metric_list)
        n_skipped = max(len(test_df) - predictions_test_concat.shape[0], 0)
        print(f'Test predictions size: {predictions_test_concat.shape}')
        print(f'Test set size: {len(test_df)}')
        print(f'Did not preform inference on last {n_skipped} samples')
        print(f'Avg. {metric_name} on test set: {round(avg_metric, 5)}')
        print('Saving test predictions ...')
        pd.DataFrame(predictions_test_concat).to_csv(os.path.join(data_dir, 'test_pred.csv'), index=False)
        pd.DataFrame(labels_test_concat).to_csv(os.path.join(data_dir, 'test_labels.csv'), index=False)
        print(f'Best validation {metric_name}: {best_val_metric}')
Esempio n. 6
0
def main(unused_argv):
    """Train TabNet on the Forest Covertype data and log validation/test accuracy.

    Builds a training graph plus validation and test graphs that reuse the
    training variables, then runs ``MAX_STEPS`` optimisation steps. Summaries go
    to ``./tflog/<model_name>`` and checkpoints to ``./checkpoints``.

    Args:
        unused_argv: Ignored (presumably present so the function can be handed
            to an app runner -- confirm against the caller).
    """
    # Local import: only needed to make sure the checkpoint directory exists.
    import os

    # Fix random seeds
    tf.set_random_seed(SEED)
    np.random.seed(SEED)

    # Define the TabNet model
    tabnet_forest_covertype = tabnet_model.TabNet(
        columns=data_helper_covertype.get_columns(),
        num_features=data_helper_covertype.NUM_FEATURES,
        feature_dim=4,
        output_dim=2,
        num_decision_steps=6,
        relaxation_factor=1.5,
        batch_momentum=0.7,
        virtual_batch_size=4,
        num_classes=data_helper_covertype.NUM_CLASSES)

    column_names = sorted(data_helper_covertype.FEATURE_COLUMNS)
    print(
        "Ordered column names, corresponding to the indexing in Tensorboard visualization"
    )
    for fi, column_name in enumerate(column_names):
        print(str(fi) + " : " + column_name)

    # Input sampling: separate pipelines for training, validation and testing.
    train_batch = data_helper_covertype.input_fn(TRAIN_FILE,
                                                 num_epochs=100000,
                                                 shuffle=True,
                                                 batch_size=BATCH_SIZE)
    val_batch = data_helper_covertype.input_fn(
        VAL_FILE,
        num_epochs=10000,
        shuffle=False,
        batch_size=data_helper_covertype.N_VAL_SAMPLES)
    test_batch = data_helper_covertype.input_fn(
        TEST_FILE,
        num_epochs=10000,
        shuffle=False,
        batch_size=data_helper_covertype.N_TEST_SAMPLES)

    # Initializable iterators; their initializers are run inside the session.
    train_iter = train_batch.make_initializable_iterator()
    val_iter = val_batch.make_initializable_iterator()
    test_iter = test_batch.make_initializable_iterator()
    # Symbolic (features, label) tensors yielding the next batch of each split.
    feature_train_batch, label_train_batch = train_iter.get_next()
    feature_val_batch, label_val_batch = val_iter.get_next()
    feature_test_batch, label_test_batch = test_iter.get_next()

    # Define the model and losses
    # Forward pass of the training batch through the encoder (creates variables).
    encoded_train_batch, total_entropy = tabnet_forest_covertype.encoder(
        feature_train_batch, reuse=False, is_training=True)
    # Classification head on top of the encoded training batch.
    logits_orig_batch, _ = tabnet_forest_covertype.classify(
        encoded_train_batch, reuse=False)
    # Sparse softmax cross-entropy between logits and integer labels,
    # averaged over the batch.
    softmax_orig_key_op = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_orig_batch, labels=label_train_batch))
    # Training loss: classification loss plus the weighted entropy term.
    train_loss_op = softmax_orig_key_op + SPARSITY_LOSS_WEIGHT * total_entropy
    # Log the total loss to TensorBoard.
    tf.summary.scalar("Total loss", train_loss_op)

    # =======================Optimization step
    # Returns (and creates, if necessary) the global step tensor.
    global_step = tf.train.get_or_create_global_step()
    # Exponentially decay the learning rate as training progresses.
    learning_rate = tf.train.exponential_decay(INIT_LEARNING_RATE,
                                               global_step=global_step,
                                               decay_steps=DECAY_EVERY,
                                               decay_rate=DECAY_RATE)

    # Adam optimiser driven by the decayed learning rate.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # Ops registered in the UPDATE_OPS collection (presumably the batch-norm
    # statistics updates -- confirm in tabnet_model) must run with every step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        # Compute the gradients.
        gvs = optimizer.compute_gradients(train_loss_op)
        # Clip each gradient element to [-GRADIENT_THRESH, GRADIENT_THRESH].
        capped_gvs = [(tf.clip_by_value(grad, -GRADIENT_THRESH,
                                        GRADIENT_THRESH), var)
                      for grad, var in gvs]
        # Apply the clipped gradients.
        train_op = optimizer.apply_gradients(capped_gvs,
                                             global_step=global_step)

    # ===================Model evaluation

    # ===================Validation performance
    # Encode the validation batch, reusing the trained variables in inference mode.
    encoded_val_batch, _ = tabnet_forest_covertype.encoder(feature_val_batch,
                                                           reuse=True,
                                                           is_training=False)
    # Classify the encoded validation batch.
    _, prediction_val = tabnet_forest_covertype.classify(encoded_val_batch,
                                                         reuse=True)

    # Predicted class = index of the largest output, cast to int32.
    predicted_labels = tf.cast(tf.argmax(prediction_val, 1), dtype=tf.int32)
    # Element-wise comparison with the true labels.
    val_eq_op = tf.equal(predicted_labels, label_val_batch)
    # Accuracy = mean of the correctness indicators.
    val_acc_op = tf.reduce_mean(tf.cast(val_eq_op, dtype=tf.float32))
    # Log validation accuracy to TensorBoard.
    tf.summary.scalar("Val accuracy", val_acc_op)

    # ===================Test performance
    # Encode the test batch, also reusing the trained variables in inference mode.
    encoded_test_batch, _ = tabnet_forest_covertype.encoder(feature_test_batch,
                                                            reuse=True,
                                                            is_training=False)
    # Classify the encoded test batch.
    _, prediction_test = tabnet_forest_covertype.classify(encoded_test_batch,
                                                          reuse=True)

    # Predicted class = index of the largest output, cast to int32.
    predicted_labels = tf.cast(tf.argmax(prediction_test, 1), dtype=tf.int32)
    # Element-wise comparison with the true labels.
    test_eq_op = tf.equal(predicted_labels, label_test_batch)
    # Accuracy = mean of the correctness indicators.
    test_acc_op = tf.reduce_mean(tf.cast(test_eq_op, dtype=tf.float32))
    # Log test accuracy to TensorBoard.
    tf.summary.scalar("Test accuracy", test_acc_op)

    # ===================Training setup
    # Model name, used for the log directory and the checkpoint file.
    model_name = "tabnet_forest_covertype_model"
    # Initialisers for global and local variables.
    # (global_variables_initializer replaces the deprecated initialize_all_variables.)
    init = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()
    # Op that initialises all tables of the default graph.
    init_table = tf.tables_initializer(name="Initialize_all_tables")
    # Saver for all variables.
    saver = tf.train.Saver()
    # Merge every summary collected in the default graph.
    summaries = tf.summary.merge_all()

    # Saver.save() does not create missing parent directories.
    os.makedirs("./checkpoints", exist_ok=True)

    with tf.Session() as sess:
        # Write event files (including the graph) to the log directory.
        summary_writer = tf.summary.FileWriter("./tflog/" + model_name,
                                               sess.graph)

        # Initialise variables and tables, then the three input iterators.
        sess.run(init)
        sess.run(init_local)
        sess.run(init_table)
        sess.run(train_iter.initializer)
        sess.run(val_iter.initializer)
        sess.run(test_iter.initializer)

        # Training loop.
        for step in range(1, MAX_STEPS + 1):
            if step % DISPLAY_STEP == 0:
                # Train and, on display steps, also record loss and summaries.
                _, train_loss, merged_summary = sess.run(
                    [train_op, train_loss_op, summaries])
                summary_writer.add_summary(merged_summary, step)
                print("Step " + str(step) + " , Training Loss = " +
                      "{:.4f}".format(train_loss))
            else:
                # Plain optimisation step.
                _ = sess.run(train_op)

            if step % VAL_STEP == 0:
                # Evaluate summaries together with validation and test accuracy.
                merged_summary, val_acc, _ = sess.run(
                    [summaries, val_acc_op, test_acc_op])

                print("Step " + str(step) + " , Val Accuracy = " +
                      "{:.4f}".format(val_acc))
                summary_writer.add_summary(merged_summary, step)

            # Periodically save a checkpoint.
            if step % SAVE_STEP == 0:
                saver.save(sess, "./checkpoints/" + model_name + ".ckpt")