Python TensorBoardDebugWrapperSessionの例

プログラミング言語: Python

名前空間/パッケージ名: tensorflow.python.debug

メソッド/関数: TensorBoardDebugWrapperSession

hotexamples.comのコード掲載数: 21

Python TensorBoardDebugWrapperSession - 21件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのtensorflow.python.debug.TensorBoardDebugWrapperSessionの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

ファイル: lstm_tf_model.py プロジェクト: rohanbaisantry/Various_LSTM_programs

def main():
    """Interactively train a stacked LSTM regressor and evaluate it.

    Reads the sequence length and the batch size from stdin, loads
    ``train_data.csv`` / ``test_data.csv`` through ``load_data``, builds the
    cell returned by ``create_model``, trains it with Adam while attached to
    a TensorBoard debugger, prints rescaled predictions for the test set and
    finally writes the graph definition to ``hellotensor.pbtxt``.

    Side effects: rebinds the module-level ``sequence_length``;
    ``batch_sizes`` is presumably populated by ``calculate_batch_sizes``
    (defined elsewhere in the file -- confirm there).
    """
    global sequence_length, batch_sizes

    sequence_length = int(input("\n\n enter a sequence length: "))

    x_train, y_train, x_test, y_test, max_test, min_test, max_train, min_train = load_data(
        "train_data.csv", "test_data.csv")
    n_train = len(x_train)

    # Shuffle inputs and targets together so the pairs stay aligned.
    temp_train = list(zip(x_train, y_train))
    shuffle(temp_train)
    x_train, y_train = zip(*temp_train)
    temp_test = list(zip(x_test, y_test))
    shuffle(temp_test)
    x_test, y_test = zip(*temp_test)

    x_train, y_train, x_test, y_test = np.array(x_train), np.array(
        y_train), np.array(x_test), np.array(y_test)
    # x = ( n*sequence_length ) || y = ( n*1 )
    print("\n shape of the input: ", x_train.shape)
    print("\n shape of the output: ", y_train.shape)

    calculate_batch_sizes(n_train)
    # Keep asking until the user picks one of the offered batch sizes.
    batch_size = 1
    while batch_size not in batch_sizes:
        print("\n Choose one of the following batch sizes to be used \n",
              batch_sizes)
        batch_size = int(input("\n enter a batch size: "))

    tf.reset_default_graph()

    n_batches = int(len(y_train) / batch_size)
    print("\n number of batches: ", n_batches, "\n")
    # The whole training set is fed at once, pre-grouped into batches.
    x = tf.placeholder(tf.float32, [None, batch_size, sequence_length])
    x_batches = np.reshape(x_train, [n_batches, batch_size, sequence_length])
    y = tf.placeholder(tf.float32, [None, batch_size, 1])
    y_batches = np.reshape(y_train, [n_batches, batch_size, 1])

    params = dict()
    params["n_layers"] = 3
    params["neurons_layer1"] = sequence_length
    params["neurons_layer2"] = sequence_length * 3
    params["neurons_layer3"] = 1
    params["learning_rate"] = 0.01
    params["n_epochs"] = 4500
    params["optimizer"] = "AdamOptimizer"

    print("\n Parameters of the network are:\n")
    for key in params:
        print("\t", key, ": ", params[key])
    print("\n\n")

    my_lstm_cell = create_model(params)
    print("\n x's shape: ", x.shape)
    print("\n y's shape: ", y.shape, "\n")
    rnn_output, state = tf.nn.dynamic_rnn(cell=my_lstm_cell,
                                          inputs=x,
                                          dtype=tf.float32)
    print("\n shape of Network's output: ", rnn_output.shape, "\n")
    # Earlier experiment, kept for reference only:
    #   stacked_rnn_output = tf.reshape(rnn_output, [-1, 1])
    #   stacked_outputs = tf.layers.dense(stacked_rnn_output, 1)
    #   outputs = tf.reshape(stacked_outputs, [n_train, 1, 1])
    #   outputs = tf.reshape(rnn_output, [n_train, 1, 1])

    loss = tf.reduce_sum(tf.square(rnn_output - y))
    optimizer = tf.train.AdamOptimizer(learning_rate=params["learning_rate"])
    training_op = optimizer.minimize(loss)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        #sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        sess = tf_debug.TensorBoardDebugWrapperSession(sess, "rohanasus:6006")
        init.run()

        # Losses sampled every 100 epochs. Epoch 0 is printed but left out
        # of the average, as in the original accumulation.
        logged_losses = []
        for ep in range(params["n_epochs"] + 1):
            sess.run(training_op, feed_dict={x: x_batches, y: y_batches})
            if ep % 100 == 0:
                mse = loss.eval(feed_dict={x: x_batches, y: y_batches})
                if ep != 0:
                    print("Epoch: ", ep, "\tLoss(MSE): ", mse)
                    logged_losses.append(float(mse))
                else:
                    print("Epoch: ", ep, "  \tLoss(MSE): ", mse)
        # Report a real mean: the sum used to be printed under this label,
        # and the divisor wrongly counted the excluded epoch 0.
        if logged_losses:
            avg_loss = sum(logged_losses) / len(logged_losses)
        else:
            avg_loss = float(0)
        print("\n Average loss while training: ", avg_loss)

        # Drop the trailing rows that do not fill a whole batch. The guard
        # matters: with remove == 0, ``x_test[:-0]`` is ``x_test[:0]`` and
        # would silently discard the entire test set.
        remove = x_test.shape[0] % batch_size
        if remove:
            x_test = x_test[:-remove]
            y_test = y_test[:-remove]
        print("\n x_test's shape: ", x_test.shape, "\n")
        print("\nPredicted value \tActal value\n")
        y_pred = sess.run(
            rnn_output,
            feed_dict={x: x_test.reshape(-1, batch_size, sequence_length)})
        temp_pred = np.reshape(np.array(y_pred), [-1, 1])

        # Rescale predictions and targets with the test-set min/max before
        # printing and before accumulating the absolute error.
        scale = max_test - min_test
        avg_error = float(0)
        for pred_row, actual in zip(temp_pred, y_test):
            print("Predicted: ",
                  int(pred_row[0] * scale + min_test),
                  "\tActual: ",
                  int(actual * scale + min_test))
            avg_error = avg_error + abs(pred_row[0] - actual) * scale
        avg_error = float(avg_error / len(y_test))
        accuracy = float(100 - avg_error)
        print("\n Average error while Testing: ", avg_error)
        print("\n Accuracy while testing: ", accuracy)
        tf.train.write_graph(sess.graph_def, '.', 'hellotensor.pbtxt')

コード例 #2

ファイルを表示

ファイル: train_DAE.py プロジェクト: human2b/speech-driven-hand-gesture-generation-demo

def learning(data, data_info, just_restore=False):
    """ Training of the network

    Builds a DAE in a fresh graph, optionally pretrains it layer by layer,
    then finetunes the whole network while writing TensorBoard summaries and
    checkpoints to ``<chkpt_dir>/chkpt-final``.

    Args:
        data:           dataset to train on
        data_info :     meta information about this dataset (such as variance, mean pose, etc.)
                        it is an object from the class DataInfo (defined at the top of this file)
        just_restore:   whether we are going to only restore the model from the checkpoint
                        or are we going to train it as well

    Returns:
        nn:             Neural Network trained on a data provided
    """

    # Named ``run_tests`` rather than ``test``: a local called ``test`` would
    # shadow the ``test(...)`` callable used in the training loop below, so
    # enabling the flag would try to call a bool.
    run_tests = False
    debug = False

    with tf.Graph().as_default():

        tf.set_random_seed(fl.FLAGS.seed)

        start_time = time.time()

        # Read the flags
        variance = fl.FLAGS.variance_of_noise
        num_hidden = fl.FLAGS.num_hidden_layers
        dropout = fl.FLAGS.dropout
        learning_rate = fl.FLAGS.learning_rate
        batch_size = fl.FLAGS.batch_size

        hidden_shapes = [fl.FLAGS.layer1_width for j in range(num_hidden)]

        # Check if the flags make sense
        if dropout < 0 or variance < 0:
            print('ERROR! Have got negative values in the flags!')
            exit(1)

        # Allow TensorFlow to change device allocation when needed
        config = tf.ConfigProto(
            allow_soft_placement=True)  # log_device_placement=True)
        # Adjust configuration so that multiple executions are possible
        config.gpu_options.allow_growth = True

        # Start a session
        sess = tf.Session(config=config)

        if debug:
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, "taras-All-Series:6064")

        # Create a neural network: input and output layers have the same
        # width (frame_size * chunk_length) around the hidden layers.
        shape = [
            fl.FLAGS.frame_size * fl.FLAGS.chunk_length
        ] + hidden_shapes + [fl.FLAGS.frame_size * fl.FLAGS.chunk_length]
        nn = DAE(shape, sess, variance, data_info)
        print('\nDAE with the following shape was created : ', shape)

        # Initialize input_producer
        sess.run(tf.local_variables_initializer())

        max_val = nn.max_val

        with tf.variable_scope("Train"):

            ##############        DEFINE  Optimizer and training OPERATOR      ############

            # Define the optimizer
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)

            # Do gradient clipping
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(nn._loss, tvars),
                                              1e12)
            train_op = optimizer.apply_gradients(
                zip(grads, tvars),
                global_step=tf.train.get_or_create_global_step())

            # Prepare for making a summary for TensorBoard. The errors are
            # computed in Python and fed back through these placeholders.
            train_error = tf.placeholder(dtype=tf.float32,
                                         shape=(),
                                         name='train_error')
            eval_error = tf.placeholder(dtype=tf.float32,
                                        shape=(),
                                        name='eval_error')

            train_summary_op = tf.summary.scalar('Train_error', train_error)
            eval_summary_op = tf.summary.scalar('Validation_error', eval_error)

            summary_dir = fl.FLAGS.summary_dir
            summary_writer = tf.summary.FileWriter(
                summary_dir, graph=tf.get_default_graph())

            num_batches = int(data.train.num_sequences / batch_size)

            # Initialize the part of the graph with the input data
            sess.run(
                nn._train_data.initializer,
                feed_dict={nn._train_data_initializer: data.train.sequences})
            sess.run(
                nn._valid_data.initializer,
                feed_dict={nn._valid_data_initializer: data.test.sequences})

            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            if fl.FLAGS.pretrain:
                layers_amount = len(nn.shape) - 2

                # create an optimizers
                pretrain_optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate)

                # Make an array of the trainers for all the layers
                trainers = [
                    pretrain_optimizer.minimize(
                        ut.loss_reconstruction(
                            nn.run_less_layers(nn._input_, i + 1),
                            nn.run_less_layers(nn._input_,
                                               i + 1,
                                               is_target=True),
                            max_val,
                            pretrain=True),
                        global_step=tf.train.get_or_create_global_step(),
                        name='Layer_wise_optimizer_' + str(i))
                    for i in range(layers_amount)
                ]

                # Initialize all the variables
                sess.run(tf.global_variables_initializer())

            else:
                print("Initializing variables ...\n")
                sess.run(tf.global_variables_initializer())

            # Create a saver
            saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
            chkpt_file = fl.FLAGS.chkpt_dir + '/chkpt-final'

            # restore model, if needed
            if fl.FLAGS.restore:
                saver.restore(sess, chkpt_file)
                print("Model restored from the file " + str(chkpt_file) + '.')

            if just_restore:
                coord.request_stop()
                return nn

            # A few initialization for the early stopping
            delta = fl.FLAGS.delta_for_early_stopping  # error tolerance for early stopping
            best_error = 10000
            num_valid_batches = int(data.test.num_sequences / batch_size)

            # Bound before the ``try`` so the OutOfRangeError handler can
            # always read them, even if the error is raised during
            # pretraining or before any checkpoint has been written.
            step = 0
            save_path = None

            try:  # running enqueue threads.

                # Pretrain
                if fl.FLAGS.pretrain:
                    layerwise_pretrain(nn, trainers, layers_amount,
                                       num_batches)

                # Train the whole network jointly
                print('\nFinetune the whole network on ', num_batches,
                      ' batches with ', batch_size,
                      ' training examples in each for',
                      fl.FLAGS.training_epochs, ' epochs...')
                print("")
                print(" ______________ ______")
                print("|     Epoch    | RMSE |")
                print("|------------  |------|")

                while not coord.should_stop():
                    _, train_error_ = sess.run(
                        [train_op, nn._reconstruction_loss], feed_dict={})

                    # Once per epoch: log, validate, maybe checkpoint.
                    if step % num_batches == 0:
                        epoch = step * 1.0 / num_batches

                        train_summary = sess.run(
                            train_summary_op,
                            feed_dict={train_error: np.sqrt(train_error_)})

                        # Print results on screen
                        epoch_str = "| {0:3.0f} ".format(epoch)[:5]
                        perc_str = "({0:3.2f}".format(
                            epoch * 100.0 / fl.FLAGS.training_epochs)[:5]
                        error_str = "%) |{0:5.2f}".format(
                            train_error_)[:10] + "|"
                        print(epoch_str, perc_str, error_str)

                        if epoch % 5 == 0 and run_tests:

                            rmse = test(nn,
                                        fl.FLAGS.data_dir + '/test_1.binary')
                            print(
                                "\nOur RMSE for the first test sequence is : ",
                                rmse)

                            rmse = test(nn,
                                        fl.FLAGS.data_dir + '/test_2.binary')
                            print(
                                "\nOur RMSE for the second test sequenceis : ",
                                rmse)

                        if epoch > 0:
                            summary_writer.add_summary(train_summary, step)

                            # Evaluate on the validation sequences
                            error_sum = 0
                            for valid_batch in range(num_valid_batches):
                                curr_err = sess.run([nn._valid_loss],
                                                    feed_dict={})
                                error_sum += curr_err[0]
                            new_error = error_sum / (num_valid_batches)
                            eval_sum = sess.run(
                                eval_summary_op,
                                feed_dict={eval_error: np.sqrt(new_error)})
                            summary_writer.add_summary(eval_sum, step)

                            # Early stopping: stop once the validation error
                            # exceeds the best one by more than ``delta``
                            # (relative); checkpoint on every improvement.
                            if fl.FLAGS.early_stopping:
                                if (new_error -
                                        best_error) / best_error > delta:
                                    print('After ' + str(step) +
                                          ' steps started overfitting')
                                    break
                                if new_error < best_error:
                                    best_error = new_error

                                    # Saver for the model
                                    save_path = saver.save(sess, chkpt_file)

                            if epoch % 5 == 0:
                                # Save for the model
                                save_path = saver.save(sess, chkpt_file)
                                print('Done training for %d epochs' % (epoch))
                                print("The model was saved in file: %s" %
                                      save_path)

                    step += 1

            except tf.errors.OutOfRangeError:
                # The input queue is exhausted.
                if not fl.FLAGS.early_stopping:
                    # Save the model
                    save_path = saver.save(sess, chkpt_file)
                print('Done training for %d epochs, %d steps.' %
                      (fl.FLAGS.training_epochs, step))
                if save_path is not None:
                    print("The final model was saved in file: %s" % save_path)
                else:
                    print("No checkpoint was saved during this run.")
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish.
            coord.join(threads)

        duration = (time.time() -
                    start_time) / 60  # in minutes, instead of seconds

        print("The training was running for %.3f  min" % (duration))

        return nn

コード例 #3

ファイルを表示

def main(_):
    """Build and train a small MNIST classifier, optionally under a debugger.

    The graph deliberately uses a numerically unstable cross-entropy (see the
    comment in the "cross_entropy" scope) so that the inf/nan values can be
    observed with a debug session. ``FLAGS.debug`` wraps the session in the
    local CLI debugger; ``FLAGS.tensorboard_debug_address`` wraps it in the
    TensorBoard debugger instead.

    Raises:
        ValueError: if both --debug and --tensorboard_debug_address are set.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      one_hot=True,
                                      fake_data=FLAGS.fake_data)

    def feed_dict(train):
        """Return a feed for the ``x`` / ``y_`` placeholders.

        A training mini-batch when ``train`` is true or fake data is in use;
        otherwise the full test images and labels.
        """
        if train or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(FLAGS.train_batch_size,
                                            fake_data=FLAGS.fake_data)
        else:
            xs, ys = mnist.test.images, mnist.test.labels

        # x and y_ are bound later in main(); the closure resolves them when
        # feed_dict() is actually called, after the placeholders exist.
        return {x: xs, y_: ys}

    sess = tf.InteractiveSession()

    # Create the MNIST neural network graph.

    # Input placeholders.
    with tf.name_scope("input"):
        x = tf.placeholder(tf.float32, [None, IMAGE_SIZE * IMAGE_SIZE],
                           name="x-input")
        y_ = tf.placeholder(tf.float32, [None, NUM_LABELS], name="y-input")

    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1, seed=RAND_SEED)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def nn_layer(input_tensor,
                 input_dim,
                 output_dim,
                 layer_name,
                 act=tf.nn.relu):
        """Reusable code for making a simple neural net layer."""
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.name_scope("weights"):
                weights = weight_variable([input_dim, output_dim])
            with tf.name_scope("biases"):
                biases = bias_variable([output_dim])
            with tf.name_scope("Wx_plus_b"):
                preactivate = tf.matmul(input_tensor, weights) + biases

            activations = act(preactivate)
            return activations

    # One ReLU hidden layer, then a linear (identity-activation) output layer.
    hidden = nn_layer(x, IMAGE_SIZE**2, HIDDEN_SIZE, "hidden")
    logits = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "output", tf.identity)
    y = tf.nn.softmax(logits)

    with tf.name_scope("cross_entropy"):
        # The following line is the culprit of the bad numerical values that appear
        # during training of this graph. Log of zero gives inf, which is first seen
        # in the intermediate tensor "cross_entropy/Log:0" during the 4th run()
        # call. A multiplication of the inf values with zeros leads to nans,
        # which is first in "cross_entropy/mul:0".
        #
        # You can use the built-in, numerically-stable implementation to fix this
        # issue:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)

        diff = -(y_ * tf.log(y))
        with tf.name_scope("total"):
            cross_entropy = tf.reduce_mean(diff)

    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope("accuracy"):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess.run(tf.global_variables_initializer())

    # Only one debugger front end can wrap the session.
    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")

    if FLAGS.debug:
        sess = tf_debug.LocalCLIDebugWrapperSession(sess,
                                                    ui_type=FLAGS.ui_type)
    elif FLAGS.tensorboard_debug_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, FLAGS.tensorboard_debug_address)

    # At this point, sess is a debug wrapper around the actual Session if
    # FLAGS.debug or FLAGS.tensorboard_debug_address is set. With FLAGS.debug,
    # calling run() will launch the CLI.
    for i in range(FLAGS.max_steps):
        # Report accuracy before taking the training step for this iteration.
        acc = sess.run(accuracy, feed_dict=feed_dict(False))
        print("Accuracy at step %d: %s" % (i, acc))

        sess.run(train_step, feed_dict=feed_dict(True))

コード例 #4

ファイルを表示

ファイル: generate.py プロジェクト: zhang-jian/parallel-wavenet-vocoder

    # NOTE(review): excerpt from the body of a function in generate.py; the
    # enclosing ``def`` and the definitions of ``iterator``, ``model``, ``hp``,
    # ``debug`` and ``ckpt`` are not part of this snippet.
    gt_wav_op, melspec = iterator.get_next()

    # feed forward
    pred_wav_op = model(gt_wav_op, melspec, is_training=False)

    # summaries
    tf.summary.audio('audio/pred', pred_wav_op, hp.signal.sr)
    tf.summary.audio('audio/gt', gt_wav_op, hp.signal.sr)
    # tf.summary.histogram('hist/wav', gt_wav)
    # tf.summary.histogram('hist/out', pred_wav)
    summ_op = tf.summary.merge_all()

    session_config = tf.ConfigProto(device_count={'CPU': 1, 'GPU': 1}, )
    with tf.Session(config=session_config) as sess:
        if debug:  # session supporting tensorboard debugging.
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, 'localhost:{}'.format(hp.debug_port))

        # load model
        # Use the named checkpoint under hp.logdir when ``ckpt`` is given,
        # otherwise fall back to the latest checkpoint in that directory.
        ckpt = '{}/{}'.format(
            hp.logdir, ckpt) if ckpt else tf.train.latest_checkpoint(hp.logdir)
        sess.run(tf.global_variables_initializer())
        if ckpt:
            var_list = None
            if hp.train.use_ema:
                # Map each EMA (moving-average) name to its trainable variable
                # in the 'iaf_vocoder' scope, so the saver restores the value
                # stored under the averaged name into the live variable.
                var_list = {}
                for v in tf.trainable_variables('iaf_vocoder'):
                    var_list[model.ema.average_name(v)] = v
            tf.train.Saver(var_list=var_list).restore(sess, ckpt)
            print('Successfully loaded checkpoint {}'.format(ckpt))
        else:
            print('No checkpoint found at {}.'.format(hp.logdir))

コード例 #5

ファイルを表示

    def run_model(
        self,
        model: modeling.CANTRIPModel,
        train: Cohort,
        devel: Cohort,
        test: Cohort,
        weights: typing.Union[float, int,
                              typing.Sequence[typing.Union[float, int]]] = 1):
        """
        Train the given model on ``train`` and evaluate it on ``devel`` and ``test`` after every epoch.

        This function:
        (1) optionally re-balances the training cohort each epoch (FLAGS.correct_imbalance)
        (2) trains CANTRIP and saves checkpoint/summaries for TensorBoard
        (3) evaluates CANTRIP on the development and testing set
        (4) checkpoints the model whenever the development MCC improves
        :param model: an instantiated CANTRIP model
        :param train: the cohort to use for training this experimental run
        :param devel: the cohort to use for validating this experimental run
        :param test: the cohort to use for testing this experimental run
        :param weights: sample weights
        :return: tuple ``(model, best_train_metrics, best_devel_metrics, best_test_metrics)``; the train and
            test dicts are empty (and the devel dict is ``{'MCC': 0}``) if no epoch reached a development
            MCC above 0
        """
        # Save summaries and checkpoints into the directories passed to the script
        model_summaries_dir, model_checkpoint_path = self.get_model_file()

        # Clear any previous summaries/checkpoints if asked
        if FLAGS.clear_prev:
            nio.delete_dir_quiet(model_summaries_dir)
            nio.delete_dir_quiet(os.path.dirname(model_checkpoint_path))
            print('Deleted previous model summaries/checkpoints')

        # Make output directories so we don't blow up when saving
        nio.make_dirs_quiet(os.path.dirname(model_checkpoint_path))

        devel_batches = devel.batched(batch_size=FLAGS.batch_size,
                                      permute=False)
        test_batches = test.batched(batch_size=FLAGS.batch_size, permute=False)

        epoch_steps = len(train.to_list()) // FLAGS.batch_size

        optimizer = optimization.BERTOptimizer(
            model,
            lr_decay=True,
            l1_reg=FLAGS.use_l1_reg,
            l2_reg=FLAGS.use_l2_reg,
            num_train_steps=epoch_steps * 10,
            steps_per_epoch=epoch_steps,
            num_warmup_steps=epoch_steps * min(3, FLAGS.num_epochs - 1),
            init_lr=FLAGS.learning_rate,
            weights=weights,
            normalize_weights=FLAGS.use_focal_loss,
            focal_loss=FLAGS.use_focal_loss)

        summarizer = summarization.CANTRIPSummarizer(model, optimizer)

        # Now that everything has been defined in TensorFlow's computation graph, initialize our model saver
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=FLAGS.max_to_keep)

        # Counts (TP/TN/FP/FN) are zero-padded to the number of digits in the batch size
        batch_width = int(np.log10(FLAGS.batch_size)) + 1
        count_format = '%0' + str(batch_width) + 'd'
        score_format = '%5.3f'

        # noinspection PyCompatibility
        metric_format = {
            'TP': count_format,
            'TN': count_format,
            'FP': count_format,
            'FN': count_format,
            'Precision': score_format,
            'Recall': score_format,
            'Accuracy': score_format,
            'Specificity': score_format,
            'DOR': '%5.1f',
            'F1': score_format,
            'F2': score_format,
            'F.5': score_format,
            'AUROC': score_format,
            'AUPRC': score_format,
            'Loss': score_format,
            'MCC': score_format,
        }

        # Metric name -> short label used in the per-epoch log line
        log_metrics = {
            "Accuracy": "Acc",
            "AUROC": "AUROC",
            "AUPRC": "AUPRC",
            "Precision": "Prec",
            "Recall": "Sens",
            "Specificity": "Spec",
            "DOR": "OR",
            "F1": "F1",
            "MCC": "MCC",
            "Loss": "Loss"
        }

        def format_(results, metrics=None):
            """Format ``results`` values as strings, keeping only (and relabelling by) ``metrics``."""
            if not metrics:
                metrics = {k: k for k in metric_format.keys()}
            return {
                metrics[metric]: (metric_format[metric] % value)
                for metric, value in results.items() if metric in metrics
            }

        # Tell TensorFlow to wake up and get ready to rumble
        with tf.Session() as sess:

            # If we specified a TensorBoard debug server, connect to it
            # (this is actually pretty sweet but you have to manually step through your model's flow so 99% of the time
            # you shouldn't need it)
            if FLAGS.debug is not None:
                sess = tf_debug.TensorBoardDebugWrapperSession(
                    sess, FLAGS.debug)

            # Create our summary writer (used by TensorBoard)
            summary_writer = tf.summary.FileWriter(model_summaries_dir,
                                                   sess.graph)

            # Restore model if it exists (and we didn't clear it), otherwise create a shiny new one
            checkpoint = tf.train.get_checkpoint_state(model_checkpoint_path)
            if checkpoint and gfile.Exists(checkpoint.model_checkpoint_path +
                                           '.index'):
                # print() does not interpolate logging-style arguments, so format the path explicitly
                print("Reading model parameters from '%s'..." %
                      checkpoint.model_checkpoint_path)
                saver.restore(sess, checkpoint.model_checkpoint_path)
            else:
                print("Creating model with fresh parameters...")
                sess.run(tf.global_variables_initializer())

            # Initialize local variables (these are just used for computing average metrics)
            sess.run(tf.local_variables_initializer())

            # Create a progress logger to monitor training (this is a wrapped version of range())
            epoch_width = int(np.log10(FLAGS.num_epochs)) + 1
            with trange(FLAGS.num_epochs, desc='Training') as train_log:
                # Save the training, development, and testing metrics for our best model (as measured by devel MCC).
                # best_devel_metrics starts with a zero MCC so the first epoch has something to compare against;
                # best_test_metrics is initialised too so the return below cannot hit an unbound name when no
                # epoch ever beats that baseline.
                best_train_metrics, best_devel_metrics = {}, {'MCC': 0}
                best_test_metrics = {}

                # Iterate over training epochs
                for i in train_log:
                    # Get global step and reset training metrics
                    global_step, _ = sess.run(
                        [optimizer.global_step, summarizer.train.reset_op])
                    total_loss = 0.

                    if FLAGS.correct_imbalance == "downsample" or FLAGS.correct_imbalance == "upsample":
                        train_ = train.balance_classes(
                            method=FLAGS.correct_imbalance)
                    else:
                        train_ = train

                    batches = train_.batched(batch_size=FLAGS.batch_size)
                    num_batches = len(batches)
                    with tqdm(batches,
                              desc=('Epoch %0' + str(epoch_width) + 'd') %
                              (i + 1)) as batch_log:
                        # Iterate over each batch
                        for j, batch in enumerate(batch_log):
                            # We train the model by evaluating the optimizer's training op. At the same time we update
                            # the training metrics and get metrics/summaries for the current batch and request the new
                            # global step number (used by TensorBoard to coordinate metrics across different runs)
                            _, batch_summary, batch_metrics, global_step = sess.run(
                                [
                                    [
                                        optimizer.train_op,
                                        summarizer.train.metric_ops
                                    ],
                                    # All fetches we aren't going to read
                                    summarizer.batch_summary,
                                    summarizer.batch_metrics,
                                    optimizer.global_step
                                ],
                                batch.feed(model, training=True))

                            # Update tqdm progress indicator with current training metrics on this batch
                            batch_log.set_postfix(format_(batch_metrics))

                            # Save batch-level summaries
                            summary_writer.add_summary(batch_summary,
                                                       global_step=global_step)

                            total_loss += batch_metrics['Loss']

                    # Save epoch-level training metrics and summaries
                    train_metrics, train_summary = sess.run(
                        [summarizer.train.metrics, summarizer.train.summary])
                    train_metrics['Loss'] = total_loss / num_batches
                    summary_writer.add_summary(train_summary,
                                               global_step=global_step)

                    # Evaluate development performance
                    sess.run(summarizer.devel.reset_op)
                    # Update local variables used to compute development metrics as we process each batch
                    for devel_batch in devel_batches:
                        sess.run([summarizer.devel.metric_ops],
                                 devel_batch.feed(model, training=False))
                    # Compute the development metrics
                    devel_metrics, devel_summary = sess.run(
                        [summarizer.devel.metrics, summarizer.devel.summary])
                    # Update training progress bar to indicate current performance on development set
                    train_log.set_postfix(format_(devel_metrics))
                    # Save TensorBoard summary
                    summary_writer.add_summary(devel_summary,
                                               global_step=global_step)

                    # def format_metrics(metrics: dict):
                    #     return dict((key, '%6.4f' % value) for key, value in metrics.items())
                    train_log.write(
                        ('Epoch %0' + str(epoch_width) +
                         'd. Train: %s | Devel: %s') %
                        (i + 1,
                         "; ".join("{}: {}".format(k, v) for k, v in format_(
                             train_metrics, log_metrics).items()), "; ".join(
                                 "{}: {}".format(k, v) for k, v in format_(
                                     devel_metrics, log_metrics).items())))

                    # Evaluate testing performance the same way as development
                    sess.run(summarizer.test.reset_op)
                    for batch in test_batches:
                        sess.run([
                            summarizer.test.metrics, summarizer.test.metric_ops
                        ], batch.feed(model, training=False))
                    test_metrics, test_summary = sess.run(
                        [summarizer.test.metrics, summarizer.test.summary])
                    summary_writer.add_summary(test_summary,
                                               global_step=global_step)

                    # If this run did better on the dev set, save it as the new best model
                    if devel_metrics['MCC'] > best_devel_metrics['MCC']:
                        best_devel_metrics = devel_metrics
                        best_train_metrics = train_metrics
                        best_test_metrics = test_metrics

                        # Save the model
                        saver.save(sess,
                                   model_checkpoint_path,
                                   global_step=global_step)
            print('Training complete!')
            return model, best_train_metrics, best_devel_metrics, best_test_metrics

コード例 #6

ファイルを表示

ファイル: pyRef_train.py プロジェクト: igorgad/pyRefNet

def start_training(trainParams):
    """Build the training graph and run the train / evaluate / checkpoint loop.

    The function creates a fresh ``tf.Graph``, wires the dataset pipeline,
    the model ops and the statistics ops into it, then alternates between a
    full pass over the training iterator and a full pass over the test
    iterator until the global step reaches ``trainParams.num_steps``.  After
    every evaluation pass a test summary is written and a checkpoint is saved
    to ``<log_path_dir>/model.ckpt``.

    Args:
        trainParams: Configuration object.  The attributes read here are
            ``debug``, ``log_path_dir``, ``restore_from_dir``, ``num_steps``,
            ``trace``, ``summary_interval`` and ``run_name``; the object is
            also forwarded to the dataset pipeline and to
            ``add_hyperparameters_textsum``.

    Returns:
        None.  Exceptions raised inside the training loop are printed and end
        the function early instead of propagating.

    The session and both summary writers are always closed on exit, whether
    training finishes normally, fails, or setup raises.
    """
    with tf.Graph().as_default() as graph:

        tf.set_random_seed(2)

        keepp_pl = tf.placeholder(tf.float32)
        train_test_selector = tf.placeholder(tf.int32)
        dataset_handle = tf.placeholder(tf.string, shape=[])
        global_step = tf.Variable(0, name='global_step', trainable=False)

        with tf.device('/cpu:0'):
            examples, train_iterator, test_iterator = dataset_interface.add_defaul_dataset_pipeline(
                trainParams, model, dataset_handle)

        ins = examples[0]
        lbs = examples[1]
        typecombs = examples[2]
        instcombs = examples[3]
        genres = examples[4]
        ids = examples[5]
        audiofiles = examples[6]

        logits, rkhs = model.inference(ins, keepp_pl)
        loss = model.loss(logits, lbs)
        train_op = model.training(loss, global_step)
        eval_top1, eval_top5, correct1, correct5 = model.evaluation(
            logits, lbs)

        with tf.device('/cpu:0'):
            avg_loss_op, avg_top1_op, avg_top5_op, reset_op = stats.add_summaries(
                loss, eval_top1, eval_top5)
            # NOTE(review): the per-category reset ops returned below are never
            # run by this function; only `reset_op` is.  Confirm that is intended.
            update_comb_stats, reset_comb_stats = stats.add_comb_stats(
                correct1, correct5, typecombs, train_test_selector)
            update_inst_stats, reset_inst_stats = stats.add_inst_stats(
                correct1, correct5, instcombs, train_test_selector)
            update_genre_stats, reset_genre_stats = stats.add_genre_stats(
                correct1, correct5, genres, train_test_selector)
            stats.add_confusion_matrix(logits, lbs)
            stats.collect_wrong_examples(correct1, ins, rkhs, instcombs,
                                         typecombs, ids, audiofiles)

            update_stats = [
                update_comb_stats, update_inst_stats, update_genre_stats
            ]

        summary = tf.summary.merge_all()

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        config = tf.ConfigProto()

        sess = tf.Session(config=config)
        train_writer = None
        test_writer = None
        try:
            if trainParams.debug:
                # Closing the wrapper also closes the wrapped session.
                sess = tf_debug.TensorBoardDebugWrapperSession(
                    sess, 'localhost:6064')

            train_writer = tf.summary.FileWriter(
                trainParams.log_path_dir + '/train', sess.graph)
            test_writer = tf.summary.FileWriter(
                trainParams.log_path_dir + '/test')

            training_handle = sess.run(train_iterator.string_handle())
            testing_handle = sess.run(test_iterator.string_handle())

            # Created after merge_all(), so it is not part of `summary`.
            hparams_op = add_hyperparameters_textsum(trainParams)

            # Initialize or load graph from checkpoint
            if not trainParams.restore_from_dir:
                tf.gfile.MakeDirs(trainParams.log_path_dir)
                _, hp_str = sess.run([init, hparams_op])
                train_writer.add_summary(hp_str, 0)
                train_writer.flush()
            else:
                ckpt = tf.train.get_checkpoint_state(
                    trainParams.restore_from_dir[0])
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(sess, ckpt.model_checkpoint_path)
                print('loaded graph from dir %s' %
                      trainParams.restore_from_dir[0])

            graph.finalize()
            gstep = 0

            # Fetch lists shared by every run call below.
            train_fetches = [
                train_op, avg_loss_op, avg_top1_op, avg_top5_op, update_stats,
                global_step
            ]
            eval_fetches = [
                avg_loss_op, avg_top1_op, avg_top5_op, update_stats,
                global_step
            ]
            test_feed = {
                dataset_handle: testing_handle,
                train_test_selector: 1,
                keepp_pl: 1
            }

            try:
                print('running...')

                sess.run(train_iterator.initializer)
                sess.run(test_iterator.initializer)
                # Start the training loop.
                while gstep < trainParams.num_steps:
                    try:
                        # Train
                        sess.run(reset_op)
                        train_feed = {
                            dataset_handle: training_handle,
                            train_test_selector: 0,
                            keepp_pl: model.kp
                        }

                        if trainParams.trace:
                            # One fully traced step whose run metadata is
                            # attached to the training event file.
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()

                            _, loss_value, top1_value, top5_value, __, gstep = sess.run(
                                train_fetches,
                                feed_dict=train_feed,
                                options=run_options,
                                run_metadata=run_metadata)

                            train_writer.add_run_metadata(
                                run_metadata, 'stats_epoch %d' % gstep)
                            train_writer.flush()

                            print(
                                '%s: TRAIN step %d. %0.2f hz loss: %0.04f top1 %0.04f top5 %0.04f'
                                % (trainParams.run_name, gstep, 0.0,
                                   loss_value, top1_value, top5_value))

                        # Step until the training iterator is exhausted.
                        duration_mean = 1
                        while True:
                            try:
                                start_time = time.time()

                                # Log training runtime statistics
                                if np.mod(gstep + 1,
                                          trainParams.summary_interval) == 0:
                                    summary_str, _, loss_value, top1_value, top5_value, __, gstep = sess.run(
                                        [summary] + train_fetches,
                                        feed_dict=train_feed)

                                    train_writer.add_summary(
                                        summary_str, gstep)
                                    train_writer.flush()

                                    print(
                                        '%s: TRAIN step %d. %0.2f hz loss: %0.04f top1 %0.04f top5 %0.04f'
                                        % (trainParams.run_name, gstep,
                                           model.batch_size / duration_mean,
                                           loss_value, top1_value,
                                           top5_value))

                                    sess.run([reset_op])
                                else:
                                    _, loss_value, top1_value, top5_value, __, gstep = sess.run(
                                        train_fetches, feed_dict=train_feed)

                                # Running average of the step duration.
                                duration_mean = (
                                    duration_mean +
                                    (time.time() - start_time)) / 2

                            except tf.errors.OutOfRangeError:
                                # Rewind the iterator for the next pass.
                                sess.run(train_iterator.initializer)
                                break

                        # Evaluate
                        duration_mean = 1
                        sess.run([reset_op])
                        while True:
                            try:
                                start_time = time.time()

                                loss_value, top1_value, top5_value, _, gstep = sess.run(
                                    eval_fetches, feed_dict=test_feed)

                                duration_mean = (
                                    duration_mean +
                                    (time.time() - start_time)) / 2

                            except tf.errors.OutOfRangeError:
                                sess.run(test_iterator.initializer)
                                break

                        # One more run on the test feed that also fetches the
                        # merged summary for the test writer.
                        summary_str, loss_value, top1_value, top5_value, _, gstep = sess.run(
                            [summary] + eval_fetches, feed_dict=test_feed)

                        test_writer.add_summary(summary_str, gstep)
                        test_writer.flush()

                        print(
                            '%s: TEST step %d. %0.2f hz. loss: %0.04f. top1 %0.04f. top5 %0.04f'
                            % (trainParams.run_name, gstep, model.batch_size /
                               duration_mean, loss_value, top1_value,
                               top5_value))

                        # Save a checkpoint
                        checkpoint_file = os.path.join(
                            trainParams.log_path_dir, 'model.ckpt')
                        saver.save(sess, checkpoint_file, global_step=gstep)

                    except Exception as e:
                        # Best effort: report the failure and stop training.
                        print('Received expection while training: ' + str(e))
                        return

            except Exception as e:
                print('finishing...' + str(e))
                return
        finally:
            # Release resources on every exit path.
            for writer in (train_writer, test_writer):
                if writer is not None:
                    writer.close()
            sess.close()

コード例 #7

ファイルを表示

# In[2]:

# Create a session object: the environment in which operations are executed and tensors are evaluated.
sess = tf.Session()

# ## Debugger
#
# ### The line in cell In[3] wraps the session with the TensorBoard debugger; comment it out to run without it.
#
# ### Go to this link once you start execution: http://localhost:6006/

# In[3]:

# Wrap the session with the TensorBoard debug wrapper, passing "localhost:6064" as the debugger address.
# NOTE(review): the original comment said "uncomment the below line", but the line is already active.
sess = tf_debug.TensorBoardDebugWrapperSession(sess, "localhost:6064")

# In[4]:

# Placeholder for the input data: float32 with fixed shape [4, 2] (presumably the size of the data set; verify).
X = tf.placeholder(tf.float32, shape=[4, 2], name='X')

# Placeholder for the labels: float32 with fixed shape [4, 1] (presumably one label per data row; verify).
Y = tf.placeholder(tf.float32, shape=[4, 1], name='Y')

# In[5]:

# Variable "W" of shape [2, 2], initialised with values drawn from a truncated normal distribution;
# as a variable it keeps its state across multiple runs.
W = tf.Variable(tf.truncated_normal([2, 2]), name="W")

#declaring a variable which will retain its state through multiple runs with random values from normal distribution

コード例 #8

ファイルを表示

ファイル: adv_patch_train_val.py プロジェクト: PerryXDeng/detecting-adversarial-patches-capsule-networks

def main(args):
  """Run training and validation.
  
  1. Build graphs
      1.1 Training graph to run on multiple GPUs
      1.2 Validation graph to run on multiple GPUs
  2. Configure sessions
      2.1 Train
      2.2 Validate
  3. Main loop
      3.1 Train
      3.2 Write summary
      3.3 Save model
      3.4 Validate model
      
  Author:
    Perry Deng
  """
  
  # Set reproduciable random seed
  tf.set_random_seed(1234)
    
  # Directories
  train_dir, train_summary_dir = conf.setup_train_directories()
  
  # Logger
  conf.setup_logger(logger_dir=train_dir, name="logger_train.txt")
  
  # Hyperparameters
  conf.load_or_save_hyperparams(train_dir)
  
  # Get dataset hyperparameters
  logger.info('Using dataset: {}'.format(FLAGS.dataset))
  dataset_size_train = conf.get_dataset_size_train(FLAGS.dataset)\
      if not FLAGS.train_on_test else conf.get_dataset_size_test(FLAGS.dataset)
  dataset_size_val = conf.get_dataset_size_validate(FLAGS.dataset)
  build_arch = conf.get_dataset_architecture(FLAGS.dataset)
  num_classes = conf.get_num_classes(FLAGS.dataset)
  create_inputs_train = conf.get_create_inputs(FLAGS.dataset, mode="train_whole")\
      if not FLAGS.train_on_test else conf.get_create_inputs(FLAGS.dataset, mode="train_on_test")
  create_inputs_train_wholeset = conf.get_create_inputs(FLAGS.dataset, mode="train_whole")
  if dataset_size_val > 0:
    create_inputs_val   = conf.get_create_inputs(FLAGS.dataset, mode="validate")

  
 #*****************************************************************************
 # 1. BUILD GRAPHS
 #*****************************************************************************

  #----------------------------------------------------------------------------
  # GRAPH - TRAIN
  #----------------------------------------------------------------------------
  logger.info('BUILD TRAIN GRAPH')
  g_train = tf.Graph()
  with g_train.as_default(), tf.device('/cpu:0'):
    
    # Get global_step
    global_step = tf.train.get_or_create_global_step()

    # Get batches per epoch
    num_batches_per_epoch = int(dataset_size_train / FLAGS.batch_size)

    # In response to a question on OpenReview, Hinton et al. wrote the 
    # following:
    # "We use an exponential decay with learning rate: 3e-3, decay_steps: 20000,     # decay rate: 0.96."
    # https://openreview.net/forum?id=HJWLfGWRb&noteId=ryxTPFDe2X
    lrn_rate = tf.train.exponential_decay(learning_rate = FLAGS.lrn_rate, 
                        global_step = global_step,
                        decay_steps = 20000,
                        decay_rate = 0.96)
    tf.summary.scalar('learning_rate', lrn_rate)
    opt = tf.train.AdamOptimizer(learning_rate=lrn_rate)

    # Get batch from data queue. Batch size is FLAGS.batch_size, which is then 
    # divided across multiple GPUs
    input_dict = create_inputs_train()
    batch_x = input_dict['image']
    batch_labels = input_dict['label']
    
    # AG 03/10/2018: Split batch for multi gpu implementation
    # Each split is of size FLAGS.batch_size / FLAGS.num_gpus
    # See: https://github.com/naturomics/CapsNet-Tensorflow/blob/master/
    # dist_version/distributed_train.py
    splits_x = tf.split(
        axis=0, 
        num_or_size_splits=FLAGS.num_gpus, 
        value=batch_x)
    splits_labels = tf.split(
        axis=0, 
        num_or_size_splits=FLAGS.num_gpus, 
        value=batch_labels)

    
    #--------------------------------------------------------------------------
    # MULTI GPU - TRAIN
    #--------------------------------------------------------------------------
    # Calculate the gradients for each model tower
    tower_grads = []
    tower_losses = []
    tower_logits = []
    tower_target_labels = []
    reuse_variables = None
    for i in range(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('tower_%d' % i) as scope:
          logger.info('TOWER %d' % i)
          #with slim.arg_scope([slim.model_variable, slim.variable],
          # device='/cpu:0'):
          with slim.arg_scope([slim.variable], device='/cpu:0'):
            loss, logits, x, patch, target_labels = tower_fn(
                build_arch,
                splits_x[i],
                splits_labels[i],
                scope,
                num_classes,
                reuse_variables=reuse_variables,
                is_train=True)
          
          # Don't reuse variable for first GPU, but do reuse for others
          reuse_variables = True
          
          # Compute gradients for one GPU
          patch_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           "patch_params")
          grads = opt.compute_gradients(loss, var_list=patch_params)
          
          # Keep track of the gradients across all towers.
          tower_grads.append(grads)
          tower_target_labels.append(target_labels)          

          # Keep track of losses and logits across for each tower
          tower_logits.append(logits)
          tower_losses.append(loss)
          
          # Loss for each tower
          tf.summary.scalar("loss", loss)
    
    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)
    
    # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-
    # gradients-in-tensorflow-when-updating
    grad_check = ([tf.check_numerics(g, message='Gradient NaN Found!') 
                      for g, _ in grads if g is not None]
                  + [tf.check_numerics(loss, message='Loss NaN Found')])
    
    # Apply the gradients to adjust the shared variables
    with tf.control_dependencies(grad_check):
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
      with tf.control_dependencies(update_ops):
        train_op = opt.apply_gradients(grads, global_step=global_step)
    
    # Calculate mean loss     
    loss = tf.reduce_mean(tower_losses)
    
    # Calculate accuracy
    logits = tf.concat(tower_logits, axis=0)
    target_labels = tf.concat(tower_target_labels, axis=0)
    acc = met.accuracy(logits, target_labels)
    
    # Prepare predictions and one-hot labels
    probs = tf.nn.softmax(logits=logits)
    labels_oh = tf.one_hot(batch_labels, num_classes)
    
    # Group metrics together
    # See: https://cs230-stanford.github.io/tensorflow-model.html
    trn_metrics = {'loss' : loss,
             'labels' : batch_labels, 
             'labels_oh' : labels_oh,
             'logits' : logits,
             'probs' : probs,
             'acc' : acc,
             }
    
    # Reset and read operations for streaming metrics go here
    trn_reset = {}
    trn_read = {}
    
    # Logging
    tf.summary.scalar('batch_loss', loss)
    tf.summary.scalar('batch_success_rate', acc)

    # Set Saver
    # AG 26/09/2018: Save all variables including Adam so that we can continue 
    # training from where we left off
    # max_to_keep=None should keep all checkpoints
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)
    
    # Display number of parameters
    train_params = np.sum([np.prod(v.get_shape().as_list())
              for v in tf.trainable_variables()]).astype(np.int32)
    logger.info('Trainable Parameters: {}'.format(train_params))
        
    # Set summary op
    trn_summary = tf.summary.merge_all()

  #----------------------------------------------------------------------------
  # GRAPH - TRAINING SET ACCURACY
  #----------------------------------------------------------------------------
  logger.info('BUILD TRAINING SET ACCURACY GRAPH')
  g_trn_acc = tf.Graph()
  with g_trn_acc.as_default():
    # Get global_step
    global_step = tf.train.get_or_create_global_step()

    
    # Get data
    input_dict = create_inputs_train_wholeset()
    batch_x = input_dict['image']
    batch_labels = input_dict['label']
    
    # AG 10/12/2018: Split batch for multi gpu implementation
    # Each split is of size FLAGS.batch_size / FLAGS.num_gpus
    # See: https://github.com/naturomics/CapsNet-
    # Tensorflow/blob/master/dist_version/distributed_train.py
    splits_x = tf.split(
        axis=0, 
        num_or_size_splits=FLAGS.num_gpus, 
        value=batch_x)
    splits_labels = tf.split(
        axis=0, 
        num_or_size_splits=FLAGS.num_gpus, 
        value=batch_labels)
    
    
    #--------------------------------------------------------------------------
    # MULTI GPU - TRAINING SET ACCURACY
    #--------------------------------------------------------------------------
    # Calculate the logits for each model tower
    tower_logits = []
    tower_target_labels = []
    reuse_variables = None
    for i in range(FLAGS.num_gpus):
      with tf.device('/gpu:%d' % i):
        with tf.name_scope('tower_%d' % i) as scope:
          with slim.arg_scope([slim.variable], device='/cpu:0'):
            loss, logits, x, patch, target_labels = tower_fn(
                build_arch, 
                splits_x[i], 
                splits_labels[i], 
                scope, 
                num_classes, 
                reuse_variables=reuse_variables, 
                is_train=False)

          # Don't reuse variable for first GPU, but do reuse for others
          reuse_variables = True
          
          # Keep track of losses and logits across for each tower
          tower_logits.append(logits)
          tower_target_labels.append(target_labels)
          # Loss for each tower
          tf.summary.histogram("train_set_logits", logits)
    
    # Combine logits from all towers
    logits = tf.concat(tower_logits, axis=0)
    target_labels = tf.concat(tower_target_labels, axis=0)
    # Calculate metrics
    train_set_loss = mod.spread_loss(logits, target_labels)
    train_set_acc = met.accuracy(logits, target_labels)
    
    # Prepare predictions and one-hot labels
    train_set_probs = tf.nn.softmax(logits=logits)
    train_set_labels_oh = tf.one_hot(batch_labels, num_classes)
    
    # Group metrics together
    # See: https://cs230-stanford.github.io/tensorflow-model.html
    train_set_metrics = {'loss' : train_set_loss,
                   'labels' : batch_labels, 
                   'labels_oh' : train_set_labels_oh,
                   'logits' : logits,
                   'probs' : train_set_probs,
                   'acc' : train_set_acc,
                   }
    
    # Reset and read operations for streaming metrics go here
    train_set_reset = {}
    train_set_read = {}
    saver = tf.train.Saver(max_to_keep=None)
    
    tf.summary.scalar("train_set_loss", train_set_loss)
    tf.summary.scalar("train_set_success_rate", train_set_acc)
    trn_acc_summary = tf.summary.merge_all()
  
  if dataset_size_val > 0: 
    #----------------------------------------------------------------------------
    # GRAPH - VALIDATION
    #----------------------------------------------------------------------------
    logger.info('BUILD VALIDATION GRAPH')
    g_val = tf.Graph()
    with g_val.as_default():
      # Get global_step
      global_step = tf.train.get_or_create_global_step()

      num_batches_val = int(dataset_size_val / FLAGS.batch_size)
      
      # Get data
      input_dict = create_inputs_val()
      batch_x = input_dict['image']
      batch_labels = input_dict['label']
      
      # AG 10/12/2018: Split batch for multi gpu implementation
      # Each split is of size FLAGS.batch_size / FLAGS.num_gpus
      # See: https://github.com/naturomics/CapsNet-
      # Tensorflow/blob/master/dist_version/distributed_train.py
      splits_x = tf.split(
          axis=0, 
          num_or_size_splits=FLAGS.num_gpus, 
          value=batch_x)
      splits_labels = tf.split(
          axis=0, 
          num_or_size_splits=FLAGS.num_gpus, 
          value=batch_labels)
      
      
      #--------------------------------------------------------------------------
      # MULTI GPU - VALIDATE
      #--------------------------------------------------------------------------
      # Calculate the logits for each model tower
      tower_logits = []
      tower_target_labels = []
      reuse_variables = None
      for i in range(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('tower_%d' % i) as scope:
            with slim.arg_scope([slim.variable], device='/cpu:0'):
              loss, logits, x, patch, target_labels = tower_fn(
                  build_arch, 
                  splits_x[i], 
                  splits_labels[i], 
                  scope, 
                  num_classes, 
                  reuse_variables=reuse_variables, 
                  is_train=False)

            # Don't reuse variable for first GPU, but do reuse for others
            reuse_variables = True
            
            # Keep track of losses and logits across for each tower
            tower_logits.append(logits)
            tower_target_labels.append(target_labels)
            # Loss for each tower
            tf.summary.histogram("val_logits", logits)

      # take patch and patched images from last tower
      val_patch = patch
      val_x = x

      # Combine logits from all towers
      logits = tf.concat(tower_logits, axis=0)
      target_labels = tf.concat(tower_target_labels, axis=0)
      # Calculate metrics
      val_loss = mod.spread_loss(logits, target_labels)
      val_acc = met.accuracy(logits, target_labels)
      
      # Prepare predictions and one-hot labels
      val_probs = tf.nn.softmax(logits=logits)
      val_labels_oh = tf.one_hot(batch_labels, num_classes)
      
      # Group metrics together
      # See: https://cs230-stanford.github.io/tensorflow-model.html
      val_metrics = {'loss' : val_loss,
                     'labels' : batch_labels, 
                     'labels_oh' : val_labels_oh,
                     'logits' : logits,
                     'probs' : val_probs,
                     'acc' : val_acc,
                     }
      val_images = {'patch' : val_patch,
                    'x' : val_x} 
      # Reset and read operations for streaming metrics go here
      val_reset = {}
      val_read = {}
      
      tf.summary.scalar("val_loss", val_loss)
      tf.summary.scalar("val_success_rate", val_acc)
        
      # Saver
      saver = tf.train.Saver(max_to_keep=1)
      
      # Set summary op
      val_summary = tf.summary.merge_all()
       
        
  #****************************************************************************
  # 2. SESSIONS
  #****************************************************************************
          
  #----- SESSION TRAIN -----#
  # Session settings
  #sess_train = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
  #                                              log_device_placement=False),
  #                        graph=g_train)

  # Perry: added in for RTX 2070 incompatibility workaround
  config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
  config.gpu_options.allow_growth = True
  sess_train = tf.Session(config=config, graph=g_train)

  # Debugger
  # AG 05/06/2018: Debugging using either command line or TensorBoard
  if FLAGS.debugger is not None:
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
    sess_train = tf_debug.TensorBoardDebugWrapperSession(sess_train, 
                                                         FLAGS.debugger)
    
  with g_train.as_default():
    sess_train.run([tf.global_variables_initializer(),
                    tf.local_variables_initializer()])
    
    # Restore previous checkpoint
    # AG 26/09/2018: where should this go???
    if FLAGS.load_dir is not None:
      prev_step = load_training(saver, sess_train, FLAGS.load_dir, opt)
    else:
      prev_step = 0

  # Create summary writer, and write the train graph
  summary_writer = tf.summary.FileWriter(train_summary_dir, 
                                         graph=sess_train.graph)


  #----- SESSION TRAIN SET ACCURACY -----#
  #sess_val = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
  #                                            log_device_placement=False),
  #                      graph=g_val)

  # Perry: added in for RTX 2070 incompatibility workaround
  config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
  config.gpu_options.allow_growth = True
  sess_train_acc = tf.Session(config=config, graph=g_trn_acc)

  with g_trn_acc.as_default():
    sess_train_acc.run([tf.local_variables_initializer(), 
                        tf.global_variables_initializer()])


  if dataset_size_val > 0:
    #----- SESSION VALIDATION -----#
    #sess_val = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
    #                                            log_device_placement=False),
    #                      graph=g_val)
 
    # Perry: added in for RTX 2070 incompatibility workaround
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = True
    sess_val = tf.Session(config=config, graph=g_val)


    with g_val.as_default():
      sess_val.run([tf.local_variables_initializer(), 
                    tf.global_variables_initializer()])


  #****************************************************************************
  # 3. MAIN LOOP
  #****************************************************************************
  SUMMARY_FREQ = 100
  SAVE_MODEL_FREQ = num_batches_per_epoch # 500
  VAL_FREQ = num_batches_per_epoch # 500
  PROFILE_FREQ = 5
  #print("starting main loop") 
  for step in range(prev_step, FLAGS.epoch * num_batches_per_epoch + 1): 
    #print("looping")
  #for step in range(0,3):
    # AG 23/05/2018: limit number of iterations for testing
    # for step in range(100):
    epoch_decimal = step/num_batches_per_epoch
    epoch = int(np.floor(epoch_decimal))
    

    # TF queue would pop batch until no file
    try: 
      # TRAIN
      with g_train.as_default():
    
          # With profiling
          if (FLAGS.profile is True) and ((step % PROFILE_FREQ) == 0): 
            logger.info("Train with Profiling")
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
          # Without profiling
          else:
            run_options = None
            run_metadata = None
          
          # Reset streaming metrics
          if step % (num_batches_per_epoch/4) == 1:
            logger.info("Reset streaming metrics")
            sess_train.run([trn_reset])
          
          # MAIN RUN
          tic = time.time()
          train_op_v, trn_metrics_v, trn_summary_v = sess_train.run(
              [train_op, trn_metrics, trn_summary], 
              options=run_options, 
              run_metadata=run_metadata)
          toc = time.time()
          
          # Read streaming metrics
          trn_read_v = sess_train.run(trn_read)
          
          # Write summary for profiling
          if run_options is not None: 
            summary_writer.add_run_metadata(
                run_metadata, 'epoch{:f}'.format(epoch_decimal))
          
          # Logging
          #logger.info('TRN'
          #      + ' e-{:d}'.format(epoch)
          #      + ' stp-{:d}'.format(step) 
          #        )
          #      + ' {:.2f}s'.format(toc - tic) 
          #      + ' loss: {:.4f}'.format(trn_metrics_v['loss'])
          #      + ' acc: {:.2f}%'.format(trn_metrics_v['acc']*100)
          #       )

    except KeyboardInterrupt:
      sess_train.close()
      sess_val.close()
      sys.exit()
      
    except tf.errors.InvalidArgumentError as e:
      logger.warning('%d iteration contains NaN gradients. Discard.' % step)
      logger.error(str(e))
      continue
      
    else:
      # WRITE SUMMARY
      if (step % SUMMARY_FREQ) == 0:
        logger.info("Write Train Summary")
        with g_train.as_default():
          # Summaries from graph
          summary_writer.add_summary(trn_summary_v, step)
          
      # SAVE MODEL
      if (step % SAVE_MODEL_FREQ) == 0:
        logger.info("Save Model")
        with g_train.as_default():
          train_checkpoint_dir = train_dir + '/checkpoint'
          if not os.path.exists(train_checkpoint_dir):
            os.makedirs(train_checkpoint_dir)

          # Save ckpt from train session
          ckpt_path = os.path.join(train_checkpoint_dir, 'model.ckpt' + str(epoch))
          saver.save(sess_train, ckpt_path, global_step=step)
      if (step % VAL_FREQ) == 0:
        # calculate metrics every epoch
        with g_trn_acc.as_default():
          logger.info("Start Train Set Accuracy")
          # Restore ckpt to val session
          latest_ckpt = tf.train.latest_checkpoint(train_checkpoint_dir)
          saver.restore(sess_train_acc, latest_ckpt)
          
          # Reset accumulators
          accuracy_sum = 0
          loss_sum = 0
          sess_train_acc.run(train_set_reset)
          
          for i in range(num_batches_per_epoch):
            train_set_metrics_v, train_set_summary_str_v = sess_train_acc.run(
                [train_set_metrics, trn_acc_summary])
            
            # Update
            accuracy_sum += train_set_metrics_v['acc']
            loss_sum += train_set_metrics_v['loss']
            
            # Read
            trn_read_v = sess_train_acc.run(val_read)
            
            # Get checkpoint number
            ckpt_num = re.split('-', latest_ckpt)[-1]

          # Average across batches
          ave_acc = accuracy_sum / num_batches_per_epoch
          ave_loss = loss_sum / num_batches_per_epoch
           
          logger.info('TRN ckpt-{}'.format(ckpt_num) 
                      + ' avg_success: {:.2f}%'.format(ave_acc*100) 
                      + ' avg_loss: {:.4f}'.format(ave_loss)
                     )
          
          logger.info("Write Train Summary")
          summary_train = tf.Summary()
          summary_train.value.add(tag="trn_success", simple_value=ave_acc)
          summary_train.value.add(tag="trn_loss", simple_value=ave_loss)
          summary_writer.add_summary(summary_train, epoch)
          

        if dataset_size_val > 0: 
          #----- Validation -----#
          with g_val.as_default():
            logger.info("Start Validation")
            
            # Restore ckpt to val session
            latest_ckpt = tf.train.latest_checkpoint(train_checkpoint_dir)
            saver.restore(sess_val, latest_ckpt)
            
            # Reset accumulators
            accuracy_sum = 0
            loss_sum = 0
            sess_val.run(val_reset)
            
            for i in range(num_batches_val):
              if i == num_batches_val - 1:
                # take a sample of patched images on the last validation batch
                val_metrics_v, val_summary_str_v, val_images_v = sess_val.run(
                    [val_metrics, val_summary, val_images])
                x = val_images_v['x']
                patch = val_images_v['patch']
              else:
                val_metrics_v, val_summary_str_v = sess_val.run(
                    [val_metrics, val_summary])
              # Update
              accuracy_sum += val_metrics_v['acc']
              loss_sum += val_metrics_v['loss']
              
              # Read
              val_read_v = sess_val.run(val_read)
              
              # Get checkpoint number
              ckpt_num = re.split('-', latest_ckpt)[-1]

              # Logging
              #logger.info('VAL ckpt-{}'.format(ckpt_num) 
              #            + ' bch-{:d}'.format(i) 
              #            + ' cum_acc: {:.2f}%'.format(accuracy_sum/(i+1)*100) 
              #            + ' cum_loss: {:.4f}'.format(loss_sum/(i+1))
              #           )
            
            # Average across batches
            ave_acc = accuracy_sum / num_batches_val
            ave_loss = loss_sum / num_batches_val
             
            logger.info('VAL ckpt-{}'.format(ckpt_num) 
                        + ' avg_success: {:.2f}%'.format(ave_acc*100) 
                        + ' avg_loss: {:.4f}'.format(ave_loss)
                       )
            
            logger.info("Write Val Summary")
            summary_val = tf.Summary()
            summary_val.value.add(tag="val_success", simple_value=ave_acc)
            summary_val.value.add(tag="val_loss", simple_value=ave_loss)
            summary_writer.add_summary(summary_val, epoch)
            log_images(summary_writer, "patch", [patch], epoch)
            log_images(summary_writer, "patched_input", x, epoch)
            if patch.shape[-1] == 1:
              patch = np.squeeze(patch, axis=-1)
            formatted = (patch * 255).astype('uint8')
            img = Image.fromarray(formatted)
            img.save(os.path.join(train_dir, "saved_patch.png"))
 
  # Close (main loop)
  sess_train.close()
  sess_val.close()
  sys.exit()

コード例 #9

ファイルを表示

    # M = VDEModelDesc(info_params)
    hps = get_default_hparams()
    hps.C = 10
    hps.T = 28
    hps.D = 28
    hps.n_z = 28
    M = ModelDesc(hps)

    logger.auto_set_dir(action='d')
    ds_train, ds_test = get_mnist_data()
    # sess = SessionCreatorAdapter(NewSessionCreator(), lambda sess: tf_debug.LocalCLIDebugWrapperSession(sess))
    # sess = tf_debug.TensorBoardDebugWrapperSession(sess, "nam-pc:7000")

    creator = SessionCreatorAdapter(
        NewSessionCreator(),
        lambda sess: tf_debug.TensorBoardDebugWrapperSession(
            sess, "nam-pc:7000"))
    # Trainer(input=QueueInput(ds_train), model=M).train_with_defaults(
    #     callbacks=[
    #         ModelSaver(),
    #         callbacks.MergeAllSummaries(),
    #         MinSaver('total_loss'),
    #         InferenceRunner(ds_test, [ScalarStats('predict_trend/accuracy_')])
    #     ],
    #     steps_per_epoch=info_params.steps_per_epoch,
    #     max_epoch=info_params.epochs,
    #     # session_init=SaverRestore(args.load) if args.load else None

    # )

    Trainer(input=QueueInput(ds_train), model=M).train_with_defaults(
        callbacks=[

コード例 #10

ファイルを表示

ファイル: network.py プロジェクト: iseong83/3d_object_reconstruction

    def __init__(self, params=None):
        """Build the full network graph, open a session and initialise variables.

        The constructor wires together, in this order: input placeholders,
        encoder, recurrent module, decoder, loss, optimizer, streaming metrics
        and summaries. It then creates a ``tf.InteractiveSession`` (optionally
        wrapped in a TensorBoard debug session), the summary writers, and runs
        the global and local variable initializers.

        Side effects visible in this method:
            * calls ``utils.make_dir`` on a timestamped model directory and
              writes ``params.json`` into it;
            * calls ``os.abort()`` if ``params["TRAIN"]["LOSS_FCN"]`` is not one
              of ``FOCAL_LOSS``, ``WEIGHTED_SOFTMAX`` or ``SOFTMAX``.

        Args:
            params: Nested configuration mapping. The keys read here are
                ``TRAIN`` (``INITIALIZER``, ``DECODER_MODE``, ``ENCODER_MODE``,
                ``RNN_MODE``, ``RNN_CELL_NUM``, ``RNN_HIDDEN_SIZE``,
                ``LOSS_FCN``, ``OPTIMIZER``, ``ADAM_EPSILON``), ``DIRS``
                (``MODELS_LOCAL``), ``VIS`` (``ENCODER_PROCESS``,
                ``DECODER_PROCESS``) and ``MODE``. When ``None`` the result of
                ``utils.read_params()`` is used instead.
        """
        # read params
        if params is None:
            self.params = utils.read_params()
        else:
            self.params = params

        # Weight initializer handed to the recurrent module below.
        if self.params["TRAIN"]["INITIALIZER"] == "XAVIER":
            init = tf.contrib.layers.xavier_initializer()
        else:
            init = tf.random_normal_initializer()

        # Every instance gets its own timestamped model directory.
        self.CREATE_TIME = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.MODEL_DIR = "{}/model_{}".format(
            self.params["DIRS"]["MODELS_LOCAL"], self.CREATE_TIME)
        utils.make_dir(self.MODEL_DIR)

        # Persist the exact configuration next to the model artifacts.
        with open(self.MODEL_DIR + '/params.json', 'w') as f:
            json.dump(self.params, f)

        # placeholders
        with tf.name_scope("Data"):
            # Rank-5 input with every dimension left dynamic.
            self.X = tf.placeholder(tf.float32, [None, None, None, None, None])
        with tf.name_scope("Labels"):
            # Label grid resolution follows the decoder variant (64^3 vs 32^3).
            if "64" in self.params["TRAIN"]["DECODER_MODE"]:
                self.Y_onehot = tf.placeholder(tf.float32,
                                               [None, 64, 64, 64, 2])
            else:
                self.Y_onehot = tf.placeholder(tf.float32,
                                               [None, 32, 32, 32, 2])
        with tf.name_scope("LearningRate"):
            # Scalar learning rate, fed at run time.
            self.LR = tf.placeholder(tf.float32, [])

        print("Initializing Network")
        #pp = preprocessor.Preprocessor(self.X) # here
        #X_preprocessed = pp.out_tensor # here
        X_preprocessed = self.X  # (n_batch, n_views, 127, 127, 3)
        n_batchsize = tf.shape(X_preprocessed)[0]

        # switch batch <-> nviews
        X_preprocessed = tf.transpose(X_preprocessed, [1, 0, 2, 3, 4])
        # encoder
        print("encoder")
        if self.params["TRAIN"]["ENCODER_MODE"] == "DILATED":
            en = encoder.Dilated_Encoder(X_preprocessed)
        elif self.params["TRAIN"]["ENCODER_MODE"] == "RESIDUAL":
            en = encoder.Residual_Encoder(X_preprocessed)
        elif self.params["TRAIN"]["ENCODER_MODE"] == "SERESNET":
            en = encoder.SENet_Encoder(X_preprocessed)
        else:
            en = encoder.Simple_Encoder(X_preprocessed)
        encoded_input = en.out_tensor
        # switch batch <-> nviews (undo the transpose applied before encoding)
        encoded_input = tf.transpose(encoded_input, [1, 0, 2])
        X_preprocessed = tf.transpose(X_preprocessed, [1, 0, 2, 3, 4])

        # visualize transformation of input state to voxel
        if self.params["VIS"]["ENCODER_PROCESS"]:
            with tf.name_scope("misc"):
                feature_maps = tf.get_collection("feature_maps")
                fm_list = []
                for fm in feature_maps:
                    # One 2-D slice per collected feature map.
                    fm_slice = fm[0, 0, :, :, 0]
                    #fm_shape = fm_slice.get_shape().as_list()
                    fm_shape = tf.shape(fm_slice)
                    # Pad the second axis so the slices can be concatenated
                    # along axis 0 into one image.
                    fm_slice = tf.pad(fm_slice,
                                      [[0, 0], [127 - fm_shape[0], 0]])
                    fm_list.append(fm_slice)
                fm_img = tf.concat(fm_list, axis=0)
                tf.summary.image("feature_map_list",
                                 tf.expand_dims(tf.expand_dims(fm_img, -1), 0))

        # recurrent_module
        print("recurrent_module")
        with tf.name_scope("Recurrent_module"):
            rnn_mode = self.params["TRAIN"]["RNN_MODE"]
            n_cell = self.params["TRAIN"]["RNN_CELL_NUM"]
            n_hidden = self.params["TRAIN"]["RNN_HIDDEN_SIZE"]

            if rnn_mode == "LSTM":
                rnn = recurrent_module.LSTM_Grid(initializer=init)
                # LSTM state is a (hidden, cell) pair of zero grids.
                hidden_state = (
                    tf.zeros([n_batchsize, n_cell, n_cell, n_cell, n_hidden],
                             name="zero_hidden_state"),
                    tf.zeros([n_batchsize, n_cell, n_cell, n_cell, n_hidden],
                             name="zero_cell_state"))
            else:
                rnn = recurrent_module.GRU_Grid(initializer=init)
                hidden_state = tf.zeros(
                    [n_batchsize, n_cell, n_cell, n_cell, n_hidden],
                    name="zero_hidden_state")

            #n_timesteps = self.params["TRAIN"]["TIME_STEP_COUNT"]
            # NOTE(review): np.shape() of a TF tensor probably does not yield a
            # plain Python int here (the placeholder dims are all None), so the
            # static branch below may never be taken - confirm.
            n_timesteps = np.shape(X_preprocessed)[1]
            # feed a limited sequence of images
            if isinstance(n_timesteps, int) and n_timesteps > 0:
                for t in range(n_timesteps):
                    hidden_state = rnn.call(encoded_input[:, t, :],
                                            hidden_state)
            else:  # feed an arbitrary sequence of images
                n_timesteps = tf.shape(X_preprocessed)[1]

                # Loop counter for the dynamic unroll.
                t = tf.constant(0)

                def condition(h, t):
                    return tf.less(t, n_timesteps)

                def body(h, t):
                    h = rnn.call(encoded_input[:, t, :], h)
                    t = tf.add(t, 1)
                    return h, t

                hidden_state, t = tf.while_loop(condition, body,
                                                (hidden_state, t))

        # decoder
        print("decoder")
        # For the LSTM case only the first tuple element (created above as
        # "zero_hidden_state") is passed on to the decoder.
        if isinstance(hidden_state, tuple):
            hidden_state = hidden_state[0]
        if self.params["TRAIN"]["DECODER_MODE"] == "DILATED":
            de = decoder.Dilated_Decoder(hidden_state)
        elif self.params["TRAIN"]["DECODER_MODE"] == "RESIDUAL":
            de = decoder.Residual_Decoder(hidden_state)
        elif self.params["TRAIN"]["DECODER_MODE"] == "RESIDUAL64":
            de = decoder.Residual_Decoder64(hidden_state)
        elif self.params["TRAIN"]["DECODER_MODE"] == "SERESNET":
            de = decoder.SENet_Decoder(hidden_state)
        elif self.params["TRAIN"]["DECODER_MODE"] == "SERESNET64":
            de = decoder.SENet_Decoder64(hidden_state)
        else:
            de = decoder.Simple_Decoder(hidden_state)
        self.logits = de.out_tensor

        # visualize transformation of hidden state to voxel
        if self.params["VIS"]["DECODER_PROCESS"]:
            with tf.name_scope("misc"):
                feature_voxels = tf.get_collection("feature_voxels")
                fv_list = []
                for fv in feature_voxels:
                    fv_slice = fv[0, :, :, 0, 0]
                    # Static shape is used here (unlike the encoder
                    # visualisation above, which uses tf.shape).
                    fv_shape = fv_slice.get_shape().as_list()
                    if "64" in self.params["TRAIN"]["DECODER_MODE"]:
                        fv_slice = tf.pad(fv_slice,
                                          [[0, 0], [64 - fv_shape[0], 0]])
                    else:
                        fv_slice = tf.pad(fv_slice,
                                          [[0, 0], [32 - fv_shape[0], 0]])
                    fv_list.append(fv_slice)
                fv_img = tf.concat(fv_list, axis=0)
                tf.summary.image("feature_voxel_list",
                                 tf.expand_dims(tf.expand_dims(fv_img, -1), 0))

        # loss
        print("loss")
        if self.params["TRAIN"]["LOSS_FCN"] == "FOCAL_LOSS":
            voxel_loss = loss.Focal_Loss(self.Y_onehot, self.logits)
            # The focal loss object exposes its prediction as `pred`.
            self.softmax = voxel_loss.pred
        elif self.params["TRAIN"]["LOSS_FCN"] == "WEIGHTED_SOFTMAX":
            voxel_loss = loss.Weighted_Voxel_Softmax(self.Y_onehot,
                                                     self.logits)
            self.softmax = voxel_loss.softmax
        elif self.params["TRAIN"]["LOSS_FCN"] == "SOFTMAX":
            voxel_loss = loss.Voxel_Softmax(self.Y_onehot, self.logits)
            self.softmax = voxel_loss.softmax
        else:
            # NOTE(review): os.abort() terminates the process immediately
            # instead of raising; callers cannot catch a bad LOSS_FCN.
            print("WRONG LOSS FUNCTION. CHECK LOSS")
            os.abort()
        self.loss = voxel_loss.loss
        tf.summary.scalar("loss", self.loss)

        # misc
        print("misc")
        with tf.name_scope("misc"):
            self.step_count = tf.Variable(0,
                                          trainable=False,
                                          name="step_count")
            # `t` is the time-step counter left over from whichever recurrent
            # branch ran above.
            self.print = tf.Print(self.loss, [self.step_count, self.loss, t])

        # optimizer
        print("optimizer")
        if self.params["TRAIN"]["OPTIMIZER"] == "ADAM":
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.LR,
                epsilon=self.params["TRAIN"]["ADAM_EPSILON"])
            #learning_rate=self.params["TRAIN"]["ADAM_LEARN_RATE"], epsilon=self.params["TRAIN"]["ADAM_EPSILON"])
            # Reads the optimizer's private `_lr` attribute for the summary.
            tf.summary.scalar("adam_learning_rate", optimizer._lr)
        else:
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=self.LR)
            #learning_rate=self.params["TRAIN"]["GD_LEARN_RATE"])
            # Reads the optimizer's private `_learning_rate` attribute.
            tf.summary.scalar("learning_rate", optimizer._learning_rate)

        grads_and_vars = optimizer.compute_gradients(self.loss)
        # Applying gradients also increments step_count (passed as global_step).
        self.apply_grad = optimizer.apply_gradients(
            grads_and_vars, global_step=self.step_count)

        # metric
        print("metrics")
        with tf.name_scope("metrics"):
            Y = tf.argmax(self.Y_onehot, -1)
            predictions = tf.argmax(self.softmax, -1)
            acc, acc_op = tf.metrics.accuracy(Y, predictions)
            rms, rms_op = tf.metrics.root_mean_squared_error(
                self.Y_onehot, self.softmax)
            iou, iou_op = tf.metrics.mean_iou(Y, predictions, 2)
            # Single op that updates all three streaming metrics.
            self.metrics_op = tf.group(acc_op, rms_op, iou_op)

        tf.summary.scalar("accuracy", acc)
        tf.summary.scalar("rmse", rms)
        tf.summary.scalar("iou", iou)

        # initialize
        # config=tf.ConfigProto(log_device_placement=True)
        print("setup")
        self.summary_op = tf.summary.merge_all()
        self.sess = tf.InteractiveSession()
        if self.params["MODE"] == "DEBUG":
            # Wrap the session so runs are reported to a TensorBoard debugger
            # at this hard-coded host:port.
            self.sess = tf_debug.TensorBoardDebugWrapperSession(
                self.sess,
                "nat-oitwireless-inside-vapornet100-c-15126.Princeton.EDU:6064"
            )

        # summaries
        print("summaries")
        if self.params["MODE"] == "TEST":
            self.test_writer = tf.summary.FileWriter(
                "{}/test".format(self.MODEL_DIR), self.sess.graph)
        else:
            self.train_writer = tf.summary.FileWriter(
                "{}/train".format(self.MODEL_DIR), self.sess.graph)
            self.val_writer = tf.summary.FileWriter(
                "{}/val".format(self.MODEL_DIR), self.sess.graph)

        # initialize
        print("initialize")
        tf.global_variables_initializer().run()
        # Run the local initializer too; the tf.metrics.* ops above are the
        # likely source of local variables.
        tf.local_variables_initializer().run()
        print('trainable vars:', len(tf.trainable_variables()))
        print("...done!")

コード例 #11

ファイルを表示

def run_model(model, raw_cohort, delta_encoder):
    """
    Run the given model using the given cohort and the experimental settings contained in FLAGS.

    This function:
    (1) prints descriptive statistics about the raw cohort
    (2) balances the dataset
    (3) splits the cohort into training:development:testing sets at the patient-level
    (4) trains PRONTO and saves checkpoints/summaries for TensorBoard
    (5) evaluates PRONTO on the development and testing sets after every epoch
    (6) optionally prints/saves the metrics of the best epoch (as measured by development F2)

    :param model: an instantiated PRONTO model
    :type model: modeling.PRONTOModel
    :param raw_cohort: the cohort to use for this experimental run
    :type raw_cohort: preprocess.Cohort
    :param delta_encoder: encoder used to represent elapsed time deltas
    :type delta_encoder: preprocess.DeltaEncoder
    :return: nothing
    :raises NotImplementedError: if FLAGS.optimizer is neither 'PRONTO' nor 'BERT'
    """

    # Import the submodule explicitly: a bare `import scipy` does not guarantee that `scipy.stats` is loaded.
    import scipy.stats

    snapshot_sizes = [len(snapshot)
                      for chronology in raw_cohort.chronologies()
                      for snapshot in chronology.snapshots]
    print('Statistics on snapshot sizes:', scipy.stats.describe(snapshot_sizes))

    # Deltas are treated as seconds here; convert the per-chronology total to days
    days_til_onset = [sum(chronology.deltas) / 60 / 60 / 24 for chronology in raw_cohort.chronologies()]
    print('Statistics on days until disease onset:', scipy.stats.describe(days_til_onset))

    elapsed_times = [delta / 60 / 60 / 24
                     for chronology in raw_cohort.chronologies()
                     for delta in chronology.deltas]
    print('Statistics on elapsed time:', scipy.stats.describe(elapsed_times))

    lengths = [len(chronology) for chronology in raw_cohort.chronologies()]
    print('Statistics on chronology lengths:', scipy.stats.describe(lengths))

    # Balance the cohort to have an even number of positive/negative chronologies for each patient
    cohort = raw_cohort.balance_chronologies()

    # Split into training:development:testing
    train, devel, test = make_train_devel_test_split(cohort.patients(), FLAGS.tdt_ratio)

    # Save summaries and checkpoints into the directories passed to the script
    model_file = 'ln=%d_delta=%s_d=%.2f_vd=%.2f_lr=%g_bs=%d' % (
        1 if FLAGS.rnn_layer_norm else 0,
        'disc' if FLAGS.use_discrete_deltas else 'tanh',
        FLAGS.dropout,
        FLAGS.vocab_dropout,
        FLAGS.learning_rate,
        FLAGS.batch_size,
    )
    model_summaries_dir = os.path.join(FLAGS.output_dir, FLAGS.optimizer, FLAGS.rnn_cell_type,
                                       FLAGS.snapshot_encoder, model_file)
    model_checkpoint_dir = os.path.join(FLAGS.output_dir, FLAGS.optimizer, FLAGS.rnn_cell_type,
                                        FLAGS.snapshot_encoder, model_file, 'pronto_model')

    # Clear any previous summaries/checkpoints if asked
    if FLAGS.clear_prev:
        nio.delete_dir_quiet(model_summaries_dir)
        nio.delete_dir_quiet(model_checkpoint_dir)
        print('Deleted previous model summaries/checkpoints')

    # Make output directories so we don't blow up when saving
    nio.make_dirs_quiet(model_checkpoint_dir)

    # Instantiate PRONTO optimizer and summarizer classes
    if FLAGS.optimizer == 'PRONTO':
        optimizer = optimization.PRONTOOptimizer(model, learning_rate=FLAGS.learning_rate, sparse=True)
    elif FLAGS.optimizer == 'BERT':
        # The BERT-style schedule needs the total number of training steps up front
        epoch_steps = len(cohort[train].make_epoch_batches(batch_size=FLAGS.batch_size,
                                                           max_snapshot_size=FLAGS.max_snapshot_size,
                                                           max_chrono_length=FLAGS.max_chrono_length,
                                                           delta_encoder=delta_encoder))
        optimizer = optimization.BERTOptimizer(model,
                                               num_train_steps=epoch_steps * FLAGS.num_epochs,
                                               num_warmup_steps=epoch_steps * 3,
                                               init_lr=FLAGS.learning_rate)
        print('Created BERT-like optimizer with initial learning rate of %f' % FLAGS.learning_rate)
    else:
        raise NotImplementedError('No optimizer available for %s' % FLAGS.optimizer)

    # noinspection PyUnboundLocalVariable
    summarizer = summarization.PRONTOSummarizer(model, optimizer)

    # Now that everything has been defined in TensorFlow's computation graph, initialize our model saver
    saver = tf.train.Saver(tf.global_variables())

    # Development/testing batches are always drawn from this first balanced cohort, while the training cohort is
    # re-sampled after every epoch
    first_cohort = cohort

    def format_metrics(metrics: dict):
        """Render every metric value with four decimals for the per-epoch log line."""
        return dict((key, '%6.4f' % value) for key, value in metrics.items())

    # Tell TensorFlow to wake up and get ready to rumble
    with tf.Session() as sess:

        # If we specified a TensorBoard debug server, connect to it
        # (this is actually pretty sweet but you have to manually step through your model's flow so 99% of the time
        # you shouldn't need it)
        if FLAGS.debug is not None:
            sess = tf_debug.TensorBoardDebugWrapperSession(sess, FLAGS.debug)

        # Create our summary writer (used by TensorBoard)
        summary_writer = tf.summary.FileWriter(model_summaries_dir, sess.graph)

        # Restore model if it exists (and we didn't clear it), otherwise create a shiny new one
        # NOTE(review): model_checkpoint_dir is used both as a directory here and as the save-path *prefix* in
        # saver.save() below; verify that the checkpoint state file is actually written where this lookup expects it.
        checkpoint = tf.train.get_checkpoint_state(model_checkpoint_dir)
        if checkpoint and gfile.Exists(checkpoint.model_checkpoint_path + '.index'):
            print("Reading model parameters from '%s'..." % checkpoint.model_checkpoint_path)
            saver.restore(sess, checkpoint.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters...")
            sess.run(tf.global_variables_initializer())

        # Initialize local variables (these are just used for computing average metrics)
        sess.run(tf.local_variables_initializer())

        # Create a progress logger to monitor training (this is a wrapped version of range())
        with trange(FLAGS.num_epochs, desc='Training') as train_log:
            # Save the training, development, and testing metrics for our best model (as measured by devel F2)
            # I'm lazy so I initialize best_devel_metrics with a zero F2 so I can compare the first iteration to it
            best_train_metrics, best_devel_metrics, best_test_metrics = {}, {'F2': 0}, {}
            # Iterate over training epochs
            for i in train_log:
                # Get global step and reset training metrics
                global_step, _ = sess.run([optimizer.global_step, summarizer.train.reset_op])
                # Log our progress on the current epoch using tqdm cohort.make_epoch_batches shuffles the order of
                # chronologies and prepares them  into mini-batches with zero-padding if needed
                total_loss = 0.
                batches = cohort[train].make_epoch_batches(batch_size=FLAGS.batch_size,
                                                           max_snapshot_size=FLAGS.max_snapshot_size,
                                                           max_chrono_length=FLAGS.max_chrono_length,
                                                           delta_encoder=delta_encoder)
                num_batches = len(batches)
                with tqdm(batches, desc='Epoch %d' % (i + 1)) as batch_log:
                    # Iterate over each batch
                    for j, batch in enumerate(batch_log):
                        # We train the model by evaluating the optimizer's training op. At the same time we update the
                        # training metrics and get metrics/summaries for the current batch and request the new global
                        # step number (used by TensorBoard to coordinate metrics across different runs)
                        _, batch_summary, batch_metrics, global_step = sess.run(
                            [[optimizer.train_op, summarizer.train.metric_ops],  # All fetches we aren't going to read
                             summarizer.batch_summary, summarizer.batch_metrics,
                             optimizer.global_step],
                            batch.feed(model, training=True))

                        # Update tqdm progress indicator with current training metrics on this batch
                        batch_log.set_postfix(batch_metrics)

                        # Save batch-level summaries
                        summary_writer.add_summary(batch_summary, global_step=global_step)

                        total_loss += batch_metrics['Loss']

                # Save epoch-level training metrics and summaries
                train_metrics, train_summary = sess.run([summarizer.train.metrics, summarizer.train.summary])
                train_metrics['Loss'] = total_loss / num_batches
                summary_writer.add_summary(train_summary, global_step=global_step)

                # Re-sample chronologies in cohort
                cohort = raw_cohort.balance_chronologies()

                # Evaluate development performance
                sess.run(summarizer.devel.reset_op)
                # Update local variables used to compute development metrics as we process each batch
                for devel_batch in first_cohort[devel].make_epoch_batches(batch_size=FLAGS.batch_size,
                                                                          max_snapshot_size=FLAGS.max_snapshot_size,
                                                                          max_chrono_length=FLAGS.max_chrono_length,
                                                                          delta_encoder=delta_encoder):
                    sess.run([summarizer.devel.metric_ops], devel_batch.feed(model, training=False))
                # Compute the development metrics
                devel_metrics, devel_summary = sess.run([summarizer.devel.metrics, summarizer.devel.summary])
                # Update training progress bar to indicate current performance on development set
                train_log.set_postfix(devel_metrics)
                # Save TensorBoard summary
                summary_writer.add_summary(devel_summary, global_step=global_step)

                train_log.write('Epoch %d. Train: %s | Devel: %s' % (i + 1,
                                                                     format_metrics(train_metrics),
                                                                     format_metrics(devel_metrics)))

                # Evaluate testing performance exactly as described above for development
                sess.run(summarizer.test.reset_op)
                for batch in first_cohort[test].make_epoch_batches(batch_size=FLAGS.batch_size,
                                                                   max_snapshot_size=FLAGS.max_snapshot_size,
                                                                   max_chrono_length=FLAGS.max_chrono_length,
                                                                   delta_encoder=delta_encoder):
                    sess.run([summarizer.test.metrics, summarizer.test.metric_ops], batch.feed(model, training=False))
                test_metrics, test_summary = sess.run([summarizer.test.metrics, summarizer.test.summary])
                summary_writer.add_summary(test_summary, global_step=global_step)

                # If this run did better on the dev set, save it as the new best model
                if devel_metrics['F2'] > best_devel_metrics['F2']:
                    best_devel_metrics = devel_metrics
                    best_train_metrics = train_metrics
                    best_test_metrics = test_metrics
                    # Save the model
                    saver.save(sess, model_checkpoint_dir, global_step=global_step)
                elif FLAGS.early_term:
                    # Stop at the first epoch that fails to improve development F2
                    tqdm.write('Early termination!')
                    break

        print('Training complete!')

        if FLAGS.print_performance:
            print('Train: %s' % str(best_train_metrics))
            print('Devel: %s' % str(best_devel_metrics))
            print('Test: %s' % str(best_test_metrics))

        if FLAGS.save_tabbed_results:
            with open(os.path.join(model_summaries_dir, 'results.tsv'), 'w') as outfile:
                print_table_results(best_train_metrics, best_devel_metrics, best_test_metrics, 'simple',
                                    file=outfile)

        if FLAGS.save_latex_results:
            with open(os.path.join(model_summaries_dir, 'results.tex'), 'w') as outfile:
                print_table_results(best_train_metrics, best_devel_metrics, best_test_metrics, 'latex_booktabs',
                                    file=outfile)

コード例 #12

ファイルを表示

def main(_):
    """Train a small MNIST classifier, optionally under a tfdbg debug wrapper.

    Builds the input pipelines and a two-layer dense network, then alternates
    an accuracy evaluation on the test iterator with one training step on the
    train iterator for ``FLAGS.max_steps`` iterations. The cross-entropy is
    deliberately written in a numerically unstable form (see the comment in the
    ``cross_entropy`` scope) so that inf/nan values show up in the debugger.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If both ``FLAGS.debug`` and
            ``FLAGS.tensorboard_debug_address`` are set.
    """
    # Local import so this function stays self-contained; only needed to close
    # the descriptor returned by tempfile.mkstemp() below.
    import os

    # Import data
    if FLAGS.fake_data:
        imgs = tf.random.uniform(maxval=256,
                                 shape=(10, 28, 28),
                                 dtype=tf.int32)
        labels = tf.random.uniform(maxval=10, shape=(10, ), dtype=tf.int32)
        mnist_train = imgs, labels
        mnist_test = imgs, labels
    else:
        mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()

    def format_example(imgs, labels):
        """Flatten images to 784 floats in [0, 1] and one-hot encode the labels."""
        imgs = tf.reshape(imgs, [-1, 28 * 28])
        imgs = tf.cast(imgs, tf.float32) / 255.0
        labels = tf.one_hot(labels, depth=10, dtype=tf.float32)
        return imgs, labels

    ds_train = tf.data.Dataset.from_tensor_slices(mnist_train)
    ds_train = ds_train.shuffle(1000, seed=RAND_SEED).repeat().batch(
        FLAGS.train_batch_size)
    ds_train = ds_train.map(format_example)
    it_train = ds_train.make_initializable_iterator()

    # The whole test set is served as one repeated element.
    ds_test = tf.data.Dataset.from_tensors(mnist_test).repeat()
    ds_test = ds_test.map(format_example)
    it_test = ds_test.make_initializable_iterator()

    sess = tf.InteractiveSession()

    # Create the MNIST neural network graph.

    # Input placeholders.
    with tf.name_scope("input"):
        # A string handle lets the same graph read from either iterator,
        # selected per run() call through feed_dict.
        handle = tf.placeholder(tf.string, shape=())

        iterator = tf.data.Iterator.from_string_handle(
            handle, (tf.float32, tf.float32),
            ((None, IMAGE_SIZE * IMAGE_SIZE), (None, 10)))

        x, y_ = iterator.get_next()

    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1, seed=RAND_SEED)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def nn_layer(input_tensor,
                 input_dim,
                 output_dim,
                 layer_name,
                 act=tf.nn.relu):
        """Reusable code for making a simple neural net layer."""
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.name_scope("weights"):
                weights = weight_variable([input_dim, output_dim])
            with tf.name_scope("biases"):
                biases = bias_variable([output_dim])
            with tf.name_scope("Wx_plus_b"):
                preactivate = tf.matmul(input_tensor, weights) + biases

            activations = act(preactivate)
            return activations

    hidden = nn_layer(x, IMAGE_SIZE**2, HIDDEN_SIZE, "hidden")
    logits = nn_layer(hidden, HIDDEN_SIZE, NUM_LABELS, "output", tf.identity)
    y = tf.nn.softmax(logits)

    with tf.name_scope("cross_entropy"):
        # The following line is the culprit of the bad numerical values that appear
        # during training of this graph. Log of zero gives inf, which is first seen
        # in the intermediate tensor "cross_entropy/Log:0" during the 4th run()
        # call. A multiplication of the inf values with zeros leads to nans,
        # which is first in "cross_entropy/mul:0".
        #
        # You can use the built-in, numerically-stable implementation to fix this
        # issue:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)

        diff = -(y_ * tf.log(y))
        with tf.name_scope("total"):
            cross_entropy = tf.reduce_mean(diff)

    with tf.name_scope("train"):
        train_step = tf.train.AdamOptimizer(
            FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope("accuracy"):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess.run(tf.global_variables_initializer())
    sess.run(it_train.initializer)
    sess.run(it_test.initializer)
    train_handle = sess.run(it_train.string_handle())
    test_handle = sess.run(it_test.string_handle())

    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive.")
    if FLAGS.debug:
        if FLAGS.use_random_config_path:
            # mkstemp() returns an already-open OS-level descriptor; only the
            # path is needed, so close the descriptor instead of leaking it.
            config_fd, config_file_path = tempfile.mkstemp(".tfdbg_config")
            os.close(config_fd)
        else:
            config_file_path = None
        sess = tf_debug.LocalCLIDebugWrapperSession(
            sess, ui_type=FLAGS.ui_type, config_file_path=config_file_path)
    elif FLAGS.tensorboard_debug_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, FLAGS.tensorboard_debug_address)

    # At this point, sess is a debug wrapper around the actual Session if
    # FLAGS.debug is true. In that case, calling run() will launch the CLI.
    for i in range(FLAGS.max_steps):
        acc = sess.run(accuracy, feed_dict={handle: test_handle})
        print("Accuracy at step %d: %s" % (i, acc))

        sess.run(train_step, feed_dict={handle: train_handle})

コード例 #13

ファイルを表示

def main(config):
    """Train a small MNIST classifier whose loss is intentionally unstable.

    The graph is a two-layer dense network followed by a softmax. The cross
    entropy is computed by hand as ``-(labels * log(softmax))`` so that
    inf/nan values show up during training; the session can be wrapped in a
    tfdbg CLI session or a TensorBoard debugger session to inspect them.

    Args:
      config: object providing the attributes read below: ``data_dir``,
        ``fake_data``, ``batch_size``, ``image_size``, ``num_classes``,
        ``hidden_size``, ``seed``, ``learning_rate``, ``debug``,
        ``ui_type``, ``tensorboard_debug_address`` and ``max_steps``.

    Raises:
      ValueError: if both ``config.debug`` and
        ``config.tensorboard_debug_address`` are set.
    """
    dataset = input_data.read_data_sets(config.data_dir,
                                        one_hot=True,
                                        fake_data=config.fake_data)

    def make_feed(train):
        """Build the placeholder feed for a training or an evaluation run."""
        if not (train or config.fake_data):
            # Evaluation on real data uses the complete test split.
            return {inputs: dataset.test.images, targets: dataset.test.labels}
        batch_x, batch_y = dataset.train.next_batch(
            config.batch_size, fake_data=config.fake_data)
        return {inputs: batch_x, targets: batch_y}

    sess = tf.InteractiveSession()

    # ---- Graph construction -------------------------------------------
    with tf.name_scope("input"):
        inputs = tf.placeholder(
            tf.float32, [None, config.image_size**2], name="x-input")
        targets = tf.placeholder(
            tf.float32, [None, config.num_classes], name="y-input")

    hidden_init = tf.initializers.truncated_normal(stddev=0.1,
                                                   seed=config.seed)
    hidden_out = tf.layers.dense(inputs,
                                 config.hidden_size,
                                 activation=tf.nn.relu,
                                 kernel_initializer=hidden_init,
                                 name="hidden")
    logits_init = tf.initializers.truncated_normal(stddev=0.1,
                                                   seed=config.seed)
    logits = tf.layers.dense(hidden_out,
                             config.num_classes,
                             kernel_initializer=logits_init,
                             name="logits")
    probs = tf.nn.softmax(logits)

    with tf.name_scope("cross_entropy"):
        # This hand-written cross entropy is the source of the bad numerical
        # values seen while training this graph. The log of zero is inf,
        # which first appears in the intermediate tensor
        # "cross_entropy/Log:0" during the 4th run() call. Multiplying those
        # inf values by zeros produces nans, first seen in
        # "cross_entropy/mul:0".
        #
        # The built-in, numerically stable implementation avoids the issue:
        #   diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
        log_probs = tf.log(probs)
        weighted = targets * log_probs
        per_entry_loss = -weighted
        with tf.name_scope("total"):
            cross_entropy = tf.reduce_mean(per_entry_loss)

    with tf.name_scope("train"):
        optimizer = tf.train.AdamOptimizer(config.learning_rate)
        train_step = optimizer.minimize(cross_entropy)

    with tf.name_scope("accuracy"):
        with tf.name_scope("correct_prediction"):
            predicted_class = tf.argmax(probs, 1)
            true_class = tf.argmax(targets, 1)
            correct_prediction = tf.equal(predicted_class, true_class)
        with tf.name_scope("accuracy"):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    sess.run(tf.global_variables_initializer())

    # ---- Optional debugger wrapping -----------------------------------
    if config.debug:
        if config.tensorboard_debug_address:
            raise ValueError(
                "The --debug and --tensorboard_debug_address config are mutually exclusive."
            )
        sess = tf_debug.LocalCLIDebugWrapperSession(sess,
                                                    ui_type=config.ui_type)
    elif config.tensorboard_debug_address:
        sess = tf_debug.TensorBoardDebugWrapperSession(
            sess, config.tensorboard_debug_address)

    # At this point, sess is a debug wrapper around the actual Session when
    # one of the debug options is set; with config.debug, calling run()
    # launches the CLI.
    for step in range(config.max_steps):
        test_accuracy = sess.run(accuracy, feed_dict=make_feed(False))
        print("Accuracy at step %d: %s" % (step, test_accuracy))

        sess.run(train_step, feed_dict=make_feed(True))

コード例 #14

ファイルを表示

def train():
    """Fill a replay memory with random rollouts, then train on Pendulum-v0.

    The function
      1. creates the Gym environment, a replay ``Memory`` and the ``Policy`` /
         ``Value`` agents,
      2. runs 10000 random-action steps to populate the memory,
      3. for each of 100 episodes: acts with the policy for 200 steps while
         storing transitions, then performs 50 critic/policy updates on
         sampled batches and refreshes both target networks,
      4. writes per-episode scalar summaries (episode length and reward) to
         ``./train/<timestamp>/``.

    A TensorBoard debugger wrapper session is attached at "Vader:6007" and is
    used for a single fetch of the policy's trainable parameters.

    Returns:
      None.
    """
    gamma = 0.99
    episodes = 100
    batch_size = 128
    max_time_steps = 200
    episode_reward = 0
    reward_history = []
    env = gym.make('Pendulum-v0')
    obs_old = env.reset()
    memory = Memory(limit=int(1e6),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    agent_policy = Policy(env, "policy")
    agent_critic = Value(env, "value")

    # Initial random rollouts to gather data for the replay memory.
    for _ in range(10000):
        action = env.action_space.sample()
        obs, rew, done, _info = env.step(action)
        memory.append(obs_old, action, rew, obs, done)
        # env.reset() returns the first observation of the new episode; the
        # next stored transition has to start from it rather than from the
        # terminal observation of the finished episode.
        obs_old = env.reset() if done else obs

    time_t = max_time_steps
    # The summarised values change every episode, so they are fed through
    # placeholders. Passing plain Python numbers to tf.summary.scalar would
    # freeze them into graph constants.
    time_steps_ph = tf.placeholder(tf.float32,
                                   shape=(),
                                   name="episode_time_steps_value")
    reward_ph = tf.placeholder(tf.float32,
                               shape=(),
                               name="episode_reward_value")
    tf.summary.scalar("episode_time_steps", time_steps_ph)
    tf.summary.scalar("episode_reward", reward_ph)
    merged = tf.summary.merge_all()

    with tf.Session() as sess:
        sess1 = tf_debug.TensorBoardDebugWrapperSession(sess, "Vader:6007")
        from datetime import datetime
        now = datetime.now()
        train_writer = tf.summary.FileWriter(
            './train/' + now.strftime("%Y%m%d-%H%M%S") + '/', sess.graph)
        try:
            agent_critic.create_target(0.1)
            agent_policy.create_target(0.1)
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess1.run(agent_policy.get_trainable_parameters())

            for i in range(episodes):
                print('running episode:', i)
                t = 0

                # Roll out the current policy. The loop always runs for
                # max_time_steps steps, even if the environment reports
                # `done` earlier.
                while t < max_time_steps:
                    action = agent_policy.predict(obs_old)
                    action = action.reshape(-1)
                    obs, rew, done, info = env.step(action)
                    episode_reward += rew
                    memory.append(obs_old, action, rew, obs, done)

                    if done or t == max_time_steps - 1:
                        time_t = t
                        reward_history.append(episode_reward)
                        episode_reward = 0
                        # Continue from the fresh initial observation.
                        obs_old = env.reset()
                    else:
                        obs_old = obs
                    t += 1

                # Off-policy updates on batches sampled from the memory.
                for steps in range(50):
                    batch = memory.sample(batch_size)
                    obs_batch = batch['obs0']
                    obs_batch -= np.mean(obs_batch, 0)
                    # NOTE(review): this divides by the variance, not the
                    # standard deviation, and is undefined for a
                    # zero-variance column - confirm this is intended.
                    obs_batch = obs_batch / np.var(obs_batch, 0)

                    agent_policy.act_as_target = True
                    action_batch_predict = agent_policy.predict(obs_batch)[0]
                    agent_policy.act_as_target = False

                    agent_critic.act_as_target = True
                    value_batch = agent_critic.predict(
                        obs_batch, action_batch_predict)[0]
                    agent_critic.act_as_target = False

                    # NOTE(review): the bootstrap value is computed from the
                    # sampled 'obs0' observations themselves, not from
                    # successor observations - confirm this is intended.
                    y = np.array(batch['rewards']) + gamma * np.array(
                        value_batch)
                    agent_critic.update_value(obs=obs_batch,
                                              action=action_batch_predict,
                                              target=y)
                    q_grad = np.array(
                        agent_critic.get_q_gradient(
                            action_batch_predict, obs_batch)).reshape(
                                -1, env.action_space.shape[0])
                    agent_policy.optimize_policy(q_grad, obs_batch)
                    agent_critic.update_target()
                    agent_policy.update_target()

                print(reward_history[-1])
                summary = sess.run(merged,
                                   feed_dict={
                                       time_steps_ph: float(time_t),
                                       reward_ph: float(reward_history[-1]),
                                   })
                train_writer.add_summary(summary, i)
                episode_reward = 0
        finally:
            # Flush pending events and release the event file.
            train_writer.close()

コード例 #15

ファイルを表示

ファイル: agent.py プロジェクト: danielpalen/pysc2-rl-agents

    def __init__(self, policy, args):
        """Build the A2C graph, restore or initialise variables, bind callables.

        A TensorFlow session is created (optionally wrapped in a tfdbg CLI
        session or a TensorBoard debugger session). Two models are
        instantiated from ``policy``: a step model (``nbatch=nenvs``,
        ``nsteps=1``) and a train model built with ``reuse=True``. The loss is
        ``policy_loss + value_loss * value_loss_weight - entropy *
        entropy_weight`` and is minimised with RMSProp using an exponentially
        decayed learning rate and gradient clipping. If a checkpoint is found
        under ``args.ckpt_path`` it is restored; otherwise the global
        variables are initialised.

        The instance receives the attributes ``train``, ``step``,
        ``get_value``, ``save``, ``initial_state``, ``get_global_step`` and
        ``train_step``.

        Args:
          policy: callable invoked as ``policy(sess, ob_space=..., nbatch=...,
            nsteps=..., reuse=..., data_format=...)``.
          args: object providing ``nhwc``, ``value_loss_weight``,
            ``entropy_weight``, ``lr``, ``max_to_keep``, ``envs``,
            ``steps_per_batch``, ``res``, ``ckpt_path``, ``summary_writer``,
            ``debug`` and ``tensorboard_debug_address``.

        Raises:
          ValueError: if both ``args.debug`` and
            ``args.tensorboard_debug_address`` are set.
        """

        network_data_format = 'NHWC' if args.nhwc else 'NCHW'
        value_loss_weight = args.value_loss_weight
        entropy_weight = args.entropy_weight
        learning_rate = args.lr
        max_to_keep = args.max_to_keep
        nenvs = args.envs
        nsteps = args.steps_per_batch
        res = args.res
        checkpoint_path = args.ckpt_path
        summary_writer = args.summary_writer
        debug = args.debug
        debug_tb_adress = args.tensorboard_debug_address

        print('\n### A2C Agent #######')
        print(f'# policy = {policy}')
        print(f'# network_data_format = {network_data_format}')
        print(f'# value_loss_weight = {value_loss_weight}')
        print(f'# entropy_weight = {entropy_weight}')
        print(f'# learning_rate = {learning_rate}')
        print(f'# max_to_keep = {max_to_keep}')
        print(f'# nenvs = {nenvs}')
        print(f'# nsteps = {nsteps}')
        print(f'# res = {res}')
        print(f'# checkpoint_path = {checkpoint_path}')
        print(f'# debug = {debug}')
        print(f'# debug_tb_adress = {debug_tb_adress}')
        print('######################\n')

        max_gradient_norm = 1.0

        tf.reset_default_graph()
        config = tf.ConfigProto()
        # Allocate GPU memory on demand instead of reserving it all up front.
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

        if debug and debug_tb_adress:
            raise ValueError(
                "The --debug and --tensorboard_debug_address flags are mutually "
                "exclusive.")
        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        elif debug_tb_adress:
            sess = tf_debug.TensorBoardDebugWrapperSession(
                sess, debug_tb_adress)

        nbatch = nenvs * nsteps
        ch = get_input_channels()
        ob_space = {
            'screen': [None, res, res, ch['screen']],
            'minimap': [None, res, res, ch['minimap']],
            'flat': [None, ch['flat']],
            'available_actions': [None, ch['available_actions']]
        }

        # The step model acts on one step per environment; the train model
        # consumes whole batches of nenvs * nsteps samples.
        step_model = policy(sess,
                            ob_space=ob_space,
                            nbatch=nenvs,
                            nsteps=1,
                            reuse=None,
                            data_format=network_data_format)
        train_model = policy(sess,
                             ob_space=ob_space,
                             nbatch=nbatch,
                             nsteps=nsteps,
                             reuse=True,
                             data_format=network_data_format)

        # Define placeholders
        fn_id = tf.placeholder(tf.int32, [None], name='fn_id')
        arg_ids = {
            k: tf.placeholder(tf.int32, [None], name='arg_{}_id'.format(k.id))
            for k in train_model.policy[1].keys()
        }
        ACTIONS = (fn_id, arg_ids)
        ADVS = tf.placeholder(tf.float32, [None], name='adv')
        RETURNS = tf.placeholder(tf.float32, [None], name='returns')

        # Define Loss
        log_probs = compute_policy_log_probs(train_model.AV_ACTS,
                                             train_model.policy, ACTIONS)
        policy_loss = -tf.reduce_mean(ADVS * log_probs)
        value_loss = tf.reduce_mean(
            tf.square(RETURNS - train_model.value) / 2.)
        entropy = compute_policy_entropy(train_model.AV_ACTS,
                                         train_model.policy, ACTIONS)
        loss = policy_loss + value_loss * value_loss_weight - entropy * entropy_weight

        # Define Optimizer
        global_step = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                                   10000, 0.94)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              decay=0.99,
                                              epsilon=1e-5)
        # learning_rate=None because the optimizer instance already carries
        # its (decayed) learning rate.
        train_op = layers.optimize_loss(loss=loss,
                                        global_step=global_step,
                                        optimizer=optimizer,
                                        clip_gradients=max_gradient_norm,
                                        learning_rate=None,
                                        name="train_op")

        tf.summary.scalar('entropy', entropy)
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('loss/policy', policy_loss)
        tf.summary.scalar('loss/value', value_loss)
        tf.summary.scalar('rl/value', tf.reduce_mean(train_model.value))
        tf.summary.scalar('rl/returns', tf.reduce_mean(RETURNS))
        tf.summary.scalar('rl/advs', tf.reduce_mean(ADVS))
        summary_writer.add_graph(sess.graph)
        variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        saver = tf.train.Saver(variables, max_to_keep=max_to_keep)
        train_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        train_summary_op = tf.summary.merge(train_summaries)

        # Load the latest checkpoint if one exists. get_checkpoint_state
        # returns None when the directory holds no checkpoint state (e.g. a
        # freshly created, empty directory), so the result is checked before
        # it is dereferenced.
        ckpt = None
        if os.path.exists(checkpoint_path):
            ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt is not None and ckpt.model_checkpoint_path:
            # Checkpoint files are saved as "model.ckpt-<global_step>".
            self.train_step = int(ckpt.model_checkpoint_path.split('-')[-1])
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Loaded agent at episode {} (step {})".format(
                self.train_step // nsteps, self.train_step))
        else:
            self.train_step = 0
            sess.run(tf.variables_initializer(variables))

        def train(obs, states, actions, returns, advs, summary=False):
            """Run one optimisation step on a batch.

            Args:
              obs: dict of preprocessed observation arrays, with num_batch elements
                in the first dimensions.
              states: value fed to ``train_model.STATES`` (recurrent
                policies), or None to skip that feed.
              actions: see `compute_total_log_probs`.
              returns: array of shape [num_batch].
              advs: array of shape [num_batch].
              summary: Whether to return a summary.

            Returns:
              ``(global_step, loss, Summary)`` when ``summary`` is true,
              otherwise None.
            """
            feed_dict = {
                train_model.SCREEN: obs['screen'],
                train_model.MINIMAP: obs['minimap'],
                train_model.FLAT: obs['flat'],
                train_model.AV_ACTS: obs['available_actions'],
                RETURNS: returns,
                ADVS: advs,
                ACTIONS[0]: actions[0]
            }
            feed_dict.update({v: actions[1][k] for k, v in ACTIONS[1].items()})
            if states is not None:  # For recurrent polices
                feed_dict.update({train_model.STATES: states})

            self.train_step += 1

            if summary:
                _, _step, _loss, _summary = sess.run(
                    [train_op, global_step, loss, train_summary_op],
                    feed_dict=feed_dict)
                return _step, _loss, _summary
            else:
                sess.run([train_op, loss], feed_dict=feed_dict)

        def save(path, step=None):
            """Save a checkpoint under ``path``, tagged with the global step.

            ``step`` is accepted for interface compatibility but is not used;
            the graph's ``global_step`` variable determines the file suffix.
            """
            os.makedirs(path, exist_ok=True)
            print("Saving agent to %s, step %d" %
                  (path, sess.run(global_step)))
            ckpt_path = os.path.join(path, 'model.ckpt')
            saver.save(sess, ckpt_path, global_step=global_step)

        def get_global_step():
            """Return the current value of the ``global_step`` variable."""
            return sess.run(global_step)

        self.train = train
        self.step = step_model.step
        self.get_value = step_model.get_value
        self.save = save
        self.initial_state = step_model.initial_state
        self.get_global_step = get_global_step

コード例 #16

ファイルを表示

ファイル: base_model_bert.py プロジェクト: ShiKe-And-His-Friends/tBERT

def model(data_dict, opt, logfile=None, print_dim=False):
    """
    Creates and executes Tensorflow graph for BERT-based models

    Arguments:
    data_dict -- contains all necessary data for model

    opt -- option log, contains learning_rate, num_epochs, minibatch_size, ...
    logfile -- path of file to save opt and results
    print_dim -- print dimensions for debugging purposes

    Returns:
    opt -- updated option log
    parameters -- trained parameters of model
    """

    #####
    # Read options, set defaults and update log
    #####

    try:
        # check input options
        print(opt)
        test_opt(opt)
        if opt.get('git', None) is None:
            add_git_version(opt)  # keep track of git SHA

        # assign variables
        opt['model'] = opt.get('model', 'bert')
        assert 'bert' in opt['model']
        learning_rate = opt['learning_rate'] = opt.get(
            'learning_rate',
            5e-5)  # small learning rate for pretrained BERT layers
        speedup_new_layers = opt['speedup_new_layers'] = opt.get(
            'speedup_new_layers', False)
        freeze_thaw_tune = opt['freeze_thaw_tune'] = opt.get(
            'freeze_thaw_tune', False)
        layer_specific_lr = speedup_new_layers or freeze_thaw_tune
        num_epochs = opt.get('num_epochs', None)  # get num of planned epochs
        opt['num_epochs'] = 0  # use this to keep track of finished epochs
        minibatch_size = opt['minibatch_size'] = opt.get('minibatch_size', 64)
        bert_embd = True
        bert_update = opt['bert_update'] = opt.get('bert_update', False)
        bert_large = opt['bert_large'] = opt.get('bert_large', False)
        cased = opt['bert_cased'] = opt.get('bert_cased', False)
        starter_seed = opt['seed'] = opt.get('seed', None)
        if not type(starter_seed) == int:
            assert starter_seed == None
        # layers = opt['layers'] = opt.get('layers', 1)
        hidden_layer = opt['hidden_layer'] = opt.get(
            'hidden_layer', 0)  # add hidden layer before softmax layer?
        assert hidden_layer in [0, 1, 2]
        topic_encoder = opt['topic_encoder'] = opt.get('topic_encoder', None)
        L_R_unk = opt.get('unk_sub', False)
        assert L_R_unk is False
        # assert encoder in ['word', 'ffn', 'cnn', 'lstm', 'bilstm', 'word+cnn', 'word+ffn', 'word+lstm', 'word+bilstm']
        assert topic_encoder in [None, 'ffn', 'cnn', 'lstm', 'bilstm']
        optimizer_choice = opt['optimizer'] = opt.get(
            'optimizer', 'Adadelta')  # which optimiser to use?
        assert optimizer_choice in ['Adam', 'Adadelta']
        epsilon = opt['epsilon'] = opt.get('epsilon', 1e-08)
        rho = opt['rho'] = opt.get('rho', 0.95)
        L2 = opt['L2'] = opt.get('L2', 0)  # L2 regularisation
        dropout = opt['dropout'] = opt.get('dropout', 0)
        assert not (
            L2 > 0 and dropout > 0
        ), 'Use dropout or L2 regularisation, not both. Current settings: L2={}, dropout={}.'.format(
            L2, dropout)
        sparse = opt['sparse_labels'] = opt.get(
            'sparse_labels', True)  # are labels encoded as sparse?
        save_checkpoints = opt.get('checkpoints',
                                   False)  # save all checkpoints?
        stopping_criterion = opt['stopping_criterion'] = opt.get(
            'stopping_criterion',
            None)  # which metric should be used as early stopping criterion?
        assert stopping_criterion in [None, 'cost', 'MAP', 'F1', 'Accuracy']
        if stopping_criterion is None and num_epochs is None:
            raise ValueError(
                'Invalid parameter combination. Stopping criterion and number of epochs cannot both be None.'
            )
        early_stopping = stopping_criterion in [
            'F1', 'cost', 'MAP', 'Accuracy'
        ]
        predict_every_epoch = opt['predict_every_epoch'] = opt.get(
            'predict_every_epoch', False)
        reduction_factor = opt['hidden_reduce'] = opt.get('hidden_reduce', 2)
        patience = opt['patience'] = opt.get('patience', 20)
        # topic models
        topic_scope = opt['topic'] = opt.get('topic', '')
        if opt['model'] == 'bert_simple_topic':
            assert topic_scope in ['word', 'doc']
        elif opt['model'] == 'bert':
            topic_scope = ''
        else:
            raise NotImplementedError()
        module_name = "src.models.forward.{}".format(opt['model'])
        model = importlib.import_module(module_name)
        if 'word' in topic_scope:
            topic_update = opt['topic_update'] = opt.get(
                'topic_update', False)  # None for backward compatibility
        num_topics = opt['num_topics'] = opt.get('num_topics', None)
        topic_type = opt['topic_type'] = opt.get('topic_type', None)
        if not topic_scope == '':
            assert 'topic' in opt['model']
            assert num_topics > 1
            assert topic_type in ['LDA', 'ldamallet', 'gsdmm']
            opt['topic_alpha'] = opt.get('topic_alpha', 50)
        else:
            assert num_topics is None
            assert topic_type is None
        if opt['dataset'] == 'Quora' and opt['subsets'] == [
                'train', 'dev', 'test', 'p_test'
        ]:
            extra_test = True
        else:
            extra_test = False
        injection_location = opt['injection_location'] = opt.get(
            'injection_location', None)
        if 'inject' in opt['model']:
            assert str(injection_location) in [
                'embd', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
                '11'
            ]
        else:
            assert injection_location is None
        # gpu settings
        gpu = opt.get('gpu', -1)

        # general settings
        session_config = tf.ConfigProto()
        if not gpu == -1:
            print('Running on GPU: {}'.format(gpu))
            os.environ["CUDA_VISIBLE_DEVICES"] = str(
                gpu)  # specifies which GPU to use (if multiple are available)

        ops.reset_default_graph(
        )  # to be able to rerun the model without overwriting tf variables

        if not starter_seed == None:
            random.seed(starter_seed
                        )  # use starter seed to set seed for random library
        seed_list = [random.randint(1, 100000) for i in range(100)
                     ]  # generate list of seeds to be used in the model

        np.random.seed(seed_list.pop(0))
        tf.set_random_seed(
            seed_list.pop(0))  # set tensorflow seed to keep results consistent

        #####
        # unpack data and assign to model variables
        #####

        assert data_dict.get('embd', None) is None  # (565852, 200)

        if 'word' in topic_scope:
            topic_embd = data_dict['word_topics'].get('topic_matrix',
                                                      None)  #topic_emb.shape

        # assign word ids
        if extra_test:
            ID1_train, ID1_dev, ID1_test, ID1_test_extra = data_dict['ID1']
            ID2_train, ID2_dev, ID2_test, ID2_test_extra = data_dict['ID2']
        else:
            ID1_train, ID1_dev, ID1_test = data_dict['ID1']
            ID2_train, ID2_dev, ID2_test = data_dict['ID2']
        train_dict, dev_dict, test_dict, test_dict_extra = extract_data(
            data_dict, topic_scope, extra_test)

        #####
        # check input dimensions
        #####

        if sparse:
            classes = 2
        else:
            classes = train_dict['Y'].shape[1]
        (m, sentence_length_1) = train_dict['E1'].shape
        # (m, sentence_length_2) = train_dict['E2'].shape

        #####
        # Define Tensorflow graph
        #####

        # Create Placeholders and initialise weights of the correct shape
        X1, X1_mask, X1_seg, Y = create_placeholders([sentence_length_1, None],
                                                     classes,
                                                     bicnn=True,
                                                     sparse=sparse,
                                                     bert=bert_embd)

        # Create topic placeholders
        print('Topic scope: {}'.format(topic_scope))
        if 'doc' in topic_scope:
            D_T1, D_T2 = create_doc_topic_placeholders(num_topics)
        else:
            D_T1, D_T2 = None, None
        if 'word' in topic_scope:
            W_T_embedded = None
            (m, sentence_length_1) = train_dict['W_T1'].shape
            (m, sentence_length_2) = train_dict['W_T2'].shape
            W_T1, W_T2 = create_word_topic_placeholders(
                [sentence_length_1, sentence_length_2])
        else:
            W_T1_embedded, W_T2_embedded, W_T_embedded = None, None, None

        # tensors for feed_dict
        bert_inputs = dict(input_ids=X1,
                           input_mask=X1_mask,
                           segment_ids=X1_seg)
        maybe_print([X1], ['input ids'], True)

        dropout_prob = tf.placeholder_with_default(0.0,
                                                   name='dropout_rate',
                                                   shape=())

        # load and lookup BERT
        BERT_version = get_bert_version(cased, bert_large)
        BERT_URL = 'https://tfhub.dev/google/bert_{}/1'.format(BERT_version)
        print('Loading pretrained model from {}'.format(BERT_URL))
        bert_lookup = hub.Module(BERT_URL,
                                 name='bert_lookup',
                                 trainable=bert_update)
        X_embedded = bert_lookup(
            bert_inputs, signature="tokens", as_dict=True
        )  # important to use tf. 1.11 as tf 1.7 will produce error for sess.run(X_embedded)
        # X_embedded has 2 keys:
        # pooled_output is [batch_size, hidden_size] -->output embedding for each token
        # sequence_output is [batch_size, sequence_length, hidden_size] -->output embedding for the entire sequence

        # Create topic embedding matrix
        if 'word' in topic_scope:
            topic_vocabulary_size, topic_dim = topic_embd.shape
            # assert(topic_vocabulary_size==embd_vocabulary_size) # currently using the same id to index topic and embd matrix
            topic_embedding_matrix = initialise_pretrained_embedding(
                topic_vocabulary_size,
                topic_dim,
                topic_embd,
                name='word_topics',
                trainable=topic_update)
            # Lookup topic embedding
            W_T1_embedded = lookup_embedding(W_T1,
                                             topic_embedding_matrix,
                                             expand=False,
                                             transpose=False,
                                             name='topic_lookup_L')
            W_T2_embedded = lookup_embedding(W_T2,
                                             topic_embedding_matrix,
                                             expand=False,
                                             transpose=False,
                                             name='topic_lookup_R')

        # Forward propagation: Build forward propagation as tensorflow graph
        input_dict = {
            'E1': X_embedded,
            'E2': None,
            'D_T1': D_T1,
            'D_T2': D_T2,
            'W_T1': W_T1_embedded,
            'W_T2': W_T2_embedded,
            'W_T': W_T_embedded
        }
        forward_pass = model.forward_propagation(input_dict, classes,
                                                 hidden_layer,
                                                 reduction_factor,
                                                 dropout_prob, seed_list,
                                                 print_dim)
        logits = forward_pass['logits']

        with tf.name_scope('cost'):
            # Cost function: Add cost function to tensorflow graph
            main_cost = compute_cost(logits, Y, loss_fn='bert')
            cross_entropy_scalar = tf.summary.scalar('cross_entropy',
                                                     main_cost)
            cost = main_cost
            cost_summary = tf.summary.merge([cross_entropy_scalar])

        # Backpropagation: choose training regime (creates tensorflow optimizer which minimizes the cost).

        if layer_specific_lr:
            learning_rate_old_layers = tf.placeholder_with_default(
                0.0, name='learning_rate_old', shape=())
            learning_rate_new_layers = tf.placeholder_with_default(
                0.0, name='learning_rate_new', shape=())
            train_step = layer_specific_regime(optimizer_choice, cost,
                                               learning_rate_old_layers,
                                               learning_rate_new_layers,
                                               epsilon, rho)
        else:
            learning_rate_old_layers = tf.placeholder_with_default(
                0.0, name='learning_rate', shape=())
            learning_rate_new_layers = None
            train_step = standard_training_regime(optimizer_choice, cost,
                                                  learning_rate_old_layers,
                                                  epsilon, rho)

        # Prediction and Evaluation tensors

        with tf.name_scope('evaluation_metrics'):
            predicted_label = tf.argmax(
                logits, 1, name='predict'
            )  # which column is the one with the highest activation value?
            if sparse:
                actual_label = Y
            else:
                actual_label = tf.argmax(Y, 1)
            conf_scores = get_confidence_scores(logits, False)
            maybe_print(
                [predicted_label, actual_label, conf_scores],
                ['Predicted label', 'Actual label', 'Confidence Scores'],
                False)

            # create streaming metrics: http://ronny.rest/blog/post_2017_09_11_tf_metrics/
            streaming_accuracy, streaming_accuracy_update = tf.metrics.accuracy(
                labels=actual_label, predictions=predicted_label)
            label_idx = tf.expand_dims(tf.where(tf.not_equal(Y, 0))[:, 0],
                                       0,
                                       name='label_idx')
            rank_scores = tf.expand_dims(get_confidence_scores(logits),
                                         0,
                                         name='rank_scores')
            maybe_print([label_idx, rank_scores],
                        ['Label index', 'Rank scores'], False)
            streaming_map, streaming_map_update = tf.metrics.average_precision_at_k(
                label_idx, rank_scores, 10)
            # fixed NaN for examples without relevant docs by editing .virtualenvs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/metrics_impl.py line 2796
            # return math_ops.div(precision_sum, num_relevant_items + 1e-11, name=scope)
            streaming_recall, streaming_recall_update = tf.contrib.metrics.streaming_recall(
                predictions=predicted_label, labels=actual_label)
            streaming_precision, streaming_precision_update = tf.contrib.metrics.streaming_precision(
                predictions=predicted_label, labels=actual_label)
            eps = 1e-11  # prevent division by zero
            streaming_f1 = 2 * (streaming_precision * streaming_recall) / (
                streaming_precision + streaming_recall + eps)
            # create and merge summaries
            accuracy_scalar = tf.summary.scalar('Accuracy', streaming_accuracy)
            recall_scalar = tf.summary.scalar('Recall', streaming_recall)
            precision_scalar = tf.summary.scalar('Precision',
                                                 streaming_precision)
            f1_scalar = tf.summary.scalar('F1', streaming_f1)
            map_scalar = tf.summary.scalar('MAP', streaming_map)
            eval_summary = tf.summary.merge([
                accuracy_scalar, recall_scalar, precision_scalar, f1_scalar,
                map_scalar
            ])

        def predict(sess, subset, writer, epoch, ignore_MAP, topic_scope,
                    layer_specific_lr):
            '''
            Run inference on `subset` in minibatches without computing any metrics.

            Predicting batch by batch keeps memory use bounded for large datasets or complex models.
            :param sess: session used to run the graph
            :param subset: data dict handed to create_minibatches
            :param writer: not used by this function
            :param epoch: not used by this function
            :param ignore_MAP: if truthy the placeholder metric list has 4 entries, otherwise 5
            :param topic_scope: checked for the substrings/items 'doc' and 'word' to decide which topic inputs are fed
            :param layer_specific_lr: if truthy, learning_rate_new_layers is fed as well
            :return: (confidence_scores, predictions, None, eval_metrics); eval_metrics contains only None placeholders
            '''
            batch_size = 10
            batches = create_minibatches(subset,
                                         batch_size,
                                         sparse=sparse,
                                         random=False,
                                         topic_scope=topic_scope)
            # reset local variables (they back the streaming metrics)
            sess.run(tf.local_variables_initializer())

            predictions, confidence_scores = [], []
            for batch in batches:
                # learning rates and dropout are fed as 0: nothing is trained here
                feed_dict = {
                    X1: batch['E1'],
                    X1_mask: batch['E1_mask'],
                    X1_seg: batch['E1_seg'],
                    Y: batch['Y'],
                    learning_rate_old_layers: 0,
                    dropout_prob: 0,
                }
                if layer_specific_lr:
                    feed_dict[learning_rate_new_layers] = 0
                if 'doc' in topic_scope:
                    feed_dict.update({
                        D_T1: batch['D_T1'],
                        D_T2: batch['D_T2'],
                    })
                if 'word' in topic_scope:
                    feed_dict.update({
                        W_T1: batch['W_T1'],
                        W_T2: batch['W_T2'],
                    })
                # Only labels and confidences are fetched; the streaming metric
                # update ops are deliberately left out.
                batch_pred, batch_conf = sess.run(
                    [predicted_label, conf_scores], feed_dict=feed_dict)
                predictions.extend(batch_pred)
                confidence_scores.extend(batch_conf)

            # same length as the metric list of predict_eval, but without values
            eval_metrics = [None] * (4 if ignore_MAP else 5)
            return confidence_scores, predictions, None, eval_metrics

        def predict_eval(sess, subset, writer, epoch, ignore_MAP, topic_scope,
                         layer_specific_lr):
            '''
            Predict on `subset` in minibatches and evaluate cost and streaming metrics.

            Predicting batch by batch prevents out of memory errors (for large datasets or complex models).
            :param sess: session used to run the graph
            :param subset: data dict handed to create_minibatches
            :param writer: summary writer receiving the cost and evaluation summaries, or None to skip writing summaries
            :param epoch: step value attached to the written summaries
            :param ignore_MAP: if truthy, the MAP update op is not run and MAP is left out of eval_metrics
            :param topic_scope: checked for 'doc' and 'word' to decide which topic inputs are fed
            :param layer_specific_lr: if truthy, learning_rate_new_layers is fed as well
            :return: (confidence_scores, predictions, minibatch_cost, eval_metrics) where minibatch_cost is the
                     mean cost over the minibatches of `subset` (0. if there are none) and eval_metrics is
                     [acc, prec, rec, f_1] followed by MAP unless ignore_MAP is set
            '''
            predictions = []
            confidence_scores = []
            minibatch_size = 10
            minibatches = create_minibatches(subset,
                                             minibatch_size,
                                             sparse=sparse,
                                             random=False,
                                             topic_scope=topic_scope)
            sess.run(tf.local_variables_initializer())  # for streaming metrics

            # Fetched for every minibatch. The merged evaluation summary is NOT
            # fetched here because evaluating it per batch would mess up the
            # streaming metrics; only their update ops are run.
            batch_fetches = [
                predicted_label, conf_scores, cost, cost_summary,
                streaming_accuracy_update, streaming_recall_update,
                streaming_precision_update
            ]
            if not ignore_MAP:
                batch_fetches.append(streaming_map_update)

            # The cost is averaged over the batches actually processed, so the
            # value is a true per-batch mean for whichever subset is evaluated.
            total_cost = 0.
            num_minibatches = 0

            for minibatch in minibatches:
                feed_dict = {
                    X1: minibatch['E1'],
                    X1_mask: minibatch['E1_mask'],
                    X1_seg: minibatch['E1_seg'],
                    # X2: minibatch['E2'],
                    Y: minibatch['Y'],
                    learning_rate_old_layers: 0,
                    dropout_prob: 0
                }  # don't use dropout during prediction
                if layer_specific_lr:
                    feed_dict[learning_rate_new_layers] = 0
                if 'doc' in topic_scope:
                    feed_dict[D_T1] = minibatch['D_T1']
                    feed_dict[D_T2] = minibatch['D_T2']
                if 'word' in topic_scope:
                    feed_dict[W_T1] = minibatch['W_T1']
                    feed_dict[W_T2] = minibatch['W_T2']
                # Run the session to execute the prediction and evaluation, the feed dict contains one minibatch.
                results = sess.run(batch_fetches, feed_dict=feed_dict)
                pred, conf, batch_cost, cost_summ = results[:4]
                predictions.extend(pred)
                confidence_scores.extend(conf)
                if writer is not None:  # callers pass None when logging is disabled
                    writer.add_summary(cost_summ, epoch)
                total_cost += batch_cost
                num_minibatches += 1

            minibatch_cost = total_cost / num_minibatches if num_minibatches else 0.

            # Read the accumulated metric values once after all batches were seen.
            metric_fetches = [
                eval_summary, streaming_accuracy, streaming_recall,
                streaming_precision, streaming_f1
            ]
            if not ignore_MAP:
                metric_fetches.append(streaming_map)
            results = sess.run(metric_fetches)
            eval_summ, acc, rec, prec, f_1 = results[:5]
            eval_metrics = [acc, prec, rec, f_1]
            if not ignore_MAP:
                eval_metrics.append(results[5])  # MAP is always the last entry
            if writer is not None:
                writer.add_summary(eval_summ, epoch)
            return confidence_scores, predictions, minibatch_cost, eval_metrics

        def training_loop(sess,
                          train_dict,
                          dev_dict,
                          test_dict,
                          train_writer,
                          dev_writer,
                          opt,
                          dropout,
                          seed_list,
                          num_epochs,
                          early_stopping,
                          optimizer,
                          lr_bert,
                          lr_new,
                          layer_specific_lr,
                          stopping_criterion='MAP',
                          patience=patience,
                          topic_scope=None,
                          predict_every_epoch=False):
            '''
            Trains the model epoch by epoch until one of the stopping conditions is met.

            :param sess: session used to run the graph
            :param train_dict: training data, handed to create_minibatches and predict_eval
            :param dev_dict: development data, handed to predict_eval
            :param test_dict: test data, only evaluated when predict_every_epoch is set
            :param train_writer: summary writer handed to predict_eval for the train set
            :param dev_writer: summary writer handed to predict_eval for the dev set
            :param opt: option dict; opt['num_epochs'] is read as the number of epochs already trained
                        and is overwritten before returning
            :param dropout: value fed to dropout_prob during training
            :param seed_list: list of seeds; one seed is popped from the front for every epoch
                              (the caller's list is consumed)
            :param num_epochs: training stops when the epoch counter equals this value
            :param early_stopping: if truthy, checkpoint every improved model, stop once `patience` epochs
                                   passed since the best epoch and restore the best checkpoint at the end
            :param optimizer: training op that is run for every minibatch
            :param lr_bert: value fed to learning_rate_old_layers
            :param lr_new: value fed to learning_rate_new_layers (only if layer_specific_lr is set)
            :param layer_specific_lr: whether learning_rate_new_layers is fed
            :param stopping_criterion: 'cost', 'MAP', 'F1' or 'Accuracy'; selects the dev value used for early stopping
            :param patience: number of epochs after the best epoch before early stopping triggers
            :param topic_scope: checked for 'doc' and 'word' to decide which topic inputs are fed
            :param predict_every_epoch: if truthy, additionally evaluate the test set and write predictions
                                        and metrics before training and after every epoch except the last
            :return: [opt, epoch]
            '''

            def log_epoch_predictions(opt, epoch, dev_scores, dev_pred,
                                      dev_metrics):
                # Evaluate the test set for `epoch`, write dev/test predictions
                # and metrics, and return the updated option dict.
                print('Predicting for epoch {}'.format(epoch))
                test_scores, test_pred, _, test_metrics = predict_eval(
                    sess, test_dict, test_writer, epoch,
                    skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr)
                output_predictions(ID1_dev, ID2_dev, dev_scores, dev_pred,
                                   'dev_{}'.format(epoch), opt)
                opt = save_eval_metrics(
                    dev_metrics, opt, 'dev',
                    'score_{}'.format(epoch))  # log dev metrics
                output_predictions(ID1_test, ID2_test, test_scores, test_pred,
                                   'test_{}'.format(epoch), opt)
                opt = save_eval_metrics(
                    test_metrics, opt, 'test',
                    'score_{}'.format(epoch))  # log test metrics
                write_log_entry(opt, 'data/logs/' + logfile)
                return opt

            # dev_metrics = [acc, prec, rec, f_1, ma_p]
            metric_index = {'MAP': -1, 'F1': 3, 'Accuracy': 0}

            if predict_every_epoch:
                # evaluate the model as it is before this training phase starts
                epoch = opt['num_epochs']
                sess.run(tf.local_variables_initializer())
                _, _, _, train_metrics = predict_eval(
                    sess, train_dict, train_writer, epoch,
                    skip_MAP(train_dict['E1']), topic_scope, layer_specific_lr)
                dev_scores, dev_pred, _, dev_metrics = predict_eval(
                    sess, dev_dict, dev_writer, epoch,
                    skip_MAP(dev_dict['E1']), topic_scope, layer_specific_lr)
                opt = log_epoch_predictions(opt, epoch, dev_scores, dev_pred,
                                            dev_metrics)

            epoch = opt[
                'num_epochs'] + 1  # continue counting after freeze epochs
            best_dev_value = None
            best_dev_round = 0
            ep = 'num_epochs'

            while True:
                print('Epoch {}'.format(epoch))
                minibatch_cost = 0.
                minibatches = create_minibatches(train_dict,
                                                 minibatch_size,
                                                 seed_list.pop(0),
                                                 sparse=sparse,
                                                 random=True,
                                                 topic_scope=topic_scope)

                for minibatch in minibatches:
                    feed_dict = {
                        X1: minibatch['E1'],
                        X1_mask: minibatch['E1_mask'],
                        X1_seg: minibatch['E1_seg'],
                        # X2: minibatch['E2'],
                        Y: minibatch['Y'],
                        learning_rate_old_layers: lr_bert,
                        dropout_prob: dropout
                    }
                    if layer_specific_lr:
                        feed_dict[learning_rate_new_layers] = lr_new
                    if 'doc' in topic_scope:
                        feed_dict[D_T1] = minibatch['D_T1']
                        feed_dict[D_T2] = minibatch['D_T2']
                    if 'word' in topic_scope:
                        feed_dict[W_T1] = minibatch['W_T1']
                        feed_dict[W_T2] = minibatch['W_T2']
                    # IMPORTANT: The line that runs the graph on a minibatch.
                    # Run the session to execute the optimizer and the cost, the feed dict contains one minibatch.
                    _, temp_cost = sess.run([optimizer, cost],
                                            feed_dict=feed_dict)
                    # Accumulate the epoch cost so that a NaN batch cost
                    # propagates into the NaN stop check below.
                    # NOTE(review): assumes `cost` evaluates to a scalar - confirm.
                    minibatch_cost += temp_cost

                # write summaries and checkpoints (only when logging is enabled)
                if logfile is not None:
                    sess.run(tf.local_variables_initializer())
                    _, _, _, train_metrics = predict_eval(
                        sess, train_dict, train_writer, epoch,
                        skip_MAP(train_dict['E1']), topic_scope,
                        layer_specific_lr)
                    dev_scores, dev_pred, dev_cost, dev_metrics = predict_eval(
                        sess, dev_dict, dev_writer, epoch,
                        skip_MAP(dev_dict['E1']), topic_scope,
                        layer_specific_lr)
                    if predict_every_epoch and epoch != num_epochs:
                        opt = log_epoch_predictions(opt, epoch, dev_scores,
                                                    dev_pred, dev_metrics)
                    # use cost or other metric as early stopping criterion;
                    # stopping_metric is always "lower is better"
                    stopping_metric = None
                    if stopping_criterion == 'cost':
                        stopping_metric = dev_cost
                        print("Dev {} after epoch {}: {}".format(
                            stopping_criterion, epoch, stopping_metric))
                    elif stopping_criterion in metric_index:
                        if stopping_criterion == 'MAP':
                            assert len(dev_metrics
                                       ) == 5  # X1_dev must have 10 * x examples
                        current_result = dev_metrics[
                            metric_index[stopping_criterion]]
                        print("Dev {} after epoch {}: {}".format(
                            stopping_criterion, epoch, current_result))
                        stopping_metric = 1 - current_result  # dev error
                    if early_stopping:
                        if stopping_metric is None:
                            raise ValueError(
                                'Unsupported stopping criterion for early '
                                'stopping: {}'.format(stopping_criterion))
                        # save checkpoint for first or better models
                        if (best_dev_value is None) or (stopping_metric <
                                                        best_dev_value):
                            best_dev_value = stopping_metric
                            best_dev_round = epoch
                            save_model(opt, saver, sess, epoch)  # save model
                            opt = save_eval_metrics(
                                train_metrics, opt,
                                'train')  # update train metrics in log
                # check stopping criteria
                # stop training if predefined number of epochs reached
                if epoch == num_epochs:
                    if early_stopping:
                        print(
                            'Maximum number of epochs reached during early stopping.'
                        )
                    else:
                        print('Reached predefined number of training epochs.')
                        save_model(opt, saver, sess, epoch)  # save model
                    break
                # stop training if early stopping criterion reached
                if early_stopping and epoch >= best_dev_round + patience:
                    print(
                        'Early stopping criterion reached after training for {} epochs.'
                        .format(epoch))
                    break
                # stop training if the cost became NaN
                if math.isnan(minibatch_cost):
                    print('Cost is Nan at epoch {}!'.format(epoch))
                    break

                epoch += 1

            print('Finished training.')

            # restore weights from saved model in best epoch
            if early_stopping:
                print('Load best model from epoch {}'.format(best_dev_round))
                opt[ep] = best_dev_round
                epoch = best_dev_round  # log final predictions with correct epoch info
                load_model(opt, saver, sess,
                           best_dev_round)  # ToDo: fix Too many open files
                # clean up previous checkpoints to save space
                delete_all_checkpoints_but_best(opt, best_dev_round)
            else:
                opt[ep] = epoch
                opt = save_eval_metrics(train_metrics, opt,
                                        'train')  # log train metrics

            return opt, epoch

        # Initialize all the variables globally
        init = tf.global_variables_initializer()
        if (not logfile is None) or (early_stopping):
            saver = create_saver()
        start_time, opt = start_timer(opt, logfile)
        print('Model {}'.format(opt['id']))

        #####
        # Start session to execute Tensorflow graph
        #####

        with tf.Session(
                config=session_config
        ) as sess:  #config=tf.ConfigProto(log_device_placement=True)

            # add debugger (but not for batch experiments)
            if __name__ == '__main__' and FLAGS.debug:
                sess = tf_debug.TensorBoardDebugWrapperSession(
                    sess, "NPMacBook.local:7000")

            # Run the initialization
            sess.run(init)

            if logfile is None:
                train_writer = None
                dev_writer = None
                test_writer = None
                if extra_test:
                    test_writer_extra = None
            else:
                print('logfile: {}'.format(logfile))
                create_model_folder(opt)
                model_dir = get_model_dir(opt)
                train_writer = tf.summary.FileWriter(
                    model_dir + '/train', sess.graph)  # save graph first
                dev_writer = tf.summary.FileWriter(model_dir + '/dev')
                test_writer = tf.summary.FileWriter(model_dir + '/test')
                if extra_test:
                    test_writer_extra = tf.summary.FileWriter(model_dir +
                                                              '/test_extra')

            # additional input for predict every epoch
            if predict_every_epoch:
                td = test_dict
            else:
                td = None

            # set learning rates per layer
            if speedup_new_layers:
                lr_bert = learning_rate
                lr_new = learning_rate * 100
            else:
                lr_bert = learning_rate
                lr_new = learning_rate

            # Freeze BERT and only train new weights
            if freeze_thaw_tune:
                print('Freeze BERT and train new layers...')
                opt, epoch = training_loop(sess, train_dict, dev_dict, td,
                                           train_writer, dev_writer, opt,
                                           dropout, seed_list, num_epochs,
                                           early_stopping, train_step, 0,
                                           lr_new, layer_specific_lr,
                                           stopping_criterion, patience,
                                           topic_scope, predict_every_epoch)
                num_epochs += epoch
                lr_new = learning_rate

            # Normal Finetuning
            print('Finetune...')
            opt, epoch = training_loop(
                sess, train_dict, dev_dict, td, train_writer, dev_writer, opt,
                dropout, seed_list, num_epochs, early_stopping, train_step,
                lr_bert, lr_new, layer_specific_lr, stopping_criterion,
                patience, topic_scope, predict_every_epoch)

            # Predict + evaluate on dev and test set

            # train_scores, train_pred, _, train_metrics = predict(X1_train, X2_train, Y_train, train_writer, epoch)
            dev_scores, dev_pred, _, dev_metrics = predict_eval(
                sess, dev_dict, dev_writer, epoch, skip_MAP(dev_dict['E1']),
                topic_scope, layer_specific_lr)
            opt = save_eval_metrics(dev_metrics, opt, 'dev')
            if opt['dataset'] == 'GlueQuora':
                test_scores, test_pred, _, test_metrics = predict(
                    sess, test_dict, test_writer, epoch,
                    skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr)
            else:
                test_scores, test_pred, _, test_metrics = predict_eval(
                    sess, test_dict, test_writer, epoch,
                    skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr)
            opt = save_eval_metrics(test_metrics, opt, 'test')
            opt = end_timer(opt, start_time, logfile)
            if extra_test:
                test_scores_extra, test_pred_extra, _, test_metrics_extra = predict_eval(
                    sess, test_dict_extra, test_writer_extra, epoch,
                    skip_MAP(test_dict['E1']), topic_scope, layer_specific_lr)
                opt = save_eval_metrics(test_metrics_extra, opt, 'PAWS')

            if print_dim:
                if stopping_criterion is None:
                    stopping_criterion = 'Accuracy'
                print('Dev {}: {}'.format(
                    stopping_criterion,
                    opt['score'][stopping_criterion]['dev']))
                print('Test {}: {}'.format(
                    stopping_criterion,
                    opt['score'][stopping_criterion]['test']))

        if not logfile is None:
            # save log
            write_log_entry(opt, 'data/logs/' + logfile)

            # write predictions to file for scorer
            # output_predictions(ID1_train, ID2_train, train_scores, train_pred, 'train', opt)
            output_predictions(ID1_dev, ID2_dev, dev_scores, dev_pred, 'dev',
                               opt)
            output_predictions(ID1_test, ID2_test, test_scores, test_pred,
                               'test', opt)
            if extra_test:
                output_predictions(ID1_test_extra, ID2_test_extra,
                                   test_scores_extra, test_pred_extra,
                                   'PAWS_test', opt)
            print('Wrote predictions for model_{}.'.format(opt['id']))

            # save model
            if save_checkpoints:
                save_model(opt, saver, sess, epoch)  # save disk space

            # close all writers to prevent too many open files error
            train_writer.close()
            dev_writer.close()
            test_writer.close()
            if extra_test:
                test_writer_extra.close()

    except Exception as e:
        print("Error: {0}".format(e.__doc__))
        traceback.print_exc(file=sys.stdout)
        opt['status'] = 'Error'
        write_log_entry(opt, 'data/logs/' + logfile)

    # print('==============')

    return opt

コード例 #17

ファイルを表示

# Experiment configuration
TRAIN_NUM_ANOMALIES = 1000
TEST_NUM_ANOMALIES = 50
IMG_HGT = IMG_WDT = 28
IMG_DEPTH = 1
nClass = 2

# Fetch the training and testing splits from the createData helpers.
(trainX, trainY, train_Anomaly_X,
 train_Anomaly_Y) = createData.get_MNIST_TrainingData(NUM_NORMAL,
                                                      TRAIN_NUM_ANOMALIES)
(test_ones, label_ones, test_sevens,
 label_sevens) = createData.get_MNIST_TestingData(NUM_NORMAL,
                                                  TEST_NUM_ANOMALIES)

from src.models.OC_NN import OC_NN

ocnn = OC_NN()

nu = 0.01
NUM_EPOCHS = 100

# Make Keras use a session wrapped for the TensorBoard debugger, which is
# expected to listen on localhost:7000.
debug_session = tf_debug.TensorBoardDebugWrapperSession(
    tf.Session(), "localhost:7000")
keras.backend.set_session(debug_session)

# Fit the model, then score it on the two test sets and report the result.
ocnn.fit(trainX, nu, NUM_EPOCHS, IMG_HGT, IMG_WDT, IMG_DEPTH, nClass)
res = auc_OCNN = ocnn.score(test_ones, test_sevens)
separator = "=" * 35
print(separator)
print("AUC:", res)
print(separator)

コード例 #18

ファイルを表示

def _run_experiment(flags, exp_config):
    """Setup and execute an experiment workflow with specified options.

    Args:
        flags: Mapping of run options. Keys read here: 'logging', 'tb_debug',
            'hparams_override' (dict or its string form), 'hparams_sweep'
            and 'workflow_opts_sweep' (string-formatted dicts), 'batches',
            'checkpoint', 'checkpoint_load_scope', 'checkpoint_frozen_scope',
            'seed', 'dataset', 'workflow', 'component', 'dataset_location',
            'summarize' and 'summary_dir'. The mapping is not modified.
        exp_config: Optional experiment definition (falsy to skip). Its
            'export-options', 'workflow-options', 'classifier-options' and
            'checkpoint-options' entries override the defaults built here.

    The TensorFlow session created for the run is always closed, even when
    setting up or running the workflow raises.
    """
    util.set_logging(flags['logging'])

    TbDebug.TB_DEBUG = flags['tb_debug']

    # Get the component's default HParams, then override
    # -------------------------------------------------------------------------
    component_hparams_override = {}

    # Override that if defined using flag
    if flags['hparams_override']:
        if isinstance(flags['hparams_override'], dict):
            # Copy, so the sweep update below does not leak into the caller's
            # dict and from there into later runs that reuse the same flags.
            component_hparams_override = dict(flags['hparams_override'])
        else:
            # Unstringy the string formatted dict
            component_hparams_override = ast.literal_eval(
                flags['hparams_override'])

    # Override hparams for this sweep/run
    if flags['hparams_sweep']:
        # Unstringy the string formatted dict
        hparams_sweep = ast.literal_eval(flags['hparams_sweep'])

        # Selectively override component hparams
        component_hparams_override.update(hparams_sweep)

    # Export settings
    # -------------------------------------------------------------------------
    export_opts = {
        'export_filters': True,
        'export_checkpoint': True,
        'max_to_keep': 5,
        'interval_batches': flags['batches']
    }

    # Classifier settings
    # -------------------------------------------------------------------------
    classifier_opts = {
        'model': 'logistic',  # Options: logistic, svm
        'unit_range': False,  # Set to True if using SVM
        'interval_batches': flags['batches'],
        'hparams': {
            'logistic': {
                'C': [0.01, 0.1, 1.0, 10.0]  # Regularization
            },
            'svm': {
                'C': [1.0, 10.0, 100.0]  # Regularization
            }
        }
    }

    # Checkpoint Options
    # -------------------------------------------------------------------------
    checkpoint_opts = {
        'checkpoint_path': flags['checkpoint'],
        'checkpoint_load_scope': flags['checkpoint_load_scope'],
        'checkpoint_frozen_scope': flags['checkpoint_frozen_scope']
    }

    # OPTIONAL: Override options from an experiment definition file
    # -------------------------------------------------------------------------
    workflow_opts_override = {}

    if exp_config:
        if 'export-options' in exp_config:
            export_opts.update(exp_config['export-options'])
        if 'workflow-options' in exp_config:
            workflow_opts_override.update(exp_config['workflow-options'])
        if 'classifier-options' in exp_config:
            classifier_opts.update(exp_config['classifier-options'])
        if 'checkpoint-options' in exp_config:
            checkpoint_opts.update(exp_config['checkpoint-options'])

    # Override workflow options for this sweep/run
    if flags['workflow_opts_sweep']:
        # Unstringy the string formatted dict
        workflow_opts_sweep = ast.literal_eval(flags['workflow_opts_sweep'])

        # Selectively override workflow options
        workflow_opts_override.update(workflow_opts_sweep)

    # Training with Tensorflow
    # -------------------------------------------------------------------------
    with tf.Graph().as_default():  # pylint: disable=no-context-manager
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True  # pylint: disable=no-member
        session = tf.Session(config=config)

        # try/finally guarantees the session is released on any failure below.
        try:
            if TbDebug.TB_DEBUG:
                from tensorflow.python import debug as tf_debug
                session = tf_debug.TensorBoardDebugWrapperSession(
                    session, 'localhost:6064')
                print(
                    "Use the following command to run Tensorboard Debugger:\n'tensorboard --logdir=./ --debugger_port 6064'"
                )

            util.set_seed(flags['seed'])

            # Load relevant dataset, workflow and component modules
            dataset_class = util.get_module_class_ref(flags['dataset'])
            workflow_class = util.get_module_class_ref(flags['workflow'])
            component_class = util.get_module_class_ref(flags['component'])

            # Override workflow options
            workflow_opts = workflow_class.default_opts()
            workflow_opts.override_from_dict(workflow_opts_override)

            # Log experiment settings
            print('Dataset:', flags['dataset'])
            print('Workflow:', flags['workflow'])
            print('Component:', flags['component'], '\n')

            print('Export Options:', json.dumps(export_opts, indent=4))
            print('Workflow Options:',
                  json.dumps(workflow_opts.values(), indent=4))
            print('Classifier Options:', json.dumps(classifier_opts, indent=4))
            print('Checkpoint Options:', json.dumps(checkpoint_opts, indent=4),
                  '\n')

            # Setup Experiment Workflow
            # ---------------------------------------------------------------------
            workflow = workflow_class(session,
                                      dataset_class,
                                      flags['dataset_location'],
                                      component_class,
                                      component_hparams_override,
                                      classifier_opts,
                                      export_opts,
                                      opts=workflow_opts,
                                      summarize=flags['summarize'],
                                      seed=flags['seed'],
                                      summary_dir=flags['summary_dir'],
                                      checkpoint_opts=checkpoint_opts)
            workflow.setup()

            # Start experiment to train the model and evaluating every N batches
            # ---------------------------------------------------------------------
            workflow.run(flags['batches'],
                         evaluate=workflow_opts.evaluate,
                         train=workflow_opts.train)
        finally:
            session.close()

コード例 #19

ファイルを表示

# Load the data
train_data, train_labels, eval_data, eval_labels = load_fashion_data()
train_data = train_data.reshape(-1, 28, 28, 1)
train_labels = np_utils.to_categorical(train_labels, 10)

print(train_data.shape)

# Model with Keras: the layers are listed in order and then added one by one.
layer_stack = [
    InputLayer(input_shape=(28, 28, 1), name="1_Eingabe"),
    Conv2D(32, (2, 2),
           padding='same',
           bias_initializer=Constant(0.01),
           kernel_initializer='random_uniform',
           name="2_Conv2D"),
    Activation(activation='relu', name="3_ReLu"),
    MaxPool2D(padding='same', name="4_MaxPooling2D"),
    Conv2D(32, (2, 2),
           padding='same',
           bias_initializer=Constant(0.01),
           kernel_initializer='random_uniform',
           name="5_Conv2D"),
    Activation(activation='relu', name="6_ReLu"),
    MaxPool2D(padding='same', name="7_MaxPooling2D"),
    Flatten(),
    Dense(1024,
          activation='relu',
          bias_initializer=Constant(0.01),
          kernel_initializer='random_uniform',
          name="8_Dense"),
    Dropout(0.4, name="9_Dense"),
    Dense(10, activation='softmax', name="10_Ausgabe"),
]
for layer in layer_stack:
    model.add(layer)

model.compile(
    loss=losses.categorical_crossentropy,
    optimizer=optimizers.Adadelta(),
    metrics=["accuracy", "mse", metrics.categorical_accuracy],
)

# Make Keras use a session wrapped for the TensorBoard debugger, which is
# expected to listen on localhost:12345.
debug_session = tf_debug.TensorBoardDebugWrapperSession(
    tf.Session(), "localhost:12345")
K.set_session(debug_session)

history = model.fit(train_data,
                    train_labels,
                    batch_size=64,
                    epochs=100,
                    verbose=1,
                    validation_split=0.33)

# Optional output: the validation loss and accuracy stored in
# history.history ('val_loss', 'val_acc') can be plotted with matplotlib.

コード例 #20

ファイルを表示

def main(_):
    """Build the Mask R-CNN training graph and run the training loop.

    Creates the input placeholders, prepares the checkpoint directory, builds
    the losses and the train op, restores or initialises the variables,
    optionally wraps the session in a tfdbg wrapper, and then trains for
    ``config.max_steps`` steps with periodic logging, checkpointing and
    summary writing.

    Args:
        _: Unused.

    Raises:
        ValueError: If both ``FLAGS.debug`` and
            ``FLAGS.tensorboard_debug_address`` are set.
        RuntimeError: If loading the pre-trained weights fails.
    """
    # Validate the debugger flags before anything touches the checkpoint
    # directory, so a bad flag combination cannot delete old checkpoints.
    if FLAGS.debug and FLAGS.tensorboard_debug_address:
        raise ValueError(
            "The --debug and --tensorboard_debug_address flags are mutually "
            "exclusive")

    # Below, all coordinates except the image itself are normalized.
    # Input images [batch, height, width, channels], mean already subtracted.
    input_images = tf.placeholder(dtype=tf.float32,
                                  shape=(1, input_shape[0], input_shape[1], 3),
                                  name='input_images')
    # Ground-truth boxes, [batch, MAX_GT_INSTANCES, 4].
    gt_boxes = tf.placeholder(dtype=tf.float32,
                              shape=(1, None, 4),
                              name='gt_boxes')
    # Class ids, [batch, MAX_GT_INSTANCES].
    class_ids = tf.placeholder(dtype=tf.int32,
                               shape=(1, None),
                               name='class_ids')
    # Masks, [batch, MAX_GT_INSTANCES, height, width]; every ground-truth
    # instance needs one label and one mask.
    input_gt_mask = tf.placeholder(dtype=tf.bool,
                                   shape=(1, None, None, None),
                                   name='input_gt_mask')
    # Ground-truth anchor labels, [batch, num_anchors]; 1 = positive,
    # 0 = negative, -1 = ignored.
    rpn_binary_gt = tf.placeholder(dtype=tf.int32,
                                   shape=(1, None),
                                   name='rpn_binary_gt')
    # Regression deltas between anchors and ground truth,
    # [batch, num_anchors, (dx, dy, log(h), log(w))].
    anchor_deltas = tf.placeholder(dtype=tf.float32,
                                   shape=(1, None, 4),
                                   name='anchor_deltas')

    # Start from an empty checkpoint directory unless we are resuming.
    if not tf.gfile.Exists(config.checkpoint_path):
        tf.gfile.MakeDirs(config.checkpoint_path)
    elif not config.restore:
        tf.gfile.DeleteRecursively(config.checkpoint_path)
        tf.gfile.MakeDirs(config.checkpoint_path)

    global_step = tf.get_variable('global_step', [],
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)
    learning_rate = tf.train.exponential_decay(config.learning_rate,
                                               global_step,
                                               decay_steps=10000,
                                               decay_rate=0.94,
                                               staircase=True)
    tf.summary.scalar('lr', learning_rate)
    opt = tf.train.AdamOptimizer(learning_rate)

    mask_rcnn = MASK_RCNN()
    # build_model arguments: mode, input_image, gt_boxes=None, class_ids=None,
    # input_gt_mask=None, anchor_labels=None, anchor_deltas=None
    rpn_loss, proposal_loss, mask_loss, model_loss = mask_rcnn.build_model(
        'training', input_images, gt_boxes, class_ids, input_gt_mask,
        rpn_binary_gt, anchor_deltas)
    total_loss = model_loss + tf.add_n(
        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    batch_norm_updates_op = tf.group(
        *tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    with_clip = True
    if with_clip:
        tvars = tf.trainable_variables()
        grads, norm = tf.clip_by_global_norm(tf.gradients(total_loss, tvars),
                                             10.0)
        gradient_op = opt.apply_gradients(list(zip(grads, tvars)),
                                          global_step=global_step)
    else:
        gradient_op = opt.minimize(loss=total_loss, global_step=global_step)
    summary_op = tf.summary.merge_all()

    # Define the moving-average object.
    variable_averages = tf.train.ExponentialMovingAverage(
        config.moving_average_decay, global_step)
    # Apply the moving average to all trainable variables;
    # tf.trainable_variables() returns them as a list.
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    # The next two statements are equivalent to
    # train_op = tf.group(variables_averages_op, apply_gradient_op,
    #                     batch_norm_updates_op)
    with tf.control_dependencies(
        [variables_averages_op, gradient_op, batch_norm_updates_op]):
        train_op = tf.no_op(name='train_op')

    saver = tf.train.Saver(tf.global_variables())
    summary_writer = tf.summary.FileWriter(config.summary_path,
                                           tf.get_default_graph())

    init = tf.global_variables_initializer()

    next_batch = producer().make_one_shot_iterator().get_next()

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    gpu_options=gpu_options)

    try:
        with tf.Session(config=session_config) as sess:

            # When resuming from a previous checkpoint there is no need to
            # run the variable initializer.
            if config.restore:
                print('continue training from previous checkpoint')
                ckpt = tf.train.latest_checkpoint(config.checkpoint_path)
                saver.restore(sess, ckpt)
            elif tf.gfile.Exists(weights_path):
                sess.run(init)
                print("trying to assign pre-trained model...")
                try:
                    load_trained_weights(weights_path, sess,
                                         ignore_missing=True)
                except Exception:
                    # Raise a real exception type; under Python 3 the
                    # original error stays attached as the implicit context.
                    raise RuntimeError(
                        'loading pre-trained model failed, please check '
                        'your pretrained model {}'.format(weights_path))
                print("assign pre-trained model done!")
            else:
                print("use initial parameters")
                sess.run(init)

            if FLAGS.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(
                    sess, ui_type=FLAGS.ui_type)
            elif FLAGS.tensorboard_debug_address:
                sess = tf_debug.TensorBoardDebugWrapperSession(
                    sess, FLAGS.tensorboard_debug_address)

            start = time.time()
            avg_ml = 0.0
            avg_tl = 0.0
            # Number of successful steps since the last log line, so the
            # printed averages are correct for short or interrupted windows.
            steps_since_log = 0
            for step in range(config.max_steps):

                # Draw batches until one has at least one ground-truth box.
                while True:
                    (_, image, gt_box, gt_class, segmentation_mask,
                     anchor_labels, anchor_deltas_in) = sess.run(next_batch)
                    if gt_box.shape[1] != 0:
                        break

                feed = {
                    input_images: image,
                    gt_boxes: gt_box,
                    class_ids: gt_class,
                    input_gt_mask: segmentation_mask,
                    rpn_binary_gt: anchor_labels,
                    anchor_deltas: anchor_deltas_in,
                }

                try:
                    ml, tl, _, r_loss, p_loss, m_loss = sess.run(
                        [
                            model_loss, total_loss, train_op, rpn_loss,
                            proposal_loss, mask_loss
                        ],
                        feed_dict=feed)
                except ValueError:
                    print("maybe no gt in this step")
                    # No losses were produced for this step; skip it instead
                    # of reusing stale (or undefined) values below.
                    continue

                if np.isnan(tl):
                    print('Loss diverged, stop training')
                    break

                avg_ml += ml
                avg_tl += tl
                steps_since_log += 1

                if step % 10 == 0:
                    avg_time_per_step = ((time.time() - start) /
                                         steps_since_log)
                    print(
                        'Step {}, model loss {:.4f}, total loss {:.4f}, {:.2f} seconds/step'
                        .format(step, avg_ml / steps_since_log,
                                avg_tl / steps_since_log, avg_time_per_step))
                    start = time.time()
                    avg_ml = 0.0
                    avg_tl = 0.0
                    steps_since_log = 0

                if step % config.save_checkpoint_steps == 0:
                    filename = os.path.join(config.checkpoint_path,
                                            "model.ckpt")
                    saver.save(sess, filename, global_step=global_step)

                if step % config.save_summary_steps == 0:
                    # NOTE(review): train_op is fetched again here, so a
                    # summary step applies a second update on the same
                    # batch -- confirm this is intended.
                    _, tl, summary_str = sess.run(
                        [train_op, total_loss, summary_op], feed_dict=feed)
                    summary_writer.add_summary(summary_str, global_step=step)
    finally:
        # Flush and release the event file even if training aborts.
        summary_writer.close()

コード例 #21

ファイルを表示

def train():
  """Train a small convolutional MNIST classifier and write summaries.

  Reads MNIST from FLAGS.data_dir, optionally wraps the session in a
  TensorBoard debugger session when FLAGS.tensorboard_debug_address is set,
  builds a conv -> hidden -> dropout -> logits model, and trains it for
  FLAGS.max_steps steps. Train and test summaries are written under
  FLAGS.log_dir.
  """
  # Import data
  mnist = input_data.read_data_sets(FLAGS.data_dir,
                                    fake_data=FLAGS.fake_data)

  sess = tf.InteractiveSession()

  if FLAGS.tensorboard_debug_address:
    sess = tf_debug.TensorBoardDebugWrapperSession(
        sess, FLAGS.tensorboard_debug_address)

  # Create a multilayer model.

  # Input placeholders
  with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, [None, 784], name='x-input')
    y_ = tf.placeholder(tf.int64, [None], name='y-input')

  with tf.name_scope('input_reshape'):
    image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
    tf.summary.image('input', image_shaped_input, 10)

  with tf.name_scope('conv_layer'):
    # Convolutional Layer
    W_conv = weight_variable([5, 5, 1, 12])
    b_conv = bias_variable([12])
    conv = tf.nn.conv2d(
        image_shaped_input,
        W_conv,
        strides=[1, 1, 1, 1],
        padding='SAME')
    h = tf.nn.relu(conv + b_conv)
    pool = tf.nn.max_pool(h,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')

  pool_flattened = tf.layers.flatten(pool)
  # 14 * 14 * 12 = 2352 features after the 2x2 pooling of a 28x28x12 map.
  hidden1 = nn_layer(pool_flattened, 2352, 42, 'layer1')

  with tf.name_scope('dropout'):
    keep_prob = tf.placeholder(tf.float32)
    tf.summary.scalar('dropout_keep_probability', keep_prob)
    dropped = tf.nn.dropout(hidden1, keep_prob)

  # Do not apply softmax activation yet, see below.
  y = nn_layer(dropped, 42, 10, 'layer2', act=tf.identity)

  with tf.name_scope('cross_entropy'):
    # The raw formulation of cross-entropy,
    #
    # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
    #                               reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.losses.sparse_softmax_cross_entropy on the
    # raw logit outputs of the nn_layer above, and then average across
    # the batch.
    with tf.name_scope('total'):
      # y holds unnormalised logits (act=tf.identity), so taking tf.log(y)
      # directly would be undefined for non-positive values.
      cross_entropy = tf.losses.sparse_softmax_cross_entropy(
          labels=y_, logits=y)
  tf.summary.scalar('cross_entropy', cross_entropy)

  with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
        cross_entropy)

  with tf.name_scope('accuracy'):
    with tf.name_scope('correct_prediction'):
      correct_prediction = tf.equal(tf.argmax(y, 1), y_)
    with tf.name_scope('accuracy'):
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', accuracy)

  # Merge all the summaries and write them out to
  # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
  merged = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
  test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
  tf.global_variables_initializer().run()

  # Train the model, and also write summaries.
  # Every 10th step, measure test-set accuracy, and write test summaries
  # All other steps, run train_step on training data, & add training summaries

  def feed_dict(train):
    """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
    if train or FLAGS.fake_data:
      xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
      k = FLAGS.dropout
    else:
      xs, ys = mnist.test.images, mnist.test.labels
      k = 1.0
    return {x: xs, y_: ys, keep_prob: k}

  for i in range(FLAGS.max_steps):
    if i % 10 == 0:  # Record summaries and test-set accuracy
      summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
      test_writer.add_summary(summary, i)
      print('Accuracy at step %s: %s' % (i, acc))
    else:  # Record train set summaries, and train
      if i % 100 == 99:  # Record execution stats
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        summary, _ = sess.run([merged, train_step],
                              feed_dict=feed_dict(True),
                              options=run_options,
                              run_metadata=run_metadata)
        train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
        train_writer.add_summary(summary, i)
        print('Adding run metadata for', i)
      else:  # Record a summary
        summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
        train_writer.add_summary(summary, i)
  train_writer.close()
  test_writer.close()