Example #1
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # delete previously saved checkpoints and model
    # if os.path.exists('./checkpoints') and os.path.isdir('./checkpoints'):
    #     shutil.rmtree('./checkpoints')
    if os.path.exists(os.path.join(home, 'data', 'model')) and os.path.isdir(
            os.path.join(home, 'data', 'model')):
        shutil.rmtree(os.path.join(home, 'data', 'model'))

    # Data set sources : http://archive.ics.uci.edu/ml/datasets/ \
    # Smartphone-Based+Recognition+of+Human+Activities+and+Postural+Transitions
    # sensorData_timestamp.txt contains pre-processed data derived from the UCI dataset.
    # load dataset from DB
    mysql_to_csv(sql='Select * From sensorData',
                 file_path='./sensorData_timestamp1.csv',
                 host='163.180.117.202',
                 port=3847,
                 user='******',
                 password='******',
                 dbName='hardbnew')
    columns = [
        'user', 'activity', 'timestamp', 'acc_x-axis', 'acc_y-axis',
        'acc_z-axis', 'gyro_x-axis', 'gyro_y-axis', 'gyro_z-axis'
    ]
    df = pd.read_csv('./sensorData_timestamp1.csv',
                     header=None,
                     names=columns,
                     lineterminator='\n')
    df = df.dropna()

    step = 20
    segments = []
    labels = []
    for i in range(0, len(df) - n_time_steps, step):
        acc_xs = df['acc_x-axis'].values[i:i + n_time_steps]
        acc_ys = df['acc_y-axis'].values[i:i + n_time_steps]
        acc_zs = df['acc_z-axis'].values[i:i + n_time_steps]
        gyro_xs = df['gyro_x-axis'].values[i:i + n_time_steps]
        gyro_ys = df['gyro_y-axis'].values[i:i + n_time_steps]
        gyro_zs = df['gyro_z-axis'].values[i:i + n_time_steps]
        label = stats.mode(df['activity'][i:i + n_time_steps])[0][0]
        segments.append([acc_xs, acc_ys, acc_zs, gyro_xs, gyro_ys, gyro_zs])
        labels.append(label)

    reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(
        -1, n_time_steps, n_features)
    tmp_df = pd.get_dummies(labels)
    labels = np.asarray(tmp_df, dtype=np.float32)
    reverse_one_hot_encode = tmp_df.idxmax().reset_index().rename(columns={
        'index': 'activity',
        0: 'idx'
    })
    pickle.dump(
        reverse_one_hot_encode,
        open(os.path.join(home, 'data', 'reverse_one_hot_encode'), "wb"))

    # Data split train : test = 80 : 20
    # This split method can cause overfitting; a K-fold training scheme would be more robust.
    x_train, x_test, y_train, y_test = train_test_split(
        reshaped_segments, labels, test_size=0.2, random_state=random_seed)
    pickle.dump(x_test, open(os.path.join(home, 'data', 'x_test'), "wb"))
    pickle.dump(y_test, open(os.path.join(home, 'data', 'y_test'), "wb"))

    # Build model...
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, n_time_steps, n_features],
                           name="inputs")
        y = tf.placeholder(tf.float32, [None, n_classes], name="label")
    predict, loss = create_lstm_model(x, y)
    tf.summary.scalar("loss", loss)
    # correct_pred = tf.equal(tf.argmax(predict, 1), tf.argmax(y, 1))
    # accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    global_step = tf.train.get_or_create_global_step()
    train_op = optimizer.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=8000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
        tf.train.SummarySaverHook(save_secs=10,
                                  output_dir='/tmp/tf',
                                  summary_op=tf.summary.merge_all())
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=batch_size)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            input_batch, target = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={x: input_batch, y: target})

    # save model
    if hvd.rank() != 0:
        return
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    optGraph = optimize_for_inference_lib.optimize_for_inference(
        tf.get_default_graph().as_graph_def(), ["input/inputs"], ["y_"],
        dtypes.float32.as_datatype_enum)
    frozenGraph = freeze_graph.freeze_graph_with_def_protos(
        optGraph, None, checkpoint_file, "y_", None, None, "frozen.pb", True,
        None)
    with tf.Graph().as_default():
        importer.import_graph_def(frozenGraph, name="")
        with tf.Session() as sess:
            inputs = tf.get_default_graph().get_tensor_by_name(
                "input/inputs:0")
            model = tf.get_default_graph().get_tensor_by_name("y_:0")
            predictor = tf.argmax(model, 1, name="predictor")
            inputs_classes = tf.saved_model.utils.build_tensor_info(
                inputs)  # input
            outputs_classes = tf.saved_model.utils.build_tensor_info(
                predictor)  # output
            signature = (tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    inputs_classes
                },
                outputs={
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    outputs_classes
                },
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME))
            builder = tf.saved_model.builder.SavedModelBuilder(
                os.path.join(home, 'data', 'model'))
            legacy_init_op = tf.group(tf.tables_initializer(),
                                      name='legacy_init_op')
            builder.add_meta_graph_and_variables(
                sess, [tf.saved_model.tag_constants.SERVING],
                signature_def_map={'predict_activity': signature},
                legacy_init_op=legacy_init_op)
            builder.save()
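
Example #1 relies on a train_input_generator helper that is not shown in the listing. A minimal sketch of such a generator, assuming it only shuffles the arrays and yields fixed-size (inputs, labels) batches indefinitely, could look like this:

import numpy as np

def train_input_generator(x, y, batch_size=128):
    # Hypothetical helper: reshuffle on every pass and yield fixed-size batches forever.
    assert len(x) == len(y)
    while True:
        p = np.random.permutation(len(x))
        x, y = x[p], y[p]
        for i in range(0, len(x) - batch_size + 1, batch_size):
            yield x[i:i + batch_size], y[i:i + batch_size]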
Example #2
def main():
    print("Local rank: ", hvd.local_rank(), hvd.size())

    logdir = osp.join(FLAGS.logdir, FLAGS.exp)
    if hvd.rank() == 0:
        if not osp.exists(logdir):
            os.makedirs(logdir)
        logger = TensorBoardOutputFormat(logdir)
    else:
        logger = None

    LABEL = None
    print("Loading data...")
    if FLAGS.dataset == 'cifar10':
        dataset = Cifar10(augment=FLAGS.augment, rescale=FLAGS.rescale)
        test_dataset = Cifar10(train=False, rescale=FLAGS.rescale)
        channel_num = 3

        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        if FLAGS.large_model:
            model = ResNet32Large(num_channels=channel_num,
                                  num_filters=128,
                                  train=True)
        elif FLAGS.larger_model:
            model = ResNet32Larger(num_channels=channel_num, num_filters=128)
        elif FLAGS.wider_model:
            model = ResNet32Wider(num_channels=channel_num, num_filters=192)
        else:
            model = ResNet32(num_channels=channel_num, num_filters=128)

    elif FLAGS.dataset == 'imagenet':
        dataset = Imagenet(train=True)
        test_dataset = Imagenet(train=False)
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 32, 32, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet32Wider(num_channels=channel_num, num_filters=256)

    elif FLAGS.dataset == 'imagenetfull':
        channel_num = 3
        X_NOISE = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 128, 128, 3), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 1000), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 1000), dtype=tf.float32)

        model = ResNet128(num_channels=channel_num, num_filters=64)

    elif FLAGS.dataset == 'mnist':
        dataset = Mnist(rescale=FLAGS.rescale)
        test_dataset = dataset
        channel_num = 1
        X_NOISE = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 28, 28), dtype=tf.float32)
        LABEL = tf.placeholder(shape=(None, 10), dtype=tf.float32)
        LABEL_POS = tf.placeholder(shape=(None, 10), dtype=tf.float32)

        model = MnistNet(num_channels=channel_num,
                         num_filters=FLAGS.num_filters)

    elif FLAGS.dataset == 'dsprites':
        dataset = DSprites(cond_shape=FLAGS.cond_shape,
                           cond_size=FLAGS.cond_size,
                           cond_pos=FLAGS.cond_pos,
                           cond_rot=FLAGS.cond_rot)
        test_dataset = dataset
        channel_num = 1

        X_NOISE = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)
        X = tf.placeholder(shape=(None, 64, 64), dtype=tf.float32)

        if FLAGS.dpos_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.dsize_only:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.drot_only:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_size:
            LABEL = tf.placeholder(shape=(None, 1), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        elif FLAGS.cond_shape:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)
        elif FLAGS.cond_pos:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        elif FLAGS.cond_rot:
            LABEL = tf.placeholder(shape=(None, 2), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 2), dtype=tf.float32)
        else:
            LABEL = tf.placeholder(shape=(None, 3), dtype=tf.float32)
            LABEL_POS = tf.placeholder(shape=(None, 3), dtype=tf.float32)

        model = DspritesNet(num_channels=channel_num,
                            num_filters=FLAGS.num_filters,
                            cond_size=FLAGS.cond_size,
                            cond_shape=FLAGS.cond_shape,
                            cond_pos=FLAGS.cond_pos,
                            cond_rot=FLAGS.cond_rot)

    print("Done loading...")

    if FLAGS.dataset == "imagenetfull":
        # In the case of full ImageNet, use the custom TensorFlow data loader
        data_loader = TFImagenetLoader('train',
                                       FLAGS.batch_size,
                                       hvd.rank(),
                                       hvd.size(),
                                       rescale=FLAGS.rescale)
    else:
        data_loader = DataLoader(dataset,
                                 batch_size=FLAGS.batch_size,
                                 num_workers=FLAGS.data_workers,
                                 drop_last=True,
                                 shuffle=True)

    batch_size = FLAGS.batch_size

    weights = [model.construct_weights('context_0')]

    Y = tf.placeholder(shape=(None), dtype=tf.int32)

    # Variables to run in training
    X_SPLIT = tf.split(X, FLAGS.num_gpus)
    X_NOISE_SPLIT = tf.split(X_NOISE, FLAGS.num_gpus)
    LABEL_SPLIT = tf.split(LABEL, FLAGS.num_gpus)
    LABEL_POS_SPLIT = tf.split(LABEL_POS, FLAGS.num_gpus)
    LABEL_SPLIT_INIT = list(LABEL_SPLIT)
    tower_grads = []
    tower_gen_grads = []
    x_mod_list = []

    optimizer = AdamOptimizer(FLAGS.lr, beta1=0.0, beta2=0.999)
    optimizer = hvd.DistributedOptimizer(optimizer)

    for j in range(FLAGS.num_gpus):

        if FLAGS.model_cclass:
            ind_batch_size = FLAGS.batch_size // FLAGS.num_gpus
            label_tensor = tf.Variable(tf.convert_to_tensor(np.reshape(
                np.tile(np.eye(10), (FLAGS.batch_size, 1, 1)),
                (FLAGS.batch_size * 10, 10)),
                                                            dtype=tf.float32),
                                       trainable=False,
                                       dtype=tf.float32)
            x_split = tf.tile(
                tf.reshape(X_SPLIT[j], (ind_batch_size, 1, 32, 32, 3)),
                (1, 10, 1, 1, 1))
            x_split = tf.reshape(x_split, (ind_batch_size * 10, 32, 32, 3))
            energy_pos = model.forward(x_split,
                                       weights[0],
                                       label=label_tensor,
                                       stop_at_grad=False)

            energy_pos_full = tf.reshape(energy_pos, (ind_batch_size, 10))
            energy_partition_est = tf.reduce_logsumexp(energy_pos_full,
                                                       axis=1,
                                                       keepdims=True)
            uniform = tf.random_uniform(tf.shape(energy_pos_full))
            label_tensor = tf.argmax(-energy_pos_full -
                                     tf.log(-tf.log(uniform)) -
                                     energy_partition_est,
                                     axis=1)
            label = tf.one_hot(label_tensor, 10, dtype=tf.float32)
            label = tf.Print(label, [label_tensor, energy_pos_full])
            LABEL_SPLIT[j] = label
            energy_pos = tf.concat(energy_pos, axis=0)
        else:
            energy_pos = [
                model.forward(X_SPLIT[j],
                              weights[0],
                              label=LABEL_POS_SPLIT[j],
                              stop_at_grad=False)
            ]
            energy_pos = tf.concat(energy_pos, axis=0)

        print("Building graph...")
        x_mod = x_orig = X_NOISE_SPLIT[j]

        x_grads = []

        energy_negs = []
        loss_energys = []

        energy_negs.extend([
            model.forward(tf.stop_gradient(x_mod),
                          weights[0],
                          label=LABEL_SPLIT[j],
                          stop_at_grad=False,
                          reuse=True)
        ])
        eps_begin = tf.zeros(1)

        steps = tf.constant(0)
        c = lambda i, x: tf.less(i, FLAGS.num_steps)

        def langevin_step(counter, x_mod):
            x_mod = x_mod + tf.random_normal(
                tf.shape(x_mod),
                mean=0.0,
                stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

            energy_noise = energy_start = tf.concat([
                model.forward(x_mod,
                              weights[0],
                              label=LABEL_SPLIT[j],
                              reuse=True,
                              stop_at_grad=False,
                              stop_batch=True)
            ],
                                                    axis=0)

            x_grad, label_grad = tf.gradients(FLAGS.temperature * energy_noise,
                                              [x_mod, LABEL_SPLIT[j]])
            energy_noise_old = energy_noise

            lr = FLAGS.step_lr

            if FLAGS.proj_norm != 0.0:
                if FLAGS.proj_norm_type == 'l2':
                    x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
                elif FLAGS.proj_norm_type == 'li':
                    x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm,
                                              FLAGS.proj_norm)
                else:
                    print("Other types of projection are not supported!!!")
                    assert False

            # Clip gradient norm for now
            if FLAGS.hmc:
                # Step size should be tuned to get around 65% acceptance
                def energy(x):
                    return FLAGS.temperature * \
                        model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

                x_last = hmc(x_mod, 15., 10, energy)
            else:
                x_last = x_mod - (lr) * x_grad

            x_mod = x_last
            x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

            counter = counter + 1

            return counter, x_mod

        steps, x_mod = tf.while_loop(c, langevin_step, (steps, x_mod))

        energy_eval = model.forward(x_mod,
                                    weights[0],
                                    label=LABEL_SPLIT[j],
                                    stop_at_grad=False,
                                    reuse=True)
        x_grad = tf.gradients(FLAGS.temperature * energy_eval, [x_mod])[0]
        x_grads.append(x_grad)

        energy_negs.append(
            model.forward(tf.stop_gradient(x_mod),
                          weights[0],
                          label=LABEL_SPLIT[j],
                          stop_at_grad=False,
                          reuse=True))

        test_x_mod = x_mod

        temp = FLAGS.temperature

        energy_neg = energy_negs[-1]
        x_off = tf.reduce_mean(
            tf.abs(x_mod[:tf.shape(X_SPLIT[j])[0]] - X_SPLIT[j]))

        loss_energy = model.forward(x_mod,
                                    weights[0],
                                    reuse=True,
                                    label=LABEL,
                                    stop_grad=True)

        print("Finished processing loop construction ...")

        target_vars = {}

        if FLAGS.cclass or FLAGS.model_cclass:
            label_sum = tf.reduce_sum(LABEL_SPLIT[0], axis=0)
            label_prob = label_sum / tf.reduce_sum(label_sum)
            label_ent = -tf.reduce_sum(
                label_prob * tf.math.log(label_prob + 1e-7))
        else:
            label_ent = tf.zeros(1)

        target_vars['label_ent'] = label_ent

        if FLAGS.train:

            if FLAGS.objective == 'logsumexp':
                pos_term = temp * energy_pos
                energy_neg_reduced = (energy_neg - tf.reduce_min(energy_neg))
                coeff = tf.stop_gradient(tf.exp(-temp * energy_neg_reduced))
                norm_constant = tf.stop_gradient(tf.reduce_sum(coeff)) + 1e-4
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = coeff * (-1 * temp * energy_neg) / norm_constant
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'cd':
                pos_loss = tf.reduce_mean(temp * energy_pos)
                neg_loss = -tf.reduce_mean(temp * energy_neg)
                loss_ml = FLAGS.ml_coeff * (pos_loss + tf.reduce_sum(neg_loss))
            elif FLAGS.objective == 'softplus':
                loss_ml = FLAGS.ml_coeff * \
                    tf.nn.softplus(temp * (energy_pos - energy_neg))

            loss_total = tf.reduce_mean(loss_ml)

            if not FLAGS.zero_kl:
                loss_total = loss_total + tf.reduce_mean(loss_energy)

            loss_total = loss_total + \
                FLAGS.l2_coeff * (tf.reduce_mean(tf.square(energy_pos)) + tf.reduce_mean(tf.square((energy_neg))))

            print("Started gradient computation...")
            gvs = optimizer.compute_gradients(loss_total)
            gvs = [(k, v) for (k, v) in gvs if k is not None]

            print("Applying gradients...")

            tower_grads.append(gvs)

            print("Finished applying gradients.")

            target_vars['loss_ml'] = loss_ml
            target_vars['total_loss'] = loss_total
            target_vars['loss_energy'] = loss_energy
            target_vars['weights'] = weights
            target_vars['gvs'] = gvs

        target_vars['X'] = X
        target_vars['Y'] = Y
        target_vars['LABEL'] = LABEL
        target_vars['LABEL_POS'] = LABEL_POS
        target_vars['X_NOISE'] = X_NOISE
        target_vars['energy_pos'] = energy_pos
        target_vars['energy_start'] = energy_negs[0]

        if len(x_grads) >= 1:
            target_vars['x_grad'] = x_grads[-1]
            target_vars['x_grad_first'] = x_grads[0]
        else:
            target_vars['x_grad'] = tf.zeros(1)
            target_vars['x_grad_first'] = tf.zeros(1)

        target_vars['x_mod'] = x_mod
        target_vars['x_off'] = x_off
        target_vars['temp'] = temp
        target_vars['energy_neg'] = energy_neg
        target_vars['test_x_mod'] = test_x_mod
        target_vars['eps_begin'] = eps_begin

    if FLAGS.train:
        grads = average_gradients(tower_grads)
        train_op = optimizer.apply_gradients(grads)
        target_vars['train_op'] = train_op

    config = tf.ConfigProto()

    if hvd.size() > 1:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    sess = tf.Session(config=config)

    saver = loader = tf.train.Saver(max_to_keep=30,
                                    keep_checkpoint_every_n_hours=6)

    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print("Model has a total of {} parameters".format(total_parameters))

    sess.run(tf.global_variables_initializer())

    resume_itr = 0

    if (FLAGS.resume_iter != -1 or not FLAGS.train) and hvd.rank() == 0:
        model_file = osp.join(logdir, 'model_{}'.format(FLAGS.resume_iter))
        resume_itr = FLAGS.resume_iter
        # saver.restore(sess, model_file)
        optimistic_restore(sess, model_file)

    sess.run(hvd.broadcast_global_variables(0))
    print("Initializing variables...")

    print("Start broadcast")
    print("End broadcast")

    if FLAGS.train:
        print("Training phase")
        train(target_vars, saver, sess, logger, data_loader, resume_itr,
              logdir)
    print("Testing phase")
    test(target_vars, saver, sess, logger, data_loader)
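
Example #2 calls an average_gradients helper that is not shown. The usual multi-tower implementation (a sketch, assuming every entry of tower_grads lists its (gradient, variable) pairs in the same variable order) averages each variable's gradient across towers:

import tensorflow as tf

def average_gradients(tower_grads):
    # For each variable, stack the per-tower gradients and take their mean.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        mean_grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        average_grads.append((mean_grad, grad_and_vars[0][1]))
    return average_grads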
Example #3
        logits = fully_connected(hidden2,
                                 n_outputs,
                                 scope="outputs",
                                 activation_fn=None)
        no_op = tf.no_op(name="no_op")

    with tf.name_scope("loss"):
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y, logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")

    learning_rate = 0.01

    with tf.name_scope("train"):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        optimizer = hvd.DistributedOptimizer(optimizer)
        training_op = optimizer.minimize(loss, name='optimize')

    with tf.name_scope("eval"):
        correct = tf.nn.in_top_k(logits, y, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    init = tf.global_variables_initializer()
    bcast = hvd.broadcast_global_variables(0)

    n_epochs = 20
    batch_size = 100

    def shuffle_batch(X, y, batch_size):
        rnd_idx = np.random.permutation(len(X))
        n_batches = len(X) // batch_size
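        # (the listing is truncated here; a usual completion of this batching
        #  generator, stated as an assumption, is the following)
        for batch_idx in np.array_split(rnd_idx, n_batches):
            X_batch, y_batch = X[batch_idx], y[batch_idx]
            yield X_batch, y_batch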
Example #4
epochs = 1000
keep_probability = 0.5  # 0.5 dropout per the paper; 0.7 was tried because 0.5 would not converge with regularization (loss went up), then reverted to 0.5
starter_learning_rate = 0.001  # tried 0.1 and 0.01 before settling on 0.001
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.compat.v1.train.exponential_decay(starter_learning_rate,
                                                     global_step,
                                                     100000,
                                                     0.96,
                                                     staircase=True)
acc, cost = model(x, y, keep_probability)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                   beta1=0.9,
                                   beta2=0.999,
                                   epsilon=0.1)

optimizer = hvd.DistributedOptimizer(optimizer)
train_op = optimizer.minimize(cost, global_step=global_step)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

iter = train_ds.make_initializable_iterator()
val_iter = val_ds.make_initializable_iterator()
iter_op = iter.get_next()
val_iter_op = val_iter.get_next()

with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    bcast = hvd.broadcast_global_variables(0)
    sess.run(bcast)
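    # A sketch of a typical continuation (an assumption, not the original code):
    # run one pass over the training iterator, assuming x and y are the
    # placeholders that `model` was built from and that iter_op yields
    # (features, labels) batches.
    sess.run(iter.initializer)
    while True:
        try:
            batch_x, batch_y = sess.run(iter_op)
            _, batch_cost = sess.run([train_op, cost],
                                     feed_dict={x: batch_x, y: batch_y})
        except tf.errors.OutOfRangeError:
            break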
Example #5
def start_training(config):
    if config.IS_DISTRIBUTION:
        import horovod.tensorflow as hvd
        # initialize Horovod.
        hvd.init()
        num_worker = hvd.size()
        rank = hvd.rank()
        # verify that MPI multi-threading is supported.
        assert hvd.mpi_threads_supported()
        # make sure MPI is not re-initialized.
        import mpi4py.rc
        mpi4py.rc.initialize = False
        # import mpi4py
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        # check that size and rank are synchronized
        assert num_worker == comm.Get_size()
        assert rank == comm.Get_rank()
    else:
        num_worker = 1
        rank = 0

    ModelClass = config.NETWORK_CLASS
    network_kwargs = dict(
        (key.lower(), val) for key, val in config.NETWORK.items())
    if "train_validation_saving_size".upper() in config.DATASET.keys():
        use_train_validation_saving = config.DATASET.TRAIN_VALIDATION_SAVING_SIZE > 0
    else:
        use_train_validation_saving = False

    if use_train_validation_saving:
        top_train_validation_saving_set_accuracy = 0

    train_dataset = setup_dataset(config, "train", rank)
    print("train dataset num:", train_dataset.num_per_epoch)

    if use_train_validation_saving:
        train_validation_saving_dataset = setup_dataset(
            config, "train_validation_saving", rank)
        print("train_validation_saving dataset num:",
              train_validation_saving_dataset.num_per_epoch)

    validation_dataset = setup_dataset(config, "validation", rank)
    print("validation dataset num:", validation_dataset.num_per_epoch)

    graph = tf.Graph()
    with graph.as_default():
        if ModelClass.__module__.startswith("lmnet.networks.object_detection"):
            model = ModelClass(
                classes=train_dataset.classes,
                num_max_boxes=train_dataset.num_max_boxes,
                is_debug=config.IS_DEBUG,
                **network_kwargs,
            )
        elif ModelClass.__module__.startswith("lmnet.networks.segmentation"):
            model = ModelClass(
                classes=train_dataset.classes,
                label_colors=train_dataset.label_colors,
                is_debug=config.IS_DEBUG,
                **network_kwargs,
            )
        else:
            model = ModelClass(
                classes=train_dataset.classes,
                is_debug=config.IS_DEBUG,
                **network_kwargs,
            )

        global_step = tf.Variable(0, name="global_step", trainable=False)
        is_training_placeholder = tf.placeholder(
            tf.bool, name="is_training_placeholder")

        images_placeholder, labels_placeholder = model.placeholderes()

        output = model.inference(images_placeholder, is_training_placeholder)
        if ModelClass.__module__.startswith("lmnet.networks.object_detection"):
            loss = model.loss(output, labels_placeholder,
                              is_training_placeholder)
        else:
            loss = model.loss(output, labels_placeholder)
        opt = model.optimizer(global_step)
        if config.IS_DISTRIBUTION:
            # add Horovod Distributed Optimizer
            opt = hvd.DistributedOptimizer(opt)
        train_op = model.train(loss, opt, global_step)
        metrics_ops_dict, metrics_update_op = model.metrics(
            output, labels_placeholder)
        # TODO(wakisaka): Deal with many networks.
        model.summary(output, labels_placeholder)

        summary_op = tf.summary.merge_all()

        metrics_summary_op, metrics_placeholders = executor.prepare_metrics(
            metrics_ops_dict)

        init_op = tf.global_variables_initializer()
        reset_metrics_op = tf.local_variables_initializer()
        if config.IS_DISTRIBUTION:
            # add Horovod op to broadcast variables from rank 0 to all processes
            bcast_global_variables_op = hvd.broadcast_global_variables(0)

        if use_train_validation_saving:
            saver = tf.train.Saver(max_to_keep=1)
        else:
            saver = tf.train.Saver(max_to_keep=None)

        if config.IS_PRETRAIN:
            all_vars = tf.global_variables()
            pretrain_var_list = [
                var for var in all_vars
                if var.name.startswith(tuple(config.PRETRAIN_VARS))
            ]
            print("pretrain_vars", [var.name for var in pretrain_var_list])
            pretrain_saver = tf.train.Saver(pretrain_var_list,
                                            name="pretrain_saver")

    if config.IS_DISTRIBUTION:
        # For distributed training
        session_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True, visible_device_list=str(hvd.local_rank())))
    else:
        # TODO(wakisaka): For debug.
        # session_config = tf.ConfigProto(
        #     gpu_options=tf.GPUOptions(
        #         allow_growth=True,
        #         per_process_gpu_memory_fraction=0.1
        #     )
        # )
        session_config = tf.ConfigProto(
        )  # tf.ConfigProto(log_device_placement=True)
    # TODO(wakisaka): XLA JIT
    # session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    sess = tf.Session(graph=graph, config=session_config)
    sess.run([init_op, reset_metrics_op])

    if rank == 0:
        train_writer = tf.summary.FileWriter(
            environment.TENSORBOARD_DIR + "/train", sess.graph)
        if use_train_validation_saving:
            train_val_saving_writer = tf.summary.FileWriter(
                environment.TENSORBOARD_DIR + "/train_validation_saving")
        val_writer = tf.summary.FileWriter(environment.TENSORBOARD_DIR +
                                           "/validation")

        if config.IS_PRETRAIN:
            print("------- Load pretrain data ----------")
            pretrain_saver.restore(
                sess, os.path.join(config.PRETRAIN_DIR, config.PRETRAIN_FILE))
            sess.run(tf.assign(global_step, 0))

        last_step = 0

        # for recovery
        ckpt = tf.train.get_checkpoint_state(environment.CHECKPOINTS_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            print("--------- Restore last checkpoint -------------")
            saver.restore(sess, ckpt.model_checkpoint_path)
            # saver.recover_last_checkpoints(ckpt.model_checkpoint_path)
            last_step = sess.run(global_step)
            # TODO(wakisaka): tensorflow v1.3 remain previous event log in tensorboard.
            # https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/python/training/supervisor.py#L1072
            train_writer.add_session_log(SessionLog(status=SessionLog.START),
                                         global_step=last_step + 1)
            val_writer.add_session_log(SessionLog(status=SessionLog.START),
                                       global_step=last_step + 1)
            print("recovered. last step", last_step)

    if config.IS_DISTRIBUTION:
        # broadcast variables from rank 0 to all other processes
        sess.run(bcast_global_variables_op)
        # calculate steps per epoch for each node
        train_num_per_epoch = train_dataset.num_per_epoch
        num_per_nodes = (train_num_per_epoch + num_worker - 1) // num_worker
        step_per_epoch = num_per_nodes // config.BATCH_SIZE
        begin_index = (train_num_per_epoch * rank) // num_worker
        end_index = begin_index + num_per_nodes

    last_step = sess.run(global_step)

    # Calculate max steps. The priority of config.MAX_EPOCHS is higher than config.MAX_STEPS.
    if "MAX_EPOCHS" in config:
        max_steps = int(train_dataset.num_per_epoch / config.BATCH_SIZE *
                        config.MAX_EPOCHS)
    else:
        max_steps = config.MAX_STEPS
    print("max_steps: {}".format(max_steps))

    for step in range(last_step, max_steps):
        print("step", step)

        if config.IS_DISTRIBUTION:
            # scatter dataset
            if step % step_per_epoch == 0:
                indices = train_dataset.get_shuffle_index(
                ) if rank == 0 else None
                # broadcast shuffled indices
                indices = comm.bcast(indices, 0)
                feed_indices = indices[begin_index:end_index]
                # update each dataset with its split of the indices
                train_dataset.update_dataset(feed_indices)

        images, labels = train_dataset.feed()

        feed_dict = {
            is_training_placeholder: True,
            images_placeholder: images,
            labels_placeholder: labels,
        }

        if step * ((step + 1) % config.SUMMARISE_STEPS) == 0 and rank == 0:
            # Runtime statistics for development.
            # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            # run_metadata = tf.RunMetadata()

            sess.run(reset_metrics_op)
            _, summary, _ = sess.run(
                [train_op, summary_op, metrics_update_op],
                feed_dict=feed_dict,
                # options=run_options,
                # run_metadata=run_metadata,
            )
            # train_writer.add_run_metadata(run_metadata, "step: {}".format(step + 1))
            train_writer.add_summary(summary, step + 1)

            metrics_values = sess.run(list(metrics_ops_dict.values()))
            metrics_feed_dict = {
                placeholder: value
                for placeholder, value in zip(metrics_placeholders,
                                              metrics_values)
            }

            metrics_summary, = sess.run(
                [metrics_summary_op],
                feed_dict=metrics_feed_dict,
            )
            train_writer.add_summary(metrics_summary, step + 1)
        else:
            sess.run([train_op], feed_dict=feed_dict)

        to_be_saved = step == 0 or (
            step + 1) == max_steps or (step + 1) % config.SAVE_STEPS == 0

        if to_be_saved and rank == 0:
            if use_train_validation_saving:

                sess.run(reset_metrics_op)
                train_validation_saving_step_size = int(
                    math.ceil(train_validation_saving_dataset.num_per_epoch /
                              config.BATCH_SIZE))
                print("train_validation_saving_step_size",
                      train_validation_saving_step_size)

                current_train_validation_saving_set_accuracy = 0

                for train_validation_saving_step in range(
                        train_validation_saving_step_size):
                    print("train_validation_saving_step",
                          train_validation_saving_step)

                    images, labels = train_validation_saving_dataset.feed()
                    feed_dict = {
                        is_training_placeholder: False,
                        images_placeholder: images,
                        labels_placeholder: labels,
                    }

                    if train_validation_saving_step % config.SUMMARISE_STEPS == 0:
                        summary, _ = sess.run([summary_op, metrics_update_op],
                                              feed_dict=feed_dict)
                        train_val_saving_writer.add_summary(summary, step + 1)
                    else:
                        sess.run([metrics_update_op], feed_dict=feed_dict)

                metrics_values = sess.run(list(metrics_ops_dict.values()))
                metrics_feed_dict = {
                    placeholder: value
                    for placeholder, value in zip(metrics_placeholders,
                                                  metrics_values)
                }
                metrics_summary, = sess.run(
                    [metrics_summary_op],
                    feed_dict=metrics_feed_dict,
                )
                train_val_saving_writer.add_summary(metrics_summary, step + 1)

                current_train_validation_saving_set_accuracy = sess.run(
                    metrics_ops_dict["accuracy"])

                if current_train_validation_saving_set_accuracy > top_train_validation_saving_set_accuracy:
                    top_train_validation_saving_set_accuracy = current_train_validation_saving_set_accuracy
                    print("New top train_validation_saving accuracy is: ",
                          top_train_validation_saving_set_accuracy)

                    _save_checkpoint(saver, sess, global_step, step)

            else:
                _save_checkpoint(saver, sess, global_step, step)

            if step == 0:
                # create the pb file only on the first step.
                minimal_graph = tf.graph_util.convert_variables_to_constants(
                    sess,
                    sess.graph.as_graph_def(add_shapes=True),
                    ["output"],
                )
                pb_name = "minimal_graph_with_shape_{}.pb".format(step + 1)
                pbtxt_name = "minimal_graph_with_shape_{}.pbtxt".format(step +
                                                                        1)
                tf.train.write_graph(minimal_graph,
                                     environment.CHECKPOINTS_DIR,
                                     pb_name,
                                     as_text=False)
                tf.train.write_graph(minimal_graph,
                                     environment.CHECKPOINTS_DIR,
                                     pbtxt_name,
                                     as_text=True)

        if step == 0 or (step + 1) % config.TEST_STEPS == 0:
            # init metrics values
            sess.run(reset_metrics_op)
            test_step_size = int(
                math.ceil(validation_dataset.num_per_epoch /
                          config.BATCH_SIZE))
            print("test_step_size", test_step_size)

            for test_step in range(test_step_size):
                print("test_step", test_step)

                images, labels = validation_dataset.feed()
                feed_dict = {
                    is_training_placeholder: False,
                    images_placeholder: images,
                    labels_placeholder: labels,
                }

                if test_step % config.SUMMARISE_STEPS == 0:
                    summary, _ = sess.run([summary_op, metrics_update_op],
                                          feed_dict=feed_dict)
                    if rank == 0:
                        val_writer.add_summary(summary, step + 1)
                else:
                    sess.run([metrics_update_op], feed_dict=feed_dict)

            metrics_values = sess.run(list(metrics_ops_dict.values()))
            metrics_feed_dict = {
                placeholder: value
                for placeholder, value in zip(metrics_placeholders,
                                              metrics_values)
            }
            metrics_summary, = sess.run(
                [metrics_summary_op],
                feed_dict=metrics_feed_dict,
            )
            if rank == 0:
                val_writer.add_summary(metrics_summary, step + 1)

    # training loop end.
    print("reach max step")
Example #6
def main(argv):
    # Initialize Horovod.
    hvd.init()
    # Download and load MNIST dataset.
    mnist = learn.datasets.mnist.read_data_sets(training_data_dir)

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.contrib.learn.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.01)

    # Add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    # Save checkpoints only on worker 0 to prevent other workers from corrupting them.
    checkpoint_dir = checkpoint_path if hvd.rank() == 0 else None

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=20),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]


    # Pin GPU to be used to process local rank (one GPU per process)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())


    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.

    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(100)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

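Example #6 assumes a conv_model(image, label, mode) function returning (predict, loss), which is not shown. A minimal stand-in (an assumption: a small fully connected net rather than the original convolutional model) that satisfies the same call signature:

import tensorflow as tf

def conv_model(image, label, mode):
    # Hypothetical stand-in; `mode` is accepted only to match the call signature.
    hidden = tf.layers.dense(image, 128, activation=tf.nn.relu)
    logits = tf.layers.dense(hidden, 10)
    onehot = tf.one_hot(tf.cast(label, tf.int32), 10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot, logits=logits)
    predict = tf.argmax(logits, axis=1)
    return predict, loss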
Example #7
def _cnn_model_function(features, labels, mode, params):
    model_func = params['model']
    model_format = params['format']
    model_dtype = params['dtype']
    momentum = params['momentum']
    learning_rate_init = params['learning_rate_init']
    learning_rate_power = params['learning_rate_power']
    decay_steps = params['decay_steps']
    weight_decay = params['weight_decay']
    loss_scale = params['loss_scale']
    larc_eta = params['larc_eta']
    larc_mode = params['larc_mode']
    deterministic = params['deterministic']
    num_classes = params['n_classes']
    dali_cpu = params['dali_cpu']

    device = '/gpu:0'
    labels = tf.reshape(labels, (-1, ))  # Squash unnecessary unary dim
    inputs = features  # TODO: Should be using feature columns?
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    with tf.device(device):
        inputs = tf.cast(inputs, model_dtype)
        if model_format == 'channels_first':
            inputs = tf.transpose(inputs, [0, 3, 1, 2])
        with nvutils.fp32_trainable_vars(
                regularizer=tf.contrib.layers.l2_regularizer(weight_decay)):
            top_layer = model_func(inputs, training=is_training)
            logits = tf.layers.dense(top_layer, num_classes)
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        logits = tf.cast(logits, tf.float32)
        if mode == tf.estimator.ModeKeys.PREDICT:
            probabilities = tf.nn.softmax(logits)
            predictions = {
                'class_ids': predicted_classes[:, None],
                'probabilities': probabilities,
                'logits': logits
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)
        loss = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                      labels=labels)
        loss = tf.identity(
            loss, name='loss'
        )  # For access by logger (TODO: Better way to access it?)
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = tf.add_n([loss] + reg_losses, name='total_loss')
        with tf.device(
                None):  # Allow fallback to CPU if no GPU support for these ops
            top1_accuracy = tf.metrics.accuracy(labels=labels,
                                                predictions=predicted_classes)
            top5_accuracy = tf.metrics.mean(
                tf.nn.in_top_k(predictions=logits, targets=labels, k=5))
            tf.summary.scalar('top1_accuracy', top1_accuracy[1])
            tf.summary.scalar('top5_accuracy', top5_accuracy[1])
        if mode == tf.estimator.ModeKeys.EVAL:
            metrics = {
                'top1_accuracy': top1_accuracy,
                'top5_accuracy': top5_accuracy
            }
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              eval_metric_ops=metrics)
        assert (mode == tf.estimator.ModeKeys.TRAIN)
        #batch_size = inputs.shape[0]
        batch_size = tf.shape(inputs)[0]
        learning_rate = tf.train.polynomial_decay(learning_rate_init,
                                                  tf.train.get_global_step(),
                                                  decay_steps=decay_steps,
                                                  end_learning_rate=0.,
                                                  power=learning_rate_power,
                                                  cycle=False,
                                                  name='learning_rate')
        opt = tf.train.MomentumOptimizer(learning_rate,
                                         momentum,
                                         use_nesterov=True)
        opt = hvd.DistributedOptimizer(opt)
        opt = nvutils.LarcOptimizer(opt,
                                    learning_rate,
                                    larc_eta,
                                    clip=larc_mode)
        opt = nvutils.LossScalingOptimizer(opt, scale=loss_scale)
        gate_gradients = (tf.train.Optimizer.GATE_OP
                          if deterministic else tf.train.Optimizer.GATE_NONE)
        train_op = opt.minimize(loss,
                                global_step=tf.train.get_global_step(),
                                gate_gradients=gate_gradients,
                                name='step_update')
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []
        train_op = tf.group(train_op, update_ops)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
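
A model function like the one in Example #7 is normally wired into a tf.estimator.Estimator with the GPU pinned to the local rank and a broadcast hook passed to train(). A sketch under those assumptions (the params dict and train_input_fn are placeholders, not part of the example):

import tensorflow as tf
import horovod.tensorflow as hvd

def build_and_train(params, train_input_fn, total_steps=10000):
    # Sketch: run _cnn_model_function under an Estimator with Horovod.
    hvd.init()
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.estimator.RunConfig(
        session_config=session_config,
        # Only rank 0 writes checkpoints so other workers cannot corrupt them.
        model_dir='./checkpoints' if hvd.rank() == 0 else None)

    estimator = tf.estimator.Estimator(model_fn=_cnn_model_function,
                                       params=params,
                                       config=run_config)
    estimator.train(input_fn=train_input_fn,
                    hooks=[hvd.BroadcastGlobalVariablesHook(0)],
                    max_steps=total_steps // hvd.size())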
Example #8
  def training_deploy(self, model, funcs=[]):
    # Horovod: initialize Horovod (prepare the MPI environment)
    if self.is_distribute_training:
      import horovod.tensorflow as hvd
      hvd.init()

      # reset num_clones = 1
      self.num_clones = 1
      self.rank = hvd.rank()
      self.local_rank = hvd.local_rank()

      get_global_context().quiet = False if self.rank == 0 else True

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default() as graph:
      # Default graph
      self.graph = graph
      # Session
      config = tf.ConfigProto(allow_soft_placement=True)
      config.gpu_options.allow_growth = True
      devices = self.ctx.devices if len(self.ctx.devices) > 0 else self.devices
      config.gpu_options.visible_device_list = ','.join(str(x) for x in devices) if len(devices) > 0 else ''
      self.sess = tf.Session(graph=graph, config=config)
      
      #######################
      # Config model deploy #
      #######################
      deploy_config = tfmodel_deploy.DeploymentConfig(num_clones=self.num_clones,
                                                      devices=[],
                                                      clone_on_cpu=self.clone_on_cpu,
                                                      replica_id=self.replica_id,
                                                      num_replicas=self.worker_replicas,
                                                      num_ps_tasks=self.num_ps_tasks,
                                                      clone_id_map={0:self.local_rank} if self.is_distribute_training else {})

      # init some info
      with tf.device(deploy_config.inputs_device()):
        # Create global_step
        with tf.device(deploy_config.variables_device()):
          global_step = slim.get_or_create_global_step()

        ###################################
        ####    define model input (CPU) ##
        ###################################
        with tf.variable_scope('input'):
          data_queue = self.ctx.model.model_input(self.is_training)
          if data_queue is not None:
            self._has_model_input = True
        
        ###################################
        ####    define model (CPU or GPU) #
        ###################################
        func = model.model_fn
        @functools.wraps(func)
        def network_fn(*args, **kwargs):
          res = func(self.is_training, *args, **kwargs)
          if kwargs['clone'] == 0:
            # 1.step save graph file
            tf.train.write_graph(self.sess.graph_def, self.dump_dir, 'graph.pbtxt')
            
            # # 2.step transfer to local graph net
            # logger.info('build model graph svg')
            # svg_graph = _convert_to_svg_graph(os.path.join(self.dump_dir, 'graph.pbtxt'),
            #                                   self.dump_dir,
            #                                   ['input'])
            # if svg_graph is not None:
            #   self.ctx.job.send({'DATA': {'GRAPH': svg_graph}})
          return res

        ####################################
        ####### Create summary      ########
        ####################################
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        ####################################
        ####### Create model clones ########
        ####################################
        self.clones = tfmodel_deploy.create_clones(deploy_config,
                                                   network_fn,
                                                   [data_queue] if data_queue is not None else None)
        first_clone_scope = deploy_config.clone_scope(0)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
          summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # create other func
        for create_func in funcs:
          self._create_funcs.append(create_func())

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
          # samples total number
          num_samples = self.num_samples if self.num_samples > 0 else self.ctx.data_source.size

          # Horovod: adjust learning rate based on number of GPUs
          self.lr = _configure_learning_rate(self, num_samples, global_step)
          summaries.add(tf.summary.scalar('learning_rate', self.lr))

          # config optimizer
          optimizer = _configure_optimizer(self, self.lr)

          # Horovod: add Horovod Distributed Optimizer
          if self.is_distribute_training:
            optimizer = hvd.DistributedOptimizer(optimizer)

        # Variables to train.
        variables_to_train = _get_variables_to_train(self)
        
        with tf.control_dependencies(self.model_dependence):
          # Train_tensor
          total_loss, clones_gradients = \
            tfmodel_deploy.optimize_clones(self.clones,
                                           optimizer,
                                           regularization_losses=None if self.regularization_loss else [],
                                           var_list=variables_to_train)

          summaries.add(tf.summary.scalar('total_loss', total_loss))

          # Create gradient updates.
          grad_updates = optimizer.apply_gradients(clones_gradients,
                                                   global_step=global_step)

        # Value ops
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
          self.val_ops = tf.identity(total_loss, name='train_op')
  
          if self.clones[0].outputs is not None:
            self.val_ops = [self.val_ops]
            if type(self.clones[0].outputs) == list:
              self.val_ops.extend(self.clones[0].outputs)
            elif type(self.clones[0].outputs) == tuple:
              self.val_ops.extend(list(self.clones[0].outputs))
            else:
              self.val_ops.append(self.clones[0].outputs)

          if type(self.val_ops) != list:
            self.val_ops = [self.val_ops]

        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))

        # Merge all summaries together.
        self.summary_op = tf.summary.merge(list(summaries), name='summary_op')

        if self.summary_op is not None:
          val_ops_temp = [self.summary_op]
          val_ops_temp.extend(self.val_ops)
          self.val_ops = val_ops_temp

        # summary write
        if not os.path.exists(os.path.join(self.dump_dir, 'summary')):
          os.makedirs(os.path.join(self.dump_dir, 'summary'))

        self.train_writer = tf.summary.FileWriter(os.path.join(self.dump_dir, 'summary'), graph)

        # Global initialization
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(tf.local_variables_initializer())
        # coord
        self.coord = tf.train.Coordinator()
        self.threads = tf.train.start_queue_runners(sess=self.sess, coord=self.coord)
        
        custom_dataset_queue = tf.get_collection('CUSTOM_DATASET_QUEUE')
        if len(custom_dataset_queue) > 0:
          custom_dataset_queue[0].coord = self.coord
          custom_threads = custom_dataset_queue[0].start_threads(self.sess)
          self.threads.extend(custom_threads)
        
        # Training saver
        # model_variables = slim.get_model_variables() if model.model_variables is None else model.model_variables
        self.saver = tf.train.Saver(max_to_keep=2)

        # Restore from checkpoint
        if not self.is_distribute_training or (self.is_distribute_training and self.rank == 0):
          restore_fns = _get_init_fn(self, model, self.dump_dir, self.ctx)
          if restore_fns is not None:
            for restore_fn in restore_fns:
              restore_fn(self.sess)

          # Restore from custom auxiliary init funcs
          for func in self._aux_init_funcs:
            func(self.sess)

        # Restore from auxiliary checkpoints
        for auxilary_scope, auxilary_checkpoint in self.auxilary_checkpoints.items():
          self.restore_scopy_from(model, auxilary_scope, auxilary_checkpoint)

        # Horovod: broadcast global variables
        if self.is_distribute_training:
          bgv = hvd.BroadcastGlobalVariablesHook(0)
          bgv.begin()
          bgv.after_create_session(self.sess, self.coord)
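          # Invoking the hook's begin()/after_create_session() by hand replays
          # what MonitoredTrainingSession would do for us: the rank-0 variable
          # values are broadcast to every worker so all ranks start training
          # from an identical state.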
Exemple #9
0
def create_optimizer(hparams, loss):
    """
    Creates an optimizer training op.
    If the hyperparameter lr_bert is specified, a separate Adam optimizer with that learning rate is used for the BERT parameters.
    """
    tvars = tf.trainable_variables()

    # Print trainable variables
    print("# Trainable variables")
    total_param = 0
    for param in tvars:
        if param.name.startswith('bert'):
            psize = 1
            for s in param.get_shape():
                psize *= s
            total_param += psize
        print("  %s, %s, %s" % (param.name, str(param.get_shape()), param.op.device))
    print('total bert parameters:', total_param)

    # Define optimizer parameters
    init_lr = hparams.learning_rate
    num_train_steps = hparams.num_train_steps
    num_warmup_steps = hparams.num_warmup_steps
    lr_bert = hparams.lr_bert

    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    if hparams.optimizer == "bert_adam":
        # Use the optimizer from BERT's implementation.
        # Implements linear decay of the learning rate.
        learning_rate = tf.train.polynomial_decay(
            learning_rate,
            global_step,
            num_train_steps,
            end_learning_rate=0.0,
            power=1.0,
            cycle=False)

        # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
        # learning rate will be `global_step/num_warmup_steps * init_lr`.
        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = init_lr * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
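            # Worked example (hypothetical numbers): with init_lr = 5e-5 and
            # num_warmup_steps = 1000, at global_step = 100 warmup_percent_done
            # is 0.1, so the effective rate is 0.1 * 5e-5 = 5e-6; once
            # global_step >= 1000, is_warmup becomes 0 and the polynomial decay
            # schedule above takes over.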

        # It is recommended that you use this optimizer for fine tuning, since this
        # is how the model was trained (note that the Adam m/v variables are NOT
        # loaded from init_checkpoint.)
        optimizer = AdamWeightDecayOptimizer(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        if hparams.use_horovod:
            import horovod.tensorflow as hvd
            # Horovod's distributed optimizer handles allreduce calls, synchronous only
            optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True)
            grads_and_vars = optimizer.compute_gradients(loss, tvars)
            grads = [grad for grad, var in grads_and_vars]
            tvars = [var for grad, var in grads_and_vars]
        else:
            grads = tf.gradients(loss, tvars)

        grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)

        if lr_bert is None:
            # If no separate learning rate for BERT (lr_bert) is specified,
            # all components use the same learning rate
            train_op = optimizer.apply_gradients(
                zip(grads, tvars), global_step=global_step)

            # Normally the global step update is done inside of `apply_gradients`.
            # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
            # a different optimizer, you should probably take this line out.
            new_global_step = global_step + 1
            train_op = tf.group(train_op, [global_step.assign(new_global_step)])
        else:
            # the BERT components will use another learning rate
            optimizer_bert = AdamWeightDecayOptimizer(
                learning_rate=learning_rate * lr_bert / init_lr,
                weight_decay_rate=0.01,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-6,
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
            if hparams.use_horovod:
                # Treat the BERT optimizer the same as the original optimizer: wrap it with Horovod
                optimizer_bert = hvd.DistributedOptimizer(optimizer_bert, sparse_as_dense=True)

            bert_grad, bert_tvars = [], []
            other_grad, other_tvars = [], []
            for grad, tvar in zip(grads, tvars):
                if tvar is not None and grad is not None:
                    if tvar.name.startswith('bert'):
                        bert_grad.append(grad)
                        bert_tvars.append(tvar)
                        print('****bert param:', tvar.name)
                    else:
                        other_grad.append(grad)
                        other_tvars.append(tvar)
                        print('****other param:', tvar.name)
            print('--------------\n', '# of bert', len(bert_grad), '# of other', len(other_grad), '\n--------------')
            bert_train_op = optimizer_bert.apply_gradients(
                zip(bert_grad, bert_tvars), global_step=global_step)
            other_train_op = optimizer.apply_gradients(
                zip(other_grad, other_tvars), global_step=global_step)

            new_global_step = global_step + 1
            train_op = tf.group(bert_train_op, other_train_op, [global_step.assign(new_global_step)])

        return train_op, grad_norm, learning_rate

    elif hparams.optimizer == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate)
    elif hparams.optimizer == "adam":
        opt = tf.train.AdamOptimizer(learning_rate)
    else:
        raise ValueError("Only support sgd/adam/bert_adam as optimizer option")

    # Gradients
    gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)
    clipped_gradients, grad_norm = tf.clip_by_global_norm(gradients, hparams.max_gradient_norm)
    train_op = opt.apply_gradients(zip(clipped_gradients, tvars), global_step=global_step)

    return train_op, grad_norm, learning_rate
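# Usage sketch for create_optimizer above (assumed, not from the original
# source): a throwaway scalar loss and a minimal hparams namespace are enough
# to exercise the plain "adam" branch of the function.
from types import SimpleNamespace
import tensorflow as tf

sketch_hparams = SimpleNamespace(optimizer="adam", learning_rate=1e-3,
                                 num_train_steps=10, num_warmup_steps=0,
                                 lr_bert=None, max_gradient_norm=1.0,
                                 use_horovod=False)
# One trainable scalar so tf.trainable_variables() is non-empty.
sketch_w = tf.get_variable("sketch_w", shape=[], initializer=tf.ones_initializer())
sketch_loss = tf.square(sketch_w - 2.0)
sketch_train_op, sketch_grad_norm, sketch_lr = create_optimizer(sketch_hparams,
                                                                sketch_loss)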
Exemple #10
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # When evaluating on training data, check whether the groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
            gt_classes_list = labels[
                fields.InputDataFields.groundtruth_classes]
            gt_masks_list = None
            if fields.InputDataFields.groundtruth_instance_masks in labels:
                gt_masks_list = labels[
                    fields.InputDataFields.groundtruth_instance_masks]
            gt_keypoints_list = None
            if fields.InputDataFields.groundtruth_keypoints in labels:
                gt_keypoints_list = labels[
                    fields.InputDataFields.groundtruth_keypoints]
            gt_weights_list = None
            if fields.InputDataFields.groundtruth_weights in labels:
                gt_weights_list = labels[
                    fields.InputDataFields.groundtruth_weights]
            gt_confidences_list = None
            if fields.InputDataFields.groundtruth_confidences in labels:
                gt_confidences_list = labels[
                    fields.InputDataFields.groundtruth_confidences]
            gt_is_crowd_list = None
            if fields.InputDataFields.groundtruth_is_crowd in labels:
                gt_is_crowd_list = labels[
                    fields.InputDataFields.groundtruth_is_crowd]
            detection_model.provide_groundtruth(
                groundtruth_boxes_list=gt_boxes_list,
                groundtruth_classes_list=gt_classes_list,
                groundtruth_confidences_list=gt_confidences_list,
                groundtruth_masks_list=gt_masks_list,
                groundtruth_keypoints_list=gt_keypoints_list,
                groundtruth_weights_list=gt_weights_list,
                groundtruth_is_crowd_list=gt_is_crowd_list)

        preprocessed_images = features[fields.InputDataFields.image]
        if use_tpu and train_config.use_bfloat16:
            with tf.contrib.tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape])
                for k, v in prediction_dict.items():
                    if v.dtype == tf.bfloat16:
                        prediction_dict[k] = tf.cast(v, tf.float32)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape])

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = tf.contrib.tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            if train_config.fine_tune_checkpoint and hparams.load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            losses_dict = detection_model.loss(
                prediction_dict,
                features[fields.InputDataFields.true_image_shape])
            losses = [loss_tensor for loss_tensor in losses_dict.values()]
            if train_config.add_regularization_loss:
                regularization_losses = detection_model.regularization_losses()
                if regularization_losses:
                    regularization_loss = tf.add_n(regularization_losses,
                                                   name='regularization_loss')
                    losses.append(regularization_loss)
                    losses_dict[
                        'Loss/regularization_loss'] = regularization_loss
            total_loss = tf.add_n(losses, name='total_loss')
            losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
                    training_optimizer)

            ## ADDED for multi-gpu
            training_optimizer = hvd.DistributedOptimizer(
                training_optimizer, device_dense='/cpu:0')

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = tf.contrib.layers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
def main(_):
    """
    Builds the model and runs training, evaluation, and testing.
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    if len(config_train.name) > 0:
        output_dir = os.path.join(FLAGS.output_dir, config_train.name)
    else:
        output_dir = FLAGS.output_dir
    tx.utils.maybe_create_dir(output_dir)

    ## Loads GPT-2 model configuration

    if FLAGS.config_type == "json":
        gpt2_config = model_utils.transform_gpt2_to_texar_config(
            FLAGS.config_model)
    elif FLAGS.config_type == 'texar':
        gpt2_config = importlib.import_module(FLAGS.config_model)
    else:
        raise ValueError('Unknown config_type.')

    # Creates a data pre-processor for, e.g., BPE encoding
    proc = processor.get_encoder(FLAGS.pretrained_model_dir)
    end_token = proc.encoder['<|endoftext|>']

    max_decoding_length = config_train.max_decoding_length
    assert max_decoding_length <= gpt2_config.position_size, (
        "max_decoding_length should not be greater than position_size. "
        "{}>{}".format(max_decoding_length, gpt2_config.position_size))

    ## Loads data

    # Configures training data sharding in distributed mode
    if FLAGS.distributed:
        config_train.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_train.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_train.train_hparam["batch_size"] //= hvd.size()

    datasets = {}
    #if FLAGS.do_train:
    train_dataset = tx.data.TFRecordData(hparams=config_train.train_hparam)
    datasets['train'] = train_dataset
    #if FLAGS.do_eval:
    dev_dataset = tx.data.TFRecordData(hparams=config_train.dev_hparam)
    datasets['dev'] = dev_dataset
    #if FLAGS.do_test:
    test_dataset = tx.data.TFRecordData(hparams=config_train.test_hparam)
    datasets['test'] = test_dataset
    iterator = tx.data.FeedableDataIterator(datasets)
    batch = iterator.get_next()
    batch_size = tf.shape(batch['x1x4_ids'])[0]

    ## Builds the GPT-2 model
    vocab_size = gpt2_config.vocab_size

    word_embedder = tx.modules.WordEmbedder(vocab_size=vocab_size,
                                            hparams=gpt2_config.embed)

    pos_embedder = tx.modules.PositionEmbedder(
        position_size=gpt2_config.position_size, hparams=gpt2_config.pos_embed)

    # Ties output layer with input word embedding
    output_layer = tf.transpose(word_embedder.embedding, (1, 0))

    decoder = tx.modules.TransformerDecoder(vocab_size=vocab_size,
                                            output_layer=output_layer,
                                            hparams=gpt2_config.decoder)

    def _embedding_fn(ids, times):
        return word_embedder(ids) + pos_embedder(times)

    # For training
    def _get_recon_loss(ids,
                        full_len,
                        prefix_len=None,
                        mask_prefix=True,
                        do_print=False):
        ids = ids[:, :tf.reduce_max(full_len)]
        batch_size__ = tf.shape(ids)[0]
        seq_len = tf.fill([batch_size__], tf.shape(ids)[1])
        pos_embeds = pos_embedder(sequence_length=seq_len)
        input_embeds = word_embedder(ids) + pos_embeds

        # greedy output
        outputs = decoder(inputs=input_embeds,
                          decoding_strategy='train_greedy')

        max_full_len = tf.reduce_max(full_len)
        ids = ids[:, :max_full_len]
        logits = outputs.logits[:, :max_full_len]

        if mask_prefix:
            loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy(
                labels=ids[:, 1:],
                logits=logits[:, :-1, :],
                sequence_length=full_len - 1,
                average_across_timesteps=False,
                sum_over_timesteps=False,
                average_across_batch=False,
                sum_over_batch=False)
            mask_recon = tf.sequence_mask(full_len - 1, dtype=tf.float32)
            mask_recon_prefix = 1 - tf.sequence_mask(
                prefix_len - 1,
                maxlen=max_full_len - 1,  #max_decoding_length-1,
                dtype=tf.float32)
            mask_recon = mask_recon * mask_recon_prefix

            if do_print:
                print_op_1 = tf.print(mask_recon)
                loss_recon_flat = tx.utils.reduce_with_weights(
                    tensor=loss_recon,
                    weights=mask_recon,
                    average_across_remaining=False,
                    sum_over_remaining=False,
                    average_across_batch=False)
                print_op_2 = tf.print(loss_recon_flat)
                with tf.control_dependencies([print_op_1, print_op_2]):
                    loss_recon = tx.utils.reduce_with_weights(
                        tensor=loss_recon,
                        weights=mask_recon,
                        average_across_remaining=True,
                        sum_over_remaining=False)
                return loss_recon, mask_recon, loss_recon_flat
            else:
                loss_recon = tx.utils.reduce_with_weights(
                    tensor=loss_recon,
                    weights=mask_recon,
                    average_across_remaining=True,
                    sum_over_remaining=False)
        else:
            loss_recon = tx.losses.sequence_sparse_softmax_cross_entropy(
                labels=ids[:, 1:],
                logits=logits[:, :-1, :],
                sequence_length=full_len - 1,
                average_across_timesteps=True,
                sum_over_timesteps=False,
                average_across_batch=False,
                sum_over_batch=False)

        return loss_recon

    # For RL fine-tuning
    def _get_sample_story(context_ids, context_len):
        sample_output, sample_len = decoder(
            decoding_strategy='infer_sample',
            embedding=_embedding_fn,
            context=context_ids,
            context_sequence_length=context_len,
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            softmax_temperature=FLAGS.temperature,
            mode=tf.estimator.ModeKeys.PREDICT)

        return sample_output, sample_len
        # return ids, batch_loss, ids_len

    def _get_sample_rolled(output, length, context_len):

        ids = output.sample_id
        ids = tx.utils.varlength_roll(ids,
                                      -context_len)  # final sample ids rolled
        ids_len = length - context_len
        ids = ids[:, :tf.reduce_max(ids_len)]

        return ids, ids_len

    def compute_batch_loss(output, sample_len, context_len):
        max_full_len = tf.reduce_max(sample_len)
        ids = output.sample_id[:, :max_full_len]
        logits = output.logits[:, :max_full_len]  #(bs, sl, vocab)

        sampleLogprobs = tx.losses.sequence_sparse_softmax_cross_entropy(
            labels=ids[:, 1:],
            logits=logits,
            sequence_length=sample_len - 1,
            average_across_timesteps=False,
            sum_over_timesteps=False,
            average_across_batch=False,
            sum_over_batch=False)

        mask = tf.sequence_mask(sample_len - 1, dtype=tf.float32)
        mask_prefix = 1 - tf.sequence_mask(
            context_len - 1,
            maxlen=max_full_len - 1,  #max_decoding_length-1,
            dtype=tf.float32)
        mask = mask * mask_prefix

        batch_loss = tx.utils.reduce_with_weights(
            tensor=sampleLogprobs,
            weights=mask,
            average_across_batch=False,
            average_across_remaining=True,
            sum_over_remaining=False)

        return batch_loss

    def _get_greedy_story(context_ids, context_len):

        greedy_res, greedy_len = decoder(
            decoding_strategy='infer_greedy',
            embedding=_embedding_fn,
            context=context_ids,
            context_sequence_length=context_len,
            max_decoding_length=max_decoding_length,
            end_token=end_token,
            mode=tf.estimator.ModeKeys.PREDICT)

        greedy_ids = tx.utils.varlength_roll(greedy_res.sample_id,
                                             -context_len)
        greedy_ids_len = greedy_len - context_len
        greedy_ids = greedy_ids[:, :tf.reduce_max(greedy_ids_len)]

        return greedy_ids, greedy_ids_len

    ## ROC Loss-1: ML loss
    x1_len = tf.placeholder(tf.int32, shape=[None], name='x1_len')
    x1x4_ids = tf.placeholder(tf.int32, shape=[None, None], name='x1x4_ids')
    x1x4_len = tf.placeholder(tf.int32, shape=[None], name='x1x4_len')

    loss_fine = _get_recon_loss(x1x4_ids, x1x4_len, x1_len)

    x1_ids = tf.placeholder(tf.int32, shape=[None, None], name='x1_ids')
    reward = tf.placeholder(tf.float32, shape=[None], name="reward")
    sampled_story = tf.placeholder(tf.int32,
                                   shape=[None, None],
                                   name="sampled_story")  #smilar to sample_que
    sampled_story_len = tf.placeholder(tf.int32,
                                       shape=[None],
                                       name='sample_story_len')

    ## Loss-2: RL loss
    symbols_output, symbols_len = _get_sample_story(x1_ids, x1_len)
    symbols_rl, len_rl = _get_sample_rolled(symbols_output, symbols_len,
                                            x1_len)
    symbols_gr, len_gr = _get_greedy_story(x1_ids, x1_len)
    batch_loss_rl = _get_recon_loss(sampled_story,
                                    sampled_story_len,
                                    mask_prefix=False)
    rl_loss_fine = tf.reduce_mean(batch_loss_rl * reward)

    def _get_beam_ids(context_ids, context_len, target):
        # beam-search
        predictions = decoder(beam_width=5,
                              length_penalty=config_train.length_penalty,
                              embedding=_embedding_fn,
                              context=context_ids,
                              context_sequence_length=context_len,
                              max_decoding_length=max_decoding_length,
                              end_token=end_token,
                              mode=tf.estimator.ModeKeys.PREDICT)

        beam_output_ids = tx.utils.varlength_roll(
            predictions["sample_id"][:, :, 0], -context_len)
        target_ids = tx.utils.varlength_roll(target, -context_len)

        return beam_output_ids, target_ids

    target_ids = tx.utils.varlength_roll(x1x4_ids, -x1_len)

    tau = tf.placeholder(tf.float32, shape=[], name='tau')

    if not FLAGS.sc_rl:
        loss = config_train.w_fine * loss_fine

        loss_dict = {
            'loss': loss,
            'loss_fine': config_train.w_fine * loss_fine,
        }

    else:
        loss = (1 - config_train.w_rl
                ) * config_train.w_fine * loss_fine + config_train.w_rl * (
                    config_train.w_fine_rl * rl_loss_fine)  #

        loss_dict = {
            'loss':
            loss,
            'loss_fine':
            (1 - config_train.w_rl) * config_train.w_fine * loss_fine,
            'rl_loss_fine':
            config_train.w_rl * config_train.w_fine_rl * rl_loss_fine,
        }

    ## Inference

    def _infer(context_name):
        helper = tx.modules.TopKSampleEmbeddingHelper(
            embedding=_embedding_fn,
            start_tokens=batch['%s_ids' % context_name][:, 0],
            end_token=end_token,
            top_k=FLAGS.top_k,
            softmax_temperature=FLAGS.temperature)
        outputs_infer, len_infer = decoder(
            context=batch['%s_ids' % context_name],
            context_sequence_length=batch['%s_len' % context_name],
            max_decoding_length=max_decoding_length,
            helper=helper)  # outputs_infer contains sample_id and logits
        yy_ids = tx.utils.varlength_roll(
            outputs_infer.sample_id,
            -batch['%s_len' %
                   context_name])  # shift beginning indices (context) to end
        yy_len = len_infer - batch['%s_len' % context_name]
        yy_ids = yy_ids[:, :tf.reduce_max(yy_len)]
        return yy_ids, yy_len

    x4_ids_fine, x4_len_fine = _infer('x1')

    def _infer_beam_ids(context_name):
        # beam-search
        predictions = decoder(beam_width=5,
                              length_penalty=config_train.length_penalty,
                              embedding=_embedding_fn,
                              context=batch['%s_ids' % context_name],
                              context_sequence_length=batch['%s_len' %
                                                            context_name],
                              max_decoding_length=max_decoding_length,
                              end_token=end_token,
                              mode=tf.estimator.ModeKeys.PREDICT)

        beam_output_ids = tx.utils.varlength_roll(
            predictions["sample_id"][:, :, 0], -batch['%s_len' % context_name])

        return beam_output_ids

    beam_search_ids = _infer_beam_ids('x1')

    ## Optimization
    trainable_variables = tx.utils.collect_trainable_variables(
        [word_embedder, pos_embedder, decoder])

    global_step = tf.Variable(0, trainable=False)
    opt = tx.core.get_optimizer(global_step=global_step,
                                hparams=config_train.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)
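        # DistributedOptimizer wraps the underlying optimizer so that gradients
        # are averaged across all Horovod workers (allreduce) before they are
        # applied locally.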

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt,
                                               variables=trainable_variables)

    ## Train/eval/test routine
    saver = tf.train.Saver()
    saver_best = tf.train.Saver(max_to_keep=1)
    dev_best = {
        'loss': 1e8,
        'loss_fine': 1e8,
        'rl_loss_fine': 1e8,
        'best_reward': -1e8,
        'bleu': 0.,
        'meteor': 0.
    }  #'best_reward': -1e8

    def _log_losses(losses, step=None):
        loss_str = 'loss: %.4f, loss_fine: %.4f, rl_loss_fine: %.4f' % \
            (losses['loss'], losses['loss_fine'], losses['rl_loss_fine']
             )

        if step is not None:
            loss_str = 'step: %d, %s' % (step, loss_str)

        _log(loss_str)

    def _is_head():
        if not FLAGS.distributed:
            return True
        else:
            return hvd.rank() == 0

    def _train_epoch(sess, initial=False):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        # load train arc label data
        train_arc_file = [
            i.strip().split() for i in open(
                os.path.join(config_train.arc_data, "train_mapped.txt"))
        ]

        iterator.restart_dataset(sess, 'train')

        while True:
            try:
                # (1) Get data and yy sample
                fetches_data = {
                    'batch': batch,
                    'batch_size': batch_size,
                }
                feed_dict_data = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets_data = sess.run(fetches_data, feed_dict_data)

                reward_fetches = {
                    'sample_rl': symbols_rl,
                    'sample_len': len_rl,
                    'greedy_sym': symbols_gr,
                    'greedy_len': len_gr,
                }
                reward_rets = sess.run(reward_fetches,
                                       feed_dict={
                                           x1_ids:
                                           rets_data['batch']['x1_ids'],
                                           x1_len:
                                           rets_data['batch']['x1_len'],
                                           tx.global_mode():
                                           tf.estimator.ModeKeys.PREDICT
                                       })

                # prepare sample stories for classification
                ids_rl, text_rl = _get_text(
                    proc, reward_rets['sample_rl'],
                    reward_rets['sample_len'])  #list of list
                story_rl = format_generated_stories_for_clf(
                    text_rl, FLAGS.rl_method)
                #print("Rl Story: ", story_rl)
                _, text_base = _get_text(proc, reward_rets['greedy_sym'],
                                         reward_rets['greedy_len'])
                story_base = format_generated_stories_for_clf(
                    text_base, FLAGS.rl_method)
                #print("Greedy Story", story_base)

                # add reward calculation here
                reward_rl = get_reward(predictor,
                                       story_rl,
                                       rets_data['batch']['unique_id'],
                                       train_arc_file,
                                       method=FLAGS.rl_method)
                reward_base = get_reward(predictor,
                                         story_base,
                                         rets_data['batch']['unique_id'],
                                         train_arc_file,
                                         method=FLAGS.rl_method)

                # self-critical reward
                reward_sc = [
                    rr - rb for rr, rb in zip(reward_rl, reward_base)
                ]  # class list
                # print(reward_rl, reward_base, reward_sc)
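                # Self-critical baseline: the sampled story's reward minus the
                # greedy story's reward. Feeding this into the `reward`
                # placeholder weights batch_loss_rl inside rl_loss_fine, so
                # samples that beat the greedy baseline have their
                # log-probability increased.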

                ids_rl = utils.list_strip_eos(ids_rl, end_token)
                new_in_sample_ids, new_in_sample_len = _fix(ids_rl, end_token)

                # (2) Optimize loss
                feed_dict = {
                    x1_ids: rets_data['batch']['x1_ids'],
                    x1_len: rets_data['batch']['x1_len'],
                    x1x4_ids: rets_data['batch']['x1x4_ids'],
                    x1x4_len: rets_data['batch']['x1x4_len'],
                    sampled_story: new_in_sample_ids,
                    sampled_story_len: new_in_sample_len,
                    tau: config_train.tau,
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                    reward: np.array(reward_sc)
                }

                fetches = {
                    'train_op': train_op,
                    'step': global_step,
                }
                fetches.update(loss_dict)

                rets = sess.run(fetches, feed_dict, options=run_opts)
                step = rets['step']

                dis_steps = config_train.display_steps

                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    _log_losses(rets, step)

                eval_steps = config_train.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _dev_epoch(sess, evaluate_func=evaluate_full)
                # not used
                sample_steps = config_train.sample_steps
                if _is_head(
                ) and sample_steps > 0 and step % sample_steps == 0:
                    print('-----------testing-----------------')
                    _test_epoch(sess, step=step)
                # not used
                ckpt_steps = config_train.checkpoint_steps
                if _is_head() and ckpt_steps > 0 and step % ckpt_steps == 0:
                    ckpt_fn = os.path.join(output_dir, 'model.ckpt')
                    ckpt_fn = saver.save(sess, ckpt_fn, global_step=step)
                    _log('Checkpoint to {}'.format(ckpt_fn))

            except tf.errors.OutOfRangeError:
                break

    def _dev_epoch(sess, evaluate_func=evaluate_full):
        """Evaluates on the dev set.
        """
        dev_arc_file = [
            i.strip().split() for i in open(
                os.path.join(config_train.arc_data, "dev_mapped.txt"))
        ]
        with open(
                os.path.join(config_train.tfrecord_data_dir,
                             "x4_emo_features.dev"), 'rb') as fp:
            emotion_feats = np.array(pickle.load(fp))
        iterator.restart_dataset(sess, 'dev')

        nsamples = 0
        hypotheses = []
        references = []
        reward_score = []
        losses = []
        hypotheses_dict = {}

        while True:
            try:

                # (1) Get data and yy sample
                fetches_data = {
                    'batch': batch,
                    'batch_size': batch_size,
                }
                feed_dict_data = {
                    iterator.handle: iterator.get_handle(sess, 'dev'),
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets_data = sess.run(fetches_data, feed_dict_data)

                # (2) eval loss
                feed_dict = {
                    x1_ids: rets_data['batch']['x1_ids'],
                    x1_len: rets_data['batch']['x1_len'],
                    x1x4_ids: rets_data['batch']['x1x4_ids'],
                    x1x4_len: rets_data['batch']['x1x4_len'],
                    # x4_emo: rets_data['batch']['x4_emo'],
                    tau: config_train.tau,
                    tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }

                # rets_loss = sess.run(fetches, feed_dict)

                fetches = {
                    'loss_fine': loss_dict['loss_fine'],
                    #'beam_search_ids': beam_search_ids,
                    'greedy_sym': symbols_gr,
                    'greedy_len': len_gr,
                    'target_ids': target_ids
                }
                rets = sess.run(fetches, feed_dict)

                losses.append(rets['loss_fine'])
                _, beam_text = _get_text(proc, rets['greedy_sym'],
                                         rets['greedy_len'])
                beam_story = format_generated_stories_for_clf(
                    beam_text, FLAGS.rl_method)
                _, target_text = _get_text(proc, rets['target_ids'],
                                           rets_data['batch']['x1x4_len'])

                hypotheses.extend(beam_text)
                references.extend(target_text)
                hypotheses_dict_ = generate_all_valid_sample_dict(
                    predictor,
                    rets_data['batch']['unique_id'],
                    beam_story,
                    method=FLAGS.rl_method)
                for key, react in hypotheses_dict_.items():
                    if key not in hypotheses_dict:
                        hypotheses_dict[
                            key] = react  # dictionary key=unique_id value =list of list

                nsamples += rets_data['batch_size']
            except tf.errors.OutOfRangeError:
                break

        avg_loss = np.mean(losses)
        metrics = evaluate_func(references,
                                hypotheses,
                                hypotheses_dict,
                                dev_arc_file,
                                emotion_feats,
                                method=FLAGS.rl_method)
        msg = 'loss_fine: %.4f, bleu: %.4f, meteor: %.4f, reward: %.4f' % \
            (avg_loss, metrics['bleu'], metrics['meteor'], metrics["best_reward"]
             )

        _log('nsamples validation: %d' % nsamples)
        _log(msg)

        if FLAGS.best_model == "emotion":
            if FLAGS.do_train and metrics["best_reward"] > dev_best[
                    'best_reward']:
                # dev_best.update(results.avg())
                dev_best['loss_fine'] = avg_loss
                dev_best['best_reward'] = metrics["best_reward"]
                dev_best.update(metrics)
                ckpt_fn = os.path.join(output_dir, 'model_best.ckpt')
                ckpt_fn = saver_best.save(sess, ckpt_fn)
                _log('Checkpoint best to {}'.format(ckpt_fn))

        elif FLAGS.best_model == "bleu":
            if FLAGS.do_train and metrics["bleu"] > dev_best['bleu']:
                # dev_best.update(results.avg())
                dev_best['loss_fine'] = avg_loss
                dev_best['best_reward'] = metrics["best_reward"]
                dev_best.update(metrics)
                ckpt_fn = os.path.join(output_dir, 'model_best.ckpt')
                ckpt_fn = saver_best.save(sess, ckpt_fn)
                _log('Checkpoint best to {}'.format(ckpt_fn))

        elif FLAGS.do_train and avg_loss < dev_best['loss']:
            # dev_best.update(results.avg())
            dev_best['loss_fine'] = avg_loss
            dev_best.update(metrics)
            dev_best['best_reward'] = metrics["best_reward"]
            ckpt_fn = os.path.join(output_dir, 'model_best.ckpt')
            ckpt_fn = saver_best.save(sess, ckpt_fn)
            _log('Checkpoint best to {}'.format(ckpt_fn))

    def _test_epoch(sess, step=None):
        """Generates samples on the test set.
        """
        iterator.restart_dataset(sess, 'test')

        _all_inputs = []
        _all_samples = []

        if FLAGS.finetune:
            _log('Generation input: x1')
            fetches = {
                'inputs': batch['x1_ids'],
                'length': batch['x1_len'],
                'samples_length': x4_len_fine,
                'samples': x4_ids_fine
            }
            res_fn_appendix = "x1"

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                rets = sess.run(fetches, feed_dict=feed_dict)

                _inputs = []
                for i, l in zip(rets['inputs'], rets['length']):
                    # Delete padding
                    _inputs.append(i[:l].tolist())
                _all_inputs.extend(_inputs)

                _samples = []
                for s, l in zip(rets['samples'], rets['samples_length']):
                    _samples.append(s[:l].tolist(
                    ))  # rets['samples'] are np array [bs, max_seq_len=200]

                _all_samples.extend(_samples)

            except tf.errors.OutOfRangeError:
                break

        # Parse samples and write to file

        eos_token_id = proc.encoder['<|endoftext|>']

        _all_input_text = []
        for i in _all_inputs:
            if i[0] == eos_token_id:
                i = i[1:]
            i_text = proc.decode(i)
            _all_input_text.append(i_text)
        _all_input_text = tx.utils.strip_eos(_all_input_text,
                                             eos_token='<|endoftext|>')

        _all_samples_text = []
        for i, s in zip(_all_inputs, _all_samples):
            s_text = proc.decode(s)
            s_text = s_text.strip(" |").replace('\n', ' ')
            _all_samples_text.append(s_text)
        _all_samples_text = tx.utils.strip_eos(_all_samples_text,
                                               eos_token='<|endoftext|>')

        if step is None:
            fn = "test_samples_%s.tsv" % res_fn_appendix
        else:
            fn = "test_samples_%s_%d.tsv" % (res_fn_appendix, step)
        output_file = os.path.join(output_dir, fn)
        _log('Write samples to {}'.format(output_file))
        tx.utils.write_paired_text(_all_input_text, _all_samples_text,
                                   output_file)

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())
        # Assigning a new GPUOptions proto here would raise an error (protobuf
        # composite fields cannot be assigned) and would also discard
        # visible_device_list, so set the flag on the existing gpu_options.
        session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        #smry_writer = tf.summary.FileWriter(FLAGS.output_dir, graph=sess.graph)

        if FLAGS.distributed:
            bcast.run()

        #Restores trained model if specified
        if FLAGS.checkpoint:
            _log('Restore from {}'.format(FLAGS.checkpoint))
            saver.restore(sess, FLAGS.checkpoint)
        elif FLAGS.pretrain_checkpoint:
            _log('Restore from {}'.format(FLAGS.pretrain_checkpoint))
            model_utils.init_gpt2_checkpoint(sess, FLAGS.pretrain_checkpoint)
            print("\nFinished loading\n")
            saver.save(sess, output_dir + '/gpt2_model.ckpt')

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for epoch in range(config_train.max_train_epoch):
                print("Training epoch {}".format(epoch))
                _train_epoch(sess, epoch == 0)
            saver.save(sess, output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _dev_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(inputs=input_layer,
                             filters=32,
                             kernel_size=[5, 5],
                             padding="same",
                             activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 28, 28, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(inputs=pool1,
                             filters=64,
                             kernel_size=[5, 5],
                             padding="same",
                             activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 14, 14, 64]
    # Output Tensor Shape: [batch_size, 7, 7, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 7, 7, 64]
    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat,
                            units=1024,
                            activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(inputs=dense,
                                rate=0.4,
                                training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 10]
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Horovod: scale learning rate by the number of workers.
        optimizer = tf.train.MomentumOptimizer(learning_rate=0.001 *
                                               hvd.size(),
                                               momentum=0.9)

        # Horovod: add Horovod Distributed Optimizer.
        optimizer = hvd.DistributedOptimizer(optimizer)

        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy":
        tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metric_ops)
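# Driver sketch for cnn_model_fn above (assumed, not part of the original
# example): the usual Horovod Estimator setup pins one GPU per process,
# lets only rank 0 write checkpoints, and broadcasts the rank-0 variables at
# the start of training via BroadcastGlobalVariablesHook.
import numpy as np
import tensorflow as tf
import horovod.tensorflow as hvd


def run_cnn_sketch():
    hvd.init()
    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    model_dir = './mnist_cnn_sketch' if hvd.rank() == 0 else None
    estimator = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=session_config))
    # Random stand-in data with MNIST-like shapes, just to exercise the graph.
    train_x = np.random.rand(256, 784).astype(np.float32)
    train_y = np.random.randint(0, 10, size=256).astype(np.int32)
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={"x": train_x}, y=train_y, batch_size=32, num_epochs=None, shuffle=True)
    estimator.train(input_fn=train_input_fn,
                    steps=200 // hvd.size(),
                    hooks=[hvd.BroadcastGlobalVariablesHook(0)])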
Exemple #13
0
def run_mnist(_):
    # Import data
    mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir +
                                                'MNIST-data-%d' % hvd.rank(),
                                                one_hot=True)

    # Create the model
    with tf.name_scope("mnist_placholder"):
        x = tf.placeholder(tf.float32, [None, 784])
        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))
        y = tf.matmul(x, W) + b

        # Define loss and optimizer
        y_ = tf.placeholder(tf.float32, [None, 10])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
    # outputs of 'y', and then average across the batch.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    #global_step = tf.train.get_or_create_global_step()
    global_step = tf.contrib.framework.get_or_create_global_step()
    opt = tf.train.GradientDescentOptimizer(0.5)
    # Add MPI Distributed Optimizer
    with tf.name_scope("horovod_opt"):
        opt = hvd.DistributedOptimizer(opt)
    train_step = opt.minimize(cross_entropy, global_step=global_step)

    # The StopAtStepHook handles stopping after running given steps.
    hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        tf.train.StopAtStepHook(last_step=10)
    ]

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Enable soft placement and tracing as needed
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True,
                            inter_op_parallelism_threads=1)
    config_ngraph_enabled = ngraph_bridge.update_config(config)

    #config.graph_options.optimizer_options.global_jit_level = jit_level
    run_metadata = tf.RunMetadata()

    #init_op = tf.global_variables_initializer()
    print("Variables initialized ...")

    # The MonitoredTrainingSession takes care of session initialization
    with tf.train.MonitoredTrainingSession(
            hooks=hooks, config=config_ngraph_enabled) as mon_sess:
        start = time.time()
        train_writer = tf.summary.FileWriter(FLAGS.log_dir, mon_sess.graph)
        while not mon_sess.should_stop():
            # Train
            batch_xs, batch_ys = mnist.train.next_batch(100)
            mon_sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

            # Test trained model
            if not mon_sess.should_stop():
                print(
                    "Accuracy: ",
                    mon_sess.run(accuracy,
                                 feed_dict={
                                     x: mnist.test.images,
                                     y_: mnist.test.labels
                                 }))

        end = time.time()

    if hvd.rank() == 0:
        print("Training time: %f seconds" % (end - start))
Exemple #14
0
def main(_):
    mnist = input_data.read_data_sets(
        './mnist', one_hot=True)  # images have been normalized to the range (0, 1)
    test_x = mnist.test.images[:2000]
    test_y = mnist.test.labels[:2000]

    # plot one example
    print(mnist.train.images.shape)  # (55000, 28 * 28)
    print(mnist.train.labels.shape)  # (55000, 10)

    # Init horovod
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf_x = tf.placeholder(tf.float32, [None, 28 * 28]) / 255.
    image = tf.reshape(tf_x,
                       [-1, 28, 28, 1])  # (batch, height, width, channel)
    tf_y = tf.placeholder(tf.int32, [None, 10])  # input y

    # get global step
    global_step = tf.train.get_or_create_global_step()

    # CNN
    conv1 = tf.layers.conv2d(  # shape (28, 28, 1)
        inputs=image,
        filters=16,
        kernel_size=5,
        strides=1,
        padding='same',
        activation=tf.nn.relu)  # -> (28, 28, 16)
    pool1 = tf.layers.max_pooling2d(
        conv1,
        pool_size=2,
        strides=2,
    )  # -> (14, 14, 16)
    conv2 = tf.layers.conv2d(pool1, 32, 5, 1, 'same',
                             activation=tf.nn.relu)  # -> (14, 14, 32)
    pool2 = tf.layers.max_pooling2d(conv2, 2, 2)  # -> (7, 7, 32)
    flat = tf.reshape(pool2, [-1, 7 * 7 * 32])  # -> (7*7*32, )
    output = tf.layers.dense(flat, 10)  # output layer

    accuracy = tf.metrics.accuracy(  # return (acc, update_op), and create 2 local variables
        labels=tf.argmax(tf_y, axis=1),
        predictions=tf.argmax(output, axis=1),
    )[1]

    loss = tf.losses.softmax_cross_entropy(onehot_labels=tf_y,
                                           logits=output)  # compute cost
    optimizer = tf.train.AdamOptimizer(LR *
                                       hvd.size())  # Increase learning rate
    optimizer = hvd.DistributedOptimizer(
        optimizer)  # Add Horovod Distributed Optimizer
    train_op = optimizer.minimize(loss, global_step=global_step)

    # define hooks
    hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        tf.train.StopAtStepHook(last_step=600 // hvd.size()),
    ]
    if hvd.rank() == 0:
        hooks.append(
            tf.train.LoggingTensorHook(tensors={
                'step': global_step,
                'loss': loss
            },
                                       every_n_iter=10))

    # Use MonitoredTrainingSession
    with tf.train.MonitoredTrainingSession(config=config,
                                           hooks=hooks) as mon_sess:
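        # Manually shard the work: every step draws BATCH_SIZE * hvd.size() examples
        # and each rank keeps only its own disjoint BATCH_SIZE-sized slice.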
        start = BATCH_SIZE * hvd.rank()
        end = BATCH_SIZE * (hvd.rank() + 1)
        print(start, end, BATCH_SIZE * hvd.size())
        while not mon_sess.should_stop():
            b_x, b_y = mnist.train.next_batch(BATCH_SIZE * hvd.size())
            b_x, b_y = b_x[start:end], b_y[start:end]
            mon_sess.run([train_op, loss], {tf_x: b_x, tf_y: b_y})
Exemple #15
0
    def __init__(self,
                 league_mgr_addr,
                 model_pool_addrs,
                 learner_ports,
                 rm_size,
                 batch_size,
                 ob_space,
                 ac_space,
                 policy,
                 gpu_id,
                 policy_config={},
                 ent_coef=1e-2,
                 distill_coef=1e-2,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 rwd_shape=False,
                 pub_interval=500,
                 log_interval=100,
                 save_interval=0,
                 total_timesteps=5e7,
                 burn_in_timesteps=0,
                 learner_id='',
                 batch_worker_num=4,
                 pull_worker_num=2,
                 unroll_length=32,
                 rollout_length=1,
                 use_mixed_precision=False,
                 use_sparse_as_dense=True,
                 adam_beta1=0.9,
                 adam_beta2=0.999,
                 adam_eps=1e-5,
                 data_type=PGData,
                 data_server_version='v1',
                 decode=False,
                 log_infos_interval=20,
                 **kwargs):
        super(PGLearner, self).__init__(league_mgr_addr, model_pool_addrs,
                                        learner_ports, learner_id)

        self.LR = tf.placeholder(tf.float32, [])
        """Learning Rate"""

        self.CLIPRANGE = tf.placeholder(tf.float32, [])
        """Learning Rate Clip Range"""

        self.ep_loss_coef = {}
        """Coefficients for those losses from the endpoints. Override it in derived
     class."""

        # TODO(pengsun): fix the policy_config default value
        self._init_const(total_timesteps, burn_in_timesteps, batch_size,
                         unroll_length, rwd_shape, ent_coef, vf_coef,
                         pub_interval, log_interval, save_interval, policy,
                         distill_coef, policy_config, rollout_length)

        # allow_soft_placement=True can fix issue when some op cannot be defined on
        # GPUs for tf-1.8.0; tf-1.13.1 does not have this issue
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(gpu_id)
        self.sess = tf.Session(config=config)
        self.rank = hvd.rank() if has_hvd else 0

        # Prepare dataset
        ds = data_type(ob_space,
                       ac_space,
                       self.n_v,
                       use_lstm=self.rnn,
                       hs_len=self.hs_len,
                       distillation=self.distillation,
                       version='v2')
        self._data_server = DataServer(self._pull_data,
                                       rm_size,
                                       unroll_length,
                                       batch_size,
                                       ds,
                                       gpu_id_list=(0, ),
                                       batch_worker_num=batch_worker_num,
                                       pull_worker_num=pull_worker_num,
                                       rollout_length=rollout_length,
                                       prefetch_buffer_size=2,
                                       version=data_server_version,
                                       decode=decode,
                                       log_infos_interval=log_infos_interval)

        # prepare net config
        net_config = policy.net_config_cls(ob_space, ac_space, **policy_config)
        net_config.clip_range = self.CLIPRANGE
        if rwd_shape:
            # make net_config.reward-shaping-weights a tf.placeholder so as to change
            # it during training.
            # NOTE: Assume there is reward_weights_shape in net_config
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            reward_weights_shape = net_config.reward_weights_shape
            self.rwd_weights = tf.placeholder(tf.float32, reward_weights_shape)
            net_config.reward_weights = self.rwd_weights
        if hasattr(net_config, 'lam'):
            # make net_config.lambda-for-td-lambda a tf.placeholder so as to change it
            #  during training.
            # TODO(pengsun): use NetInputsData instead of this quick-and-dirty hacking?
            self.LAM = tf.placeholder(tf.float32, [])
            net_config.lam = self.LAM
        else:
            self.LAM = None

        # build the policy net
        with tf.variable_scope('model', reuse=tf.AUTO_REUSE) as model_scope:
            pass

        def create_policy(inputs, nc):
            return policy.net_build_fun(inputs=inputs,
                                        nc=nc,
                                        scope=model_scope)

        device = '/gpu:{}'.format(0)
        with tf.device(device):
            input_data = self._data_server.input_datas[0]
            if 'use_xla' in policy_config and policy_config['use_xla']:
                try:
                    # Use TensorFlow's accelerated linear algebra (XLA) JIT compilation
                    with tf.xla.experimental.jit_scope(True):
                        model = create_policy(input_data, net_config)
                except Exception:
                    logger.log(
                        "WARNING: using tf.xla requires tf version>=1.15.")
                    model = create_policy(input_data, net_config)
            else:
                model = create_policy(input_data, net_config)
            loss, vf_loss, losses = self.build_loss(model, input_data)
        if has_hvd:
            self.losses = [hvd.allreduce(loss) for loss in losses]
        else:
            self.losses = list(losses)
        self.params = tf.trainable_variables(scope='model')
        self.params_vf = tf.trainable_variables(scope='model/vf')
        self.param_norm = tf.global_norm(self.params)

        self.trainer = tf.train.AdamOptimizer(learning_rate=self.LR,
                                              beta1=adam_beta1,
                                              beta2=adam_beta2,
                                              epsilon=adam_eps)
        self.burn_in_trainer = tf.train.AdamOptimizer(
            learning_rate=self.LR, epsilon=1e-5)  # same as default and IL
        if use_mixed_precision:
            try:
                self.trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.trainer)
                self.burn_in_trainer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
                    self.burn_in_trainer)
            except Exception:
                logger.warn(
                    "using tf mixed_precision requires tf version>=1.15.")
        if has_hvd:
            self.trainer = hvd.DistributedOptimizer(
                self.trainer, sparse_as_dense=use_sparse_as_dense)
            self.burn_in_trainer = hvd.DistributedOptimizer(
                self.burn_in_trainer, sparse_as_dense=use_sparse_as_dense)
        grads_and_vars = self.trainer.compute_gradients(loss, self.params)
        grads_and_vars_vf = self.burn_in_trainer.compute_gradients(
            vf_loss, self.params_vf)
        clip_vars = model.vars.lstm_vars
        grads_and_vars, self.clip_grad_norm, self.nonclip_grad_norm = self.clip_grads_vars(
            grads_and_vars, clip_vars, max_grad_norm)
        grads_and_vars_vf, self.clip_grad_norm_vf, self.nonclip_grad_norm_vf = self.clip_grads_vars(
            grads_and_vars_vf, clip_vars, max_grad_norm)

        self._train_batch = self.trainer.apply_gradients(grads_and_vars)
        self._burn_in = self.burn_in_trainer.apply_gradients(grads_and_vars_vf)
        self.loss_endpoints_names = model.loss.loss_endpoints.keys()
        self._build_ops()
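        # Horovod synchronization ops: an allreduce over a dummy variable acts as a
        # barrier, and broadcast_global_variables(0) copies rank-0 weights to all workers.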
        if has_hvd:
            barrier_op = hvd.allreduce(tf.Variable(0.))
            broadcast_op = hvd.broadcast_global_variables(0)
        tf.global_variables_initializer().run(session=self.sess)
        self.sess.graph.finalize()

        self.barrier = lambda: self.sess.run(barrier_op) if has_hvd else None
        self.broadcast = lambda: self.sess.run(broadcast_op
                                               ) if has_hvd else None
        self.broadcast()
        # logging stuff
        format_strs = ['stdout', 'log', 'tensorboard', 'csv']
        logger.configure(dir='training_log/{}rank{}'.format(
            self._learner_id, self.rank),
                         format_strs=format_strs)
Exemple #16
0
dataset = dataset.repeat(100)
iterator = dataset.make_one_shot_iterator()
next_item = iterator.get_next()

# Define the model
slope = tf.Variable(np.random.randn())
offset = tf.Variable(np.random.randn())

x, y = next_item  # The model is the continuation of the pipeline

y_hat = slope * x + offset

loss = tf.losses.mean_squared_error(labels=y, predictions=y_hat)

opt = tf.train.GradientDescentOptimizer(.5)
train = hvd.DistributedOptimizer(opt).minimize(loss)

hooks = [hvd.BroadcastGlobalVariablesHook(0)]

history = []

with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    # Initialization of the variables `slope` and `offset`
    # is done automatically by tf.train.MonitoredTrainingSession
    print(
        'rank', hvd.rank(),
        'initial slope  = %12.6f\n       initial offset = %12.6f' % sess.run(
            (slope, offset)))
    while not sess.should_stop():
        _, loss_val = sess.run((train, loss))
        history.append([sess.run(slope), sess.run(offset), loss_val])
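The loop above ends only when the repeated dataset is exhausted. A step-bounded variant in the style of the other examples here would attach a global step and a StopAtStepHook (a sketch, not part of the original snippet):

global_step = tf.train.get_or_create_global_step()
train = hvd.DistributedOptimizer(opt).minimize(loss, global_step=global_step)

hooks = [
    hvd.BroadcastGlobalVariablesHook(0),
    # Split the step budget across workers, as the other examples do.
    tf.train.StopAtStepHook(last_step=1000 // hvd.size()),
]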
Exemple #17
0
    def __call__(self, features, labels, mode, params):

        if mode == tf.estimator.ModeKeys.TRAIN:
            mandatory_params = ["batch_size", "lr_init", "num_gpus", "steps_per_epoch",
                                "momentum", "weight_decay", "loss_scale", "label_smoothing"]
            for p in mandatory_params:
                if p not in params:
                    raise RuntimeError("Parameter {} is missing.".format(p))

        if mode == tf.estimator.ModeKeys.TRAIN and not self.model_hparams.use_dali:

            with tf.device('/cpu:0'):
                # Stage inputs on the host
                cpu_prefetch_op, (features, labels) = self._stage([features, labels])

            with tf.device('/gpu:0'):
                # Stage inputs to the device
                gpu_prefetch_op, (features, labels) = self._stage([features, labels])

        with tf.device("/gpu:0"):

            if features.dtype != self.model_hparams.dtype:
                features = tf.cast(features, self.model_hparams.dtype)

            # Subtract mean per channel
            # and enforce values between [-1, 1]
            if not self.model_hparams.use_dali:
                features = normalized_inputs(features)

            mixup = 0
            eta = 0
            
            if mode == tf.estimator.ModeKeys.TRAIN:        
                eta = params['label_smoothing']
                mixup = params['mixup']
                
            if mode != tf.estimator.ModeKeys.PREDICT: 
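                # Label smoothing: put 1 - eta + eta/1001 on the true class and eta/1001
                # on each of the other classes.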
                one_hot_smoothed_labels = tf.one_hot(labels, 1001, 
                                                     on_value = 1 - eta + eta/1001,
                                                     off_value = eta/1001)
                if mixup != 0:
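                    # mixup: convexly combine each image (and its smoothed label) with the
                    # batch reversed along axis 0, using per-example Beta(mixup, mixup) weights.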

                    print("Using mixup training with beta=", params['mixup'])
                    beta_distribution = tf.distributions.Beta(params['mixup'], params['mixup'])

                    feature_coefficients = beta_distribution.sample(sample_shape=[params['batch_size'], 1, 1, 1])      

                    reversed_feature_coefficients = tf.subtract(tf.ones(shape=feature_coefficients.shape), feature_coefficients)

                    rotated_features = tf.reverse(features, axis=[0])      

                    features = feature_coefficients * features + reversed_feature_coefficients * rotated_features

                    label_coefficients = tf.squeeze(feature_coefficients, axis=[2, 3])

                    rotated_labels = tf.reverse(one_hot_smoothed_labels, axis=[0])    

                    reversed_label_coefficients = tf.subtract(tf.ones(shape=label_coefficients.shape), label_coefficients)

                    one_hot_smoothed_labels = label_coefficients * one_hot_smoothed_labels + reversed_label_coefficients * rotated_labels
                
                
            # Update Global Step
            global_step = tf.train.get_or_create_global_step()
            tf.identity(global_step, name="global_step_ref")

            tf.identity(features, name="features_ref")
            
            if mode == tf.estimator.ModeKeys.TRAIN:
                tf.identity(labels, name="labels_ref")

            probs, logits = self.build_model(
                features,
                training=mode == tf.estimator.ModeKeys.TRAIN,
                reuse=False
            )

            y_preds = tf.argmax(logits, axis=1, output_type=tf.int32)

            # Check the output dtype, shall be FP32 in training
            assert (probs.dtype == tf.float32)
            assert (logits.dtype == tf.float32)
            assert (y_preds.dtype == tf.int32)

            tf.identity(logits, name="logits_ref")
            tf.identity(probs, name="probs_ref")
            tf.identity(y_preds, name="y_preds_ref")

            #if mode == tf.estimator.ModeKeys.TRAIN:
            #    
            #    assert (len(tf.trainable_variables()) == 161)
            #
            #else:
            #    
            #    assert (len(tf.trainable_variables()) == 0)


        if mode == tf.estimator.ModeKeys.PREDICT:

            predictions = {'classes': y_preds, 'probabilities': probs}

            return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                export_outputs={'predict': tf.estimator.export.PredictOutput(predictions)}
            )

        else:

            with tf.device("/gpu:0"):

                if mode == tf.estimator.ModeKeys.TRAIN:
                    acc_top1 = tf.nn.in_top_k(predictions=logits, targets=labels, k=1)
                    acc_top5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)

                else:
                    acc_top1, acc_top1_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=1))
                    acc_top5, acc_top5_update_op = tf.metrics.mean(tf.nn.in_top_k(predictions=logits, targets=labels, k=5))

                tf.identity(acc_top1, name="acc_top1_ref")
                tf.identity(acc_top5, name="acc_top5_ref")

                predictions = {
                    'classes': y_preds,
                    'probabilities': probs,
                    'accuracy_top1': acc_top1,
                    'accuracy_top5': acc_top5
                }
                
                cross_entropy = tf.losses.softmax_cross_entropy(
                    logits=logits, onehot_labels=one_hot_smoothed_labels)

                assert (cross_entropy.dtype == tf.float32)
                tf.identity(cross_entropy, name='cross_entropy_loss_ref')

                def loss_filter_fn(name):
                    """we don't need to compute L2 loss for BN and bias (eq. to add a cste)"""
                    return all([
                        tensor_name not in name.lower()
                        # for tensor_name in ["batchnorm", "batch_norm", "batch_normalization", "bias"]
                        for tensor_name in ["batchnorm", "batch_norm", "batch_normalization"]
                    ])

                filtered_params = [tf.cast(v, tf.float32) for v in tf.trainable_variables() if loss_filter_fn(v.name)]

                if len(filtered_params) != 0:

                    l2_loss_per_vars = [tf.nn.l2_loss(v) for v in filtered_params]
                    l2_loss = tf.multiply(tf.add_n(l2_loss_per_vars), params["weight_decay"])

                else:
                    l2_loss = tf.zeros(shape=(), dtype=tf.float32)

                assert (l2_loss.dtype == tf.float32)
                tf.identity(l2_loss, name='l2_loss_ref')

                total_loss = tf.add(cross_entropy, l2_loss, name="total_loss")

                assert (total_loss.dtype == tf.float32)
                tf.identity(total_loss, name='total_loss_ref')

                tf.summary.scalar('cross_entropy', cross_entropy)
                tf.summary.scalar('l2_loss', l2_loss)
                tf.summary.scalar('total_loss', total_loss)
                
                if mode == tf.estimator.ModeKeys.TRAIN:

                    with tf.device("/cpu:0"):

                        learning_rate = learning_rate_scheduler(
                            lr_init=params["lr_init"],
                            lr_warmup_epochs=params["lr_warmup_epochs"],
                            global_step=global_step,
                            batch_size=params["batch_size"],
                            num_batches_per_epoch=params["steps_per_epoch"],
                            num_decay_steps=params["num_decay_steps"],
                            num_gpus=params["num_gpus"],
                            use_cosine_lr=params["use_cosine_lr"]
                        )

                    tf.identity(learning_rate, name='learning_rate_ref')
                    tf.summary.scalar('learning_rate', learning_rate)

                    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params["momentum"])

                    if params["apply_loss_scaling"]:
                        optimizer = FixedLossScalerOptimizer(optimizer, scale=params["loss_scale"])

                    if hvd_utils.is_using_hvd():
                        optimizer = hvd.DistributedOptimizer(optimizer)

                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    if mode != tf.estimator.ModeKeys.TRAIN:
                        update_ops += [acc_top1_update_op, acc_top5_update_op]
                    
                    deterministic = True
                    gate_gradients = (tf.train.Optimizer.GATE_OP if deterministic else tf.train.Optimizer.GATE_NONE)

                    backprop_op = optimizer.minimize(total_loss, gate_gradients=gate_gradients, global_step=global_step)

                    
                    if self.model_hparams.use_dali:
                        train_ops = tf.group(backprop_op, update_ops, name='train_ops')
                    else:
                        train_ops = tf.group(backprop_op, cpu_prefetch_op, gpu_prefetch_op, update_ops, name='train_ops')

                    return tf.estimator.EstimatorSpec(mode=mode, loss=total_loss, train_op=train_ops)

                elif mode == tf.estimator.ModeKeys.EVAL:
                    eval_metrics = {
                        "top1_accuracy": (acc_top1, acc_top1_update_op),
                        "top5_accuracy": (acc_top5, acc_top5_update_op)
                    }

                    return tf.estimator.EstimatorSpec(
                        mode=mode,
                        predictions=predictions,
                        loss=total_loss,
                        eval_metric_ops=eval_metrics
                    )

                else:
                    raise NotImplementedError('Unknown mode {}'.format(mode))
Exemple #18
0
    def _build_optimizer(self, loss):
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr,
                                           beta1=self.beta1,
                                           beta2=self.beta2)
        return hvd.DistributedOptimizer(optimizer).minimize(
            loss, global_step=tf.train.get_or_create_global_step())
Exemple #19
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 l2_coef,
                 cliprange,
                 adam_epsilon=1e-6,
                 load_path=None,
                 test_mode=False):
        sess = tf.get_default_session()

        act_model = policy(sess,
                           ob_space,
                           ac_space,
                           nenv,
                           1,
                           test_mode=test_mode,
                           reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenv,
                             nsteps,
                             test_mode=test_mode,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nenv * nsteps],
                                                  name='action')
        ADV = tf.placeholder(tf.float32, [nenv * nsteps], name='advantage')
        VALID = tf.placeholder(tf.float32, [nenv * nsteps], name='valid')
        R = tf.placeholder(tf.float32, [nenv * nsteps], name='return')
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [nenv * nsteps],
                                      name='neglogprob')
        OLDVPRED = tf.placeholder(tf.float32, [nenv * nsteps],
                                  name='valuepred')
        LR = tf.placeholder(tf.float32, [], name='lr')

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(VALID * train_model.pd.entropy())
        vpred = train_model.vf
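        # PPO-style clipping below: value predictions are kept within +/- cliprange of
        # OLDVPRED, the probability ratio within [1 - cliprange, 1 + cliprange], and
        # every term is masked by VALID.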

        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED,
                                                   -cliprange, cliprange)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(
            VALID * tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - cliprange,
                                             1.0 + cliprange)
        pg_loss = tf.reduce_mean(VALID * tf.maximum(pg_losses, pg_losses2))
        mv = tf.reduce_mean(VALID)
        approxkl = .5 * tf.reduce_mean(
            VALID * tf.square(neglogpac - OLDNEGLOGPAC)) / mv
        clipfrac = tf.reduce_mean(VALID * tf.to_float(
            tf.greater(tf.abs(ratio - 1.0), cliprange))) / mv
        params = tf.trainable_variables()
        l2_loss = .5 * sum([tf.reduce_sum(tf.square(p)) for p in params])
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_coef * l2_loss

        opt = tf.train.AdamOptimizer(LR, epsilon=adam_epsilon)
        opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss)

        def train(lr,
                  obs,
                  returns,
                  advs,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  valids,
                  increase_ent,
                  states=None):
            td_map = {
                LR: lr,
                train_model.X: obs,
                A: actions,
                ADV: advs,
                VALID: valids,
                R: returns,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                train_model.E: increase_ent
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run([
                pg_loss, vf_loss, l2_loss, entropy, approxkl, clipfrac,
                train_op
            ],
                            feed_dict=td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'l2_loss', 'policy_entropy',
            'approxkl', 'clipfrac'
        ]

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        sess.run(tf.global_variables_initializer())
        if load_path and hvd.rank() == 0:
            self.load(load_path)
        sess.run(hvd.broadcast_global_variables(0))
        tf.get_default_graph().finalize()
Exemple #20
0
def train(*,
          flow_constructor,
          logdir,
          lr_schedule,
          dropout_p,
          seed,
          init_bs,
          total_bs,
          ema_decay,
          steps_per_log,
          epochs_per_val,
          max_grad_norm,
          dtype=tf.float32,
          scale_loss=None,
          restore_checkpoint=None,
          scale_grad=None,
          dataset='cifar10',
          steps_per_extra_samples=None):
    hvd, MPI, is_root, mpi_average = setup_horovod()

    # Seeding and logging setup
    seed_all(hvd.rank() + hvd.size() * seed)
    assert total_bs % hvd.size() == 0
    local_bs = total_bs // hvd.size()

    logger = None
    logdir = '{}_mpi{}_{}'.format(os.path.expanduser(logdir), hvd.size(),
                                  time.time())
    checkpointdir = os.path.join(logdir, 'checkpoints')
    if is_root:
        print('Floating point format:', dtype)
        pprint(locals())
        os.makedirs(logdir)
        os.makedirs(checkpointdir)
        logger = TensorBoardOutput(logdir)

    # Load data
    if is_root:
        # Load once on root first to prevent downloading conflicts
        print('Loading data')
        load_data(dataset=dataset, dtype=dtype.as_numpy_dtype)
    MPI.COMM_WORLD.Barrier()
    data_train, data_val = load_data(dataset=dataset,
                                     dtype=dtype.as_numpy_dtype)
    img_shp = list(data_train.shape[1:])
    if is_root:
        print('Training data: {}, Validation data: {}'.format(
            data_train.shape[0], data_val.shape[0]))
        print('Image shape:', img_shp)
    bpd_scale_factor = 1. / (np.log(2) * np.prod(img_shp))

    # Build graph
    if is_root: print('Building graph')
    dequant_flow, flow = flow_constructor()
    # Data-dependent init
    if is_root: print('===== Init graph =====')
    x_init_sym = tf.placeholder(dtype, [init_bs] + img_shp)
    _, _, init_loss_sym, _ = build_forward(x=x_init_sym,
                                           dequant_flow=dequant_flow,
                                           flow=flow,
                                           flow_kwargs=dict(
                                               vcfg=VarConfig(init=True,
                                                              ema=None,
                                                              dtype=dtype),
                                               dropout_p=dropout_p,
                                               verbose=is_root))
    # Training
    if is_root: print('===== Training graph =====')
    x_sym = tf.placeholder(dtype, [local_bs] + img_shp)
    _, y_sym, loss_sym, _ = build_forward(x=x_sym,
                                          dequant_flow=dequant_flow,
                                          flow=flow,
                                          flow_kwargs=dict(vcfg=VarConfig(
                                              init=False,
                                              ema=None,
                                              dtype=dtype),
                                                           dropout_p=dropout_p,
                                                           verbose=is_root))

    # EMA
    params = tf.trainable_variables()
    if is_root:
        # for p in params:
        #     print(p.name, p.shape)
        print('Parameters',
              sum(np.prod(p.get_shape().as_list()) for p in params))
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    maintain_averages_op = tf.group(ema.apply(params))
    # Op for setting the ema params to the current non-ema params (for use after data-dependent init)
    name2var = {v.name: v for v in tf.global_variables()}
    copy_params_to_ema = tf.group([
        name2var[p.name.replace(':0', '') +
                 '/ExponentialMovingAverage:0'].assign(p) for p in params
    ])

    # Validation and sampling (with EMA)
    if is_root: print('===== Validation graph =====')
    val_flow_kwargs = dict(vcfg=VarConfig(init=False, ema=ema, dtype=dtype),
                           dropout_p=0,
                           verbose=is_root)
    val_dequant_x_sym, val_y_sym, val_loss_sym, _ = build_forward(
        x=x_sym,
        dequant_flow=dequant_flow,
        flow=flow,
        flow_kwargs=val_flow_kwargs)
    # for debugging invertibility
    val_inverr_sym = tf.reduce_max(
        tf.abs(val_dequant_x_sym -
               flow.inverse(val_y_sym, **val_flow_kwargs)[0]))

    if is_root: print('===== Sampling graph =====')
    samples_sym, _ = flow.inverse(
        tf.random_normal(y_sym.shape.as_list(), dtype=dtype),
        **val_flow_kwargs)
    allgathered_samples_sym = hvd.allgather(tf.to_float(samples_sym))
    assert len(tf.trainable_variables()) == len(params)

    def run_validation(sess, i_step):
        data_val_shard = np.array_split(data_val, hvd.size(),
                                        axis=0)[hvd.rank()]
        shard_losses, shard_inverrs = zip(*[
            sess.run([val_loss_sym, val_inverr_sym], {x_sym: val_batch})
            for val_batch, in iterbatches([data_val_shard],
                                          batch_size=local_bs,
                                          include_final_partial_batch=False)
        ])
        val_loss, total_count = mpi_average(shard_losses)
        inv_err, _ = mpi_average(shard_inverrs)
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('val_bpd', bpd_scale_factor * val_loss),
                 ('val_inverr', inv_err),
                 ('num_val_examples', total_count * local_bs),
                 ('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    def run_sampling_only(sess, i_step):
        samples = sess.run(allgathered_samples_sym)
        if is_root:
            logger.writekvs(
                [('samples',
                  tile_imgs(np.clip(samples, 0, 255).astype(np.uint8)))],
                i_step)

    # Optimization
    lr_sym = tf.placeholder(dtype, [], 'lr')
    optimizer = hvd.DistributedOptimizer(tf.train.AdamOptimizer(lr_sym))
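    # Optional static loss scaling below: the loss is multiplied by scale_loss before
    # differentiation and the gradients are divided back, which can help keep very
    # small gradient values representable.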

    if scale_loss is None:
        grads_and_vars = optimizer.compute_gradients(loss_sym, var_list=params)
    else:
        grads_and_vars = [(g / scale_loss, v)
                          for (g, v) in optimizer.compute_gradients(
                              loss_sym * scale_loss, var_list=params)]

    if scale_grad is not None:
        grads_and_vars = [(g / scale_grad, v) for (g, v) in grads_and_vars]
    if max_grad_norm is not None:
        clipped_grads, grad_norm_sym = tf.clip_by_global_norm(
            [g for (g, _) in grads_and_vars], max_grad_norm)
        grads_and_vars = [
            (cg, v) for (cg, (_, v)) in zip(clipped_grads, grads_and_vars)
        ]
    else:
        grad_norm_sym = tf.constant(0.)
    opt_sym = tf.group(optimizer.apply_gradients(grads_and_vars),
                       maintain_averages_op)

    def loop(sess: tf.Session):
        i_step = 0

        if is_root: print('Initializing')
        sess.run(tf.global_variables_initializer())
        if restore_checkpoint is not None:
            # Restore from checkpoint
            if is_root:
                saver = tf.train.Saver()
                print('Restoring checkpoint:', restore_checkpoint)
                restore_step = int(restore_checkpoint.split('-')[-1])
                print('Restoring from step:', restore_step)
                saver.restore(sess, restore_checkpoint)
                i_step = restore_step
            else:
                saver = None
        else:
            # No checkpoint: perform data dependent initialization
            if is_root: print('Data dependent init')
            init_loss = sess.run(
                init_loss_sym, {
                    x_init_sym:
                    data_train[np.random.randint(0, data_train.shape[0],
                                                 init_bs)]
                })
            if is_root: print('Init loss:', init_loss * bpd_scale_factor)
            sess.run(copy_params_to_ema)
            saver = tf.train.Saver() if is_root else None
        if is_root: print('Broadcasting initial parameters')
        sess.run(hvd.broadcast_global_variables(0))
        sess.graph.finalize()

        if is_root:
            print('Training')

        loss_hist = deque(maxlen=steps_per_log)
        gnorm_hist = deque(maxlen=steps_per_log)
        for i_epoch in range(99999999999):
            if i_epoch % epochs_per_val == 0:
                run_validation(sess, i_step=i_step)
                if saver is not None:
                    saver.save(sess,
                               os.path.join(checkpointdir, 'model'),
                               global_step=i_step)

            epoch_start_t = time.time()
            for i_epoch_step, (batch, ) in enumerate(
                    iterbatches(  # non-sharded: each gpu goes through the whole dataset
                        [data_train],
                        batch_size=local_bs,
                        include_final_partial_batch=False,
                    )):

                if steps_per_extra_samples is not None and i_step % steps_per_extra_samples == 0:
                    run_sampling_only(sess, i_step)

                lr = lr_schedule(i_step)
                loss, gnorm, _ = sess.run([loss_sym, grad_norm_sym, opt_sym], {
                    x_sym: batch,
                    lr_sym: lr
                })
                loss_hist.append(loss)
                gnorm_hist.append(gnorm)

                # Skip timing the very first step, which will be unusually slow due to TF initialization
                if i_epoch == i_epoch_step == 0:
                    epoch_start_t = time.time()

                if i_step % steps_per_log == 0:
                    loss_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(loss_hist)),
                                                            root=0)
                    gnorm_hist_means = MPI.COMM_WORLD.gather(float(
                        np.mean(gnorm_hist)),
                                                             root=0)
                    steps_per_sec = (i_epoch_step + 1) / (time.time() -
                                                          epoch_start_t)
                    if is_root:
                        kvs = [
                            ('iter', i_step),
                            ('epoch', i_epoch + i_epoch_step * local_bs /
                             data_train.shape[0]),  # epoch for this gpu
                            ('bpd',
                             float(
                                 np.mean(loss_hist_means) * bpd_scale_factor)),
                            ('gnorm', float(np.mean(gnorm_hist_means))),
                            ('lr', float(lr)),
                            ('fps', steps_per_sec * total_bs
                             ),  # fps calculated over all gpus (this epoch)
                            ('sps', steps_per_sec),
                        ]
                        logger.writekvs(kvs, i_step)
                i_step += 1
            # End of epoch

    # Train
    config = tf.ConfigProto()
    # config.log_device_placement = True
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(
        hvd.local_rank())  # Pin GPU to local rank (one GPU per process)
    with tf.Session(config=config) as sess:
        loop(sess)
Exemple #21
0
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size. If used with NCCL,
    # scale lr by local_size
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(args.lr * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=args.num_steps // hvd.size()),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train,
                                                     y_train,
                                                     batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
Exemple #22
0
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    if params.get('img_summary_steps', None):
        utils.image('input_image', features)
    training_hooks = None
    if params['data_format'] == 'channels_first':
        features = tf.transpose(features, [0, 3, 1, 2])

    def _model_outputs(inputs):
        # Convert params (dict) to Config for easier access.
        return model(inputs, config=hparams_config.Config(params))

    cls_outputs, box_outputs = utils.build_model_with_precision(
        params['precision'], _model_outputs, features,
        params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float64)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float64)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if box_iou_loss:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()
    if params['strategy'] == 'horovod':
        import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
        learning_rate = learning_rate * hvd.size()
    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        elif params['strategy'] == 'horovod':
            optimizer = hvd.DistributedOptimizer(optimizer)
            training_hooks = [hvd.BroadcastGlobalVariablesHook(0)]

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', 0) > 0:
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                clipped_grads, gnorm = tf.clip_by_global_norm(
                    grads, params['clip_gradients_norm'])
                utils.scalar('gnorm', gnorm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
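                # Run the EMA update after the optimizer step so the shadow variables
                # track the freshly updated weights.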
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            batch_size = params['batch_size']
            if params['strategy'] == 'tpu':
                batch_size = params['batch_size'] * params['num_shards']
            eval_anchors = anchors.Anchors(params['min_level'],
                                           params['max_level'],
                                           params['num_scales'],
                                           params['aspect_ratios'],
                                           params['anchor_scale'],
                                           params['image_size'])
            anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                                   params['num_classes'])
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                coco_metrics = coco_metric_fn(
                    batch_size,
                    anchor_labeler,
                    params['val_json_file'],
                    testdev_dir=params['testdev_dir'],
                    disable_pyfun=params.get('disable_pyfun', None),
                    **kwargs)
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                              params['val_json_file'],
                                              **kwargs)

            # Add metrics to output.
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'source_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
        }
        add_metric_fn_inputs(params, cls_outputs, box_outputs,
                             metric_fn_inputs)
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(ckpt_path=checkpoint,
                                             ckpt_scope=ckpt_scope,
                                             var_scope=var_scope,
                                             var_exclude_expr=params.get(
                                                 'var_exclude_expr', None))

            tf.train.init_from_checkpoint(checkpoint, var_map)

            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=total_loss,
                                             train_op=train_op,
                                             eval_metrics=eval_metrics,
                                             host_call=utils.get_tpu_host_call(
                                                 global_step, params),
                                             scaffold_fn=scaffold_fn,
                                             training_hooks=training_hooks)
Exemple #23
0
def main(_):
    """
    Builds the model and runs.
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    tx.utils.maybe_create_dir(FLAGS.output_dir)

    # Loads data
    num_train_data = config_data.num_train_data

    # Configures distributed mode
    if FLAGS.distributed:
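        # Shard the training TFRecords across workers and shrink the per-worker batch
        # so the effective global batch size stays unchanged.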
        config_data.train_hparam["dataset"]["num_shards"] = hvd.size()
        config_data.train_hparam["dataset"]["shard_id"] = hvd.rank()
        config_data.train_hparam["batch_size"] //= hvd.size()

    train_dataset = tx.data.TFRecordData(hparams=config_data.train_hparam)
    eval_dataset = tx.data.TFRecordData(hparams=config_data.eval_hparam)
    test_dataset = tx.data.TFRecordData(hparams=config_data.test_hparam)

    iterator = tx.data.FeedableDataIterator({
        'train': train_dataset,
        'eval': eval_dataset,
        'test': test_dataset
    })
    batch = iterator.get_next()
    input_ids = batch["input_ids"]
    segment_ids = batch["segment_ids"]
    batch_size = tf.shape(input_ids)[0]
    input_length = tf.reduce_sum(1 - tf.cast(tf.equal(input_ids, 0), tf.int32),
                                 axis=1)
    # Builds BERT
    hparams = {'clas_strategy': 'cls_time'}
    model = tx.modules.BERTClassifier(
        pretrained_model_name=FLAGS.pretrained_model_name, hparams=hparams)
    logits, preds = model(input_ids, input_length, segment_ids)

    accu = tx.evals.accuracy(batch['label_ids'], preds)

    # Optimization
    loss = tf.losses.sparse_softmax_cross_entropy(labels=batch["label_ids"],
                                                  logits=logits)
    global_step = tf.Variable(0, trainable=False)

    # Builds learning rate decay scheduler
    static_lr = config_downstream.lr['static_lr']
    num_train_steps = int(num_train_data / config_data.train_batch_size *
                          config_data.max_train_epoch)
    num_warmup_steps = int(num_train_steps * config_data.warmup_proportion)
    lr = model_utils.get_lr(
        global_step,
        num_train_steps,  # lr is a Tensor
        num_warmup_steps,
        static_lr)

    opt = tx.core.get_optimizer(global_step=global_step,
                                learning_rate=lr,
                                hparams=config_downstream.opt)

    if FLAGS.distributed:
        opt = hvd.DistributedOptimizer(opt)

    train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                               global_step=global_step,
                                               learning_rate=None,
                                               optimizer=opt)

    # Train/eval/test routine

    def _is_head():
        if not FLAGS.distributed:
            return True
        return hvd.rank() == 0

    def _train_epoch(sess):
        """Trains on the training set, and evaluates on the dev set
        periodically.
        """
        iterator.restart_dataset(sess, 'train')

        fetches = {
            'train_op': train_op,
            'loss': loss,
            'batch_size': batch_size,
            'step': global_step
        }

        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'train'),
                    tx.global_mode(): tf.estimator.ModeKeys.TRAIN,
                }
                rets = sess.run(fetches, feed_dict)
                step = rets['step']

                dis_steps = config_data.display_steps
                if _is_head() and dis_steps > 0 and step % dis_steps == 0:
                    tf.logging.info('step:%d; loss:%f;' % (step, rets['loss']))

                eval_steps = config_data.eval_steps
                if _is_head() and eval_steps > 0 and step % eval_steps == 0:
                    _eval_epoch(sess)

            except tf.errors.OutOfRangeError:
                break

    def _eval_epoch(sess):
        """Evaluates on the dev set.
        """
        iterator.restart_dataset(sess, 'eval')

        cum_acc = 0.0
        cum_loss = 0.0
        nsamples = 0
        fetches = {
            'accu': accu,
            'loss': loss,
            'batch_size': batch_size,
        }
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'eval'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.EVAL,
                }
                rets = sess.run(fetches, feed_dict)

                cum_acc += rets['accu'] * rets['batch_size']
                cum_loss += rets['loss'] * rets['batch_size']
                nsamples += rets['batch_size']
            except tf.errors.OutOfRangeError:
                break

        tf.logging.info('eval accu: {}; loss: {}; nsamples: {}'.format(
            cum_acc / nsamples, cum_loss / nsamples, nsamples))

    def _test_epoch(sess):
        """Does predictions on the test set.
        """
        iterator.restart_dataset(sess, 'test')

        _all_preds = []
        while True:
            try:
                feed_dict = {
                    iterator.handle: iterator.get_handle(sess, 'test'),
                    tx.context.global_mode(): tf.estimator.ModeKeys.PREDICT,
                }
                _preds = sess.run(preds, feed_dict=feed_dict)
                _all_preds.extend(_preds.tolist())
            except tf.errors.OutOfRangeError:
                break

        output_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
        with tf.gfile.GFile(output_file, "w") as writer:
            writer.write('\n'.join(str(p) for p in _all_preds))

    # Broadcasts global variables from rank-0 process
    if FLAGS.distributed:
        bcast = hvd.broadcast_global_variables(0)

    session_config = tf.ConfigProto()
    if FLAGS.distributed:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    with tf.Session(config=session_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        if FLAGS.distributed:
            bcast.run()

        # Restores trained model if specified
        saver = tf.train.Saver()
        if FLAGS.checkpoint:
            saver.restore(sess, FLAGS.checkpoint)

        iterator.initialize_dataset(sess)

        if FLAGS.do_train:
            for i in range(config_data.max_train_epoch):
                _train_epoch(sess)
            saver.save(sess, FLAGS.output_dir + '/model.ckpt')

        if FLAGS.do_eval:
            _eval_epoch(sess)

        if FLAGS.do_test:
            _test_epoch(sess)
Exemple #24
0
def create_optimizer(loss,
                     init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     use_tpu,
                     use_hvd=False):
    """Creates an optimizer training op."""
    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)
    # if use_hvd:
    #   # May want to scale learning rate by number of GPUs
    #   learning_rate *= hvd.size()

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done

        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)
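        # Illustrative numbers (not from the original): with init_lr=5e-5 and
        # num_warmup_steps=1000, the effective rate at step 100 is
        # 100/1000 * 5e-5 = 5e-6; once global_step reaches 1000, is_warmup
        # drops to 0 and the polynomial decay above takes over.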

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if use_hvd:
        # [HVD] Wrap the original optimizer with Horovod's DistributedOptimizer,
        # which handles the under-the-hood allreduce calls. Note that Horovod
        # only performs synchronous parameter updates.
        optimizer = hvd.DistributedOptimizer(optimizer)

    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    if use_hvd:
        # [HVD] Use distributed optimizer to compute gradients
        grads_and_vars = optimizer.compute_gradients(loss, tvars)
        grads = [grad for grad, var in grads_and_vars]
        tvars = [var for grad, var in grads_and_vars]
    else:
        # Use standard TF gradients
        grads = tf.gradients(loss, tvars)

    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # Normally the global step update is done inside `apply_gradients`, but
    # `AdamWeightDecayOptimizer` does not do it, so it is updated manually here.
    # If you use a different optimizer, you should probably remove these lines.
    new_global_step = global_step + 1
    new_global_step = tf.identity(new_global_step, name='step_update')
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
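A minimal usage sketch for the helper above under Horovod; every name and number below is an illustrative assumption, not taken from the original example:

# hvd.init() is assumed to have been called and `loss` built elsewhere.
train_op = create_optimizer(loss,
                            init_lr=2e-5,
                            num_train_steps=10000,
                            num_warmup_steps=1000,
                            use_tpu=False,
                            use_hvd=True)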
def train_main(dataset,
               model_name='117M',
               seed=None,
               batch_size=2,
               sample_length=1023,
               sample_num=1,
               sample_every=4500,
               run_name='run1',
               restore_from='latest',
               save_every=2000,
               combine=50000):

    enc = encoder_sp.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if sample_length is None:
        sample_length = hparams.n_ctx // 2
    elif sample_length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: %s" % hparams.n_ctx)

    # TF config

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = model.model(hparams=hparams, X=context)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=context[:, 1:], logits=output['logits'][:, :-1]))

        tf_sample = sample.sample_sequence(
            hparams=hparams,
            length=sample_length,
            context=context,
            batch_size=batch_size,
            temperature=0.8,
            top_k=40)

        train_vars = [v for v in tf.trainable_variables() if 'model' in v.name]

        opt = tf.train.AdamOptimizer()
        opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, var_list=train_vars)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        bcast = hvd.broadcast_global_variables(0)

        saver = tf.train.Saver(
            var_list=train_vars,
            max_to_keep=5,
            keep_checkpoint_every_n_hours=2)

        sess.run(tf.global_variables_initializer())


        if restore_from == 'latest':
            ckpt = tf.train.latest_checkpoint(
                os.path.join(CHECKPOINT_DIR, run_name))
            if ckpt is None:
                # Get fresh GPT weights if new run.
                ckpt = tf.train.latest_checkpoint(
                    os.path.join('models', model_name))
        elif restore_from == 'fresh':
            ckpt = tf.train.latest_checkpoint(
                os.path.join('models', model_name))
        else:
            ckpt = tf.train.latest_checkpoint(restore_from)
        print(str(hvd.local_rank()), 'Loading checkpoint', ckpt)
        saver.restore(sess, ckpt)

        bcast.run()

        print(str(hvd.local_rank()), 'Loading dataset...')
        chunks = load_dataset(enc, dataset, combine)
        data_sampler = Sampler(chunks)
        print(str(hvd.local_rank()), 'dataset has', data_sampler.total_size, 'tokens')
        print(str(hvd.local_rank()), 'Training...')

        counter = 1
        if os.path.exists(os.path.join(CHECKPOINT_DIR, run_name, 'counter')):
            # Load the step number if we're resuming a run
            # Add 1 so we don't immediately try to save again
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'),
                      'r') as fp:
                counter = int(fp.read()) + 1

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, run_name,
                             'model-{}').format(counter))
            saver.save(
                sess,
                os.path.join(CHECKPOINT_DIR, run_name, 'model'),
                global_step=counter)
            with open(os.path.join(CHECKPOINT_DIR, run_name, 'counter'),
                      'w') as fp:
                fp.write(str(counter) + '\n')

        def generate_samples():
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < sample_num:
                out = sess.run(
                    tf_sample, feed_dict={context: batch_size*[context_tokens]})
                for i in range(min(sample_num - index, batch_size)):
                    text = enc.decode(out[i])
                    text = '======== SAMPLE {} ========\n{}\n'.format(index + 1, text)
                    all_text.append(text)
                    index += 1
            print(text)
            maketree(os.path.join(SAMPLE_DIR, run_name))
            with open(
                    os.path.join(SAMPLE_DIR, run_name,
                                 'samples-{}').format(counter), 'w') as fp:
                fp.write('\n'.join(all_text))

        avg_loss = (0.0, 0.0)
        start_time = time.time()

        try:
            while True:

                batch = [data_sampler.sample(1024) for _ in range(batch_size)]

                _, lv = sess.run((train_op, loss), feed_dict={context: batch})

                avg_loss = (avg_loss[0] * 0.99 + lv, avg_loss[1] * 0.99 + 1.0)

                if hvd.rank() == 0:
                    if counter % save_every == 0:
                        save()
                    if counter % sample_every == 0:
                        generate_samples()

                    print(
                        '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                        .format(
                            counter=counter,
                            time=time.time() - start_time,
                            loss=lv,
                            avg=avg_loss[0] / avg_loss[1]))

                counter += 1

        except KeyboardInterrupt:
            print('interrupted')
            if hvd.rank() == 0:
                save()
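train_main assumes hvd.init() has already been run at import time; a minimal entry-point sketch (script name and dataset path are hypothetical):

if __name__ == '__main__':
    hvd.init()  # must run before train_main uses hvd.rank()/hvd.local_rank()
    train_main(dataset='data/corpus.txt')  # hypothetical dataset path
    # Typical multi-GPU launch: horovodrun -np 4 python gpt2_train_hvd.py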
Exemple #26
0
def main():
    """Create the model and start the training."""
    hvd.init()
    args = get_arguments()
    args.snapshot_dir = args.snapshot_dir.replace(
        'DeepDICD/', 'DeepDICD/' + args.model_name + '-' + args.domain + '-')
    print(toMagenta(args.snapshot_dir))
    ss = args.domain.split('-')
    if ss[0] == 'D':
        args.list1 = args.list1.replace('amazon.txt', 'dslr.txt')
    elif ss[0] == 'W':
        args.list1 = args.list1.replace('amazon.txt', 'webcam.txt')

    if ss[1] == 'A':
        args.list2 = args.list2.replace('dslr.txt', 'amazon.txt')
    elif ss[1] == 'W':
        args.list2 = args.list2.replace('dslr.txt', 'webcam.txt')

    print(toMagenta(args.list1))
    print(toMagenta(args.list2))

    start_steps = args.start_steps

    h = args.h
    w = args.w

    # construct data generator
    file1 = open(args.list1)
    num1 = len(file1.readlines())
    file2 = open(args.list2)
    num2 = len(file2.readlines())
    file1.close()
    file2.close()

    steps_per_epoch = int(num1 / args.batch_size)
    num_steps = int(steps_per_epoch * args.num_epochs)
    val_num_steps = int(num2 / args.batch_size)

    print(toCyan('src domain: {:d}, tar domain {:d}'.format(num1, num2)))
    print(
        toCyan('steps_per_epoch x num_epochs:{:d} x {:d}'.format(
            steps_per_epoch, args.num_epochs)))

    # Chong
    # split_batch_size=int(args.batch_size/hvd.size())
    myDataloader = Dataloader(args.img_dir, args.list1, args.list2,
                              args.batch_size, args.h, args.w,
                              args.num_threads)

    src_img = myDataloader.simg_batch
    src_label = myDataloader.slabel_batch
    tar_img = myDataloader.timg_batch
    tar_label = myDataloader.tlabel_batch

    coord = tf.train.Coordinator()

    # Using Poly learning rate policy
    baseLR1 = tf.constant(args.lr1)
    baseLR2 = tf.constant(args.lr2)
    step_ph = tf.placeholder(dtype=tf.float32, shape=())
    # lr1 = tf.scalar_mul(baseLR1, tf.pow((1 - step_ph / num_steps), args.power))
    # lr2 = tf.scalar_mul(baseLR2, tf.pow((1 - step_ph / num_steps), args.power))
    lr1 = baseLR1 / tf.pow(1 + 0.001 * step_ph / steps_per_epoch, 0.75)
    lr2 = baseLR2 / tf.pow(1 + 0.001 * step_ph / steps_per_epoch, 0.75)

    # lr1=baseLR1
    # lr2=baseLR2
    # decay_steps=steps_per_epoch*10
    # lr1=tf.train.exponential_decay(baseLR1,step_ph,decay_steps,0.1,staircase=True)
    # lr2=tf.train.exponential_decay(baseLR2,step_ph,decay_steps,0.1,staircase=True)
    keep_prob = tf.placeholder(dtype=tf.float32, shape=())
    # loss_balance =1- tf.scalar_mul(1., tf.pow((1 - step_ph / num_steps), args.power))
    loss_balance = tf.constant(1.)
    # boundaries = [np.float32(np.int32((8/10) * num_steps)), np.float32(np.int((9/10) * num_steps))]
    # values = [0., 0.1, 0.2]
    # loss_balance = tf.train.piecewise_constant(step_ph, boundaries, values)

    model = DeepCoralModel(args, keep_prob, src_img, src_label, tar_img,
                           tar_label)
    model.build_losses(loss_balance)  # loss_balance
    model.build_outputs()
    summary_ = model.build_summary()
    loss = model.loss

    # Gets moving_mean and moving_variance update operations from tf.GraphKeys.UPDATE_OPS
    if args.no_update_mean_var:
        update_ops = None
    else:
        print(toMagenta('updating mean and var in batchnorm'))
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    all_trainable_var = [v for v in tf.global_variables()]
    fine_tune_var = [v for v in all_trainable_var if 'fc8' not in v.name]
    fine_tune_var_weights = [v for v in fine_tune_var if 'weights' in v.name]
    fine_tune_var_bias = [v for v in fine_tune_var if 'bias' in v.name]
    retrain_var = [v for v in all_trainable_var if 'fc8' in v.name]
    retrain_var_weights = [v for v in retrain_var if 'weights' in v.name]
    retrain_var_bias = [v for v in retrain_var if 'bias' in v.name]

    with tf.control_dependencies(update_ops):
        opt1_1 = tf.train.MomentumOptimizer(lr1 * hvd.size(), args.momentum)
        opt1_1 = hvd.DistributedOptimizer(opt1_1)
        grads1_1 = tf.gradients(loss, fine_tune_var_weights)
        train_op_1_1 = opt1_1.apply_gradients(
            zip(grads1_1, fine_tune_var_weights))

        opt1_2 = tf.train.MomentumOptimizer(2 * lr1 * hvd.size(),
                                            args.momentum)
        opt1_2 = hvd.DistributedOptimizer(opt1_2)
        grads1_2 = tf.gradients(loss, fine_tune_var_bias)
        train_op_1_2 = opt1_2.apply_gradients(zip(grads1_2,
                                                  fine_tune_var_bias))

        opt2_1 = tf.train.MomentumOptimizer(lr2 * hvd.size(), args.momentum)
        opt2_1 = hvd.DistributedOptimizer(opt2_1)
        grads2_1 = tf.gradients(loss, retrain_var_weights)
        train_op_2_1 = opt2_1.apply_gradients(
            zip(grads2_1, retrain_var_weights))

        opt2_2 = tf.train.MomentumOptimizer(2 * lr2 * hvd.size(),
                                            args.momentum)
        opt2_2 = hvd.DistributedOptimizer(opt2_2)
        grads2_2 = tf.gradients(loss, retrain_var_bias)
        train_op_2_2 = opt2_2.apply_gradients(zip(grads2_2, retrain_var_bias))

        train_op = tf.group(train_op_1_1, train_op_1_2, train_op_2_1,
                            train_op_2_2)
    # Set up tf session and initialize variables.
    #
    config = tf.ConfigProto()  #Chong
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    init_local = tf.local_variables_initializer()
    init = tf.global_variables_initializer()

    # construct summary
    summary_.append(tf.summary.scalar('train/lr1', lr1))
    summary_.append(tf.summary.scalar('train/lr2', lr2))
    summary_.append(tf.summary.scalar('train/loss_balance', loss_balance))

    summary_merged = tf.summary.merge(summary_)
    if hvd.rank() == 0:
        FinalSummary = tf.summary.FileWriter(args.snapshot_dir, sess.graph)

    # init
    sess.run([init_local, init])
    bcast = hvd.broadcast_global_variables(0)
    sess.run(bcast)

    # Saver for storing checkpoints of the model.
    var = tf.global_variables()
    skip_var = ['fc8']
    saver = tf.train.Saver(var_list=var, max_to_keep=5)

    ckpt = tf.train.get_checkpoint_state(args.snapshot_dir)
    if ckpt and ckpt.model_checkpoint_path and args.resume:
        loader = tf.train.Saver(var_list=var)
        load_step = int(
            os.path.basename(ckpt.model_checkpoint_path).split('-')[1])
        load(loader, sess, ckpt.model_checkpoint_path)
    elif not args.not_load_pretrained:
        print(toRed('Restore from pre-trained model...' + args.restore_from))
        model.load_initial_weights(sess, args.restore_from,
                                   skip_var)  #Chong:0531

    # Start queue threads.
    threads = tf.train.start_queue_runners(coord=coord, sess=sess)

    # Iterate over training steps.
    acc2_history = 0
    for step in range(start_steps, num_steps):
        start_time = time.time()
        feed_dict = {step_ph: step, keep_prob: 0.5}
        summary, total_loss, _ = sess.run([summary_merged, loss, train_op],
                                          feed_dict=feed_dict)
        if hvd.rank() == 0:
            FinalSummary.add_summary(summary, step)
            duration = time.time() - start_time
            remain_time = duration * (num_steps - step) / 3600
            print(
                '\r',
                toCyan(
                    '{:s}:{:d}-{:d}-{:d} total loss = {:.3f},({:.3f} sec/step, ERT: {:.3f})'
                    .format(args.model_name + '-' + args.domain,
                            step % steps_per_epoch, step // steps_per_epoch,
                            args.num_epochs, total_loss, duration,
                            remain_time)),
                end='')

            if step % args.test_every == 0:
                acc1, acc2 = 0, 0
                for jj in range(val_num_steps):
                    feed_dict = {keep_prob: 1}
                    src_acc, tar_acc = sess.run([model.src_acc, model.tar_acc],
                                                feed_dict=feed_dict)
                    acc1 += np.sum(src_acc)
                    acc2 += np.sum(tar_acc)

                acc1 = acc1 / (val_num_steps * args.batch_size)
                acc2 = acc2 / (val_num_steps * args.batch_size)
                # pdb.set_trace()
                test_summary = tf.Summary()
                test_summary.value.add(tag='test/source_accuracy',
                                       simple_value=acc1)
                test_summary.value.add(tag='test/target_accuracy',
                                       simple_value=acc2)
                FinalSummary.add_summary(test_summary, step)

                if acc2 > acc2_history:
                    save(saver, sess, args.snapshot_dir, step)
                    acc2_history = acc2

    coord.request_stop()
    coord.join(threads)
    sess.close()
Exemple #27
0
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

opt = tf.train.GradientDescentOptimizer(0.01)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(opt, compression=compression)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)

data = tf.random_uniform([args.batch_size, 224, 224, 3])
target = tf.random_uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)


def loss_function():
    logits = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, logits)


def log(s, nl=True):
    # Only rank 0 prints (the original snippet is truncated here).
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')
Exemple #28
0
def main(_):
    start_time = datetime.now()
    tf.logging.info("Starting at: {}".format(start_time))
    tf.logging.info("Batch size: {} images per step".format(FLAGS.batch_size))

    if not FLAGS.no_horovod:
        # Initialize Horovod.
        hvd.init()

    # Download MNIST dataset.
    mnist = input_data.read_data_sets(FLAGS.data_path, one_hot=False)

    # Input tensors
    with tf.name_scope("input"):
        image = tf.placeholder(tf.float32, [None, 784], name="image")
        label = tf.placeholder(tf.float32, [None], name="label")

    # Define model
    predict, loss, accuracy = get_model(image, label)

    if not FLAGS.no_horovod:
        # Horovod: adjust learning rate based on the number of workers
        opt = tf.train.RMSPropOptimizer(FLAGS.learning_rate * hvd.size())
    else:
        opt = tf.train.RMSPropOptimizer(FLAGS.learning_rate)

    # Wrap optimizer with Horovod Distributed Optimizer.
    if not FLAGS.no_horovod:
        opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    if not FLAGS.no_horovod:
        last_step = FLAGS.total_steps // hvd.size()
    else:
        last_step = FLAGS.total_steps

    def formatter_log(tensors):
        if FLAGS.no_horovod:
            logstring = "Step {} of {}: " \
                        " training loss = {:.4f}," \
                        " training accuracy = {:.4f}".\
                        format(tensors["step"], last_step,
                               tensors["loss"], tensors["accuracy"])
        else:
            logstring = "HOROVOD (Worker #{}), Step {} of {}: " \
                        " training loss = {:.4f}," \
                        " training accuracy = {:.4f}".\
                format(hvd.rank(),
                       tensors["step"],
                       last_step,
                       tensors["loss"],
                       tensors["accuracy"])

        return logstring

    hooks = [
        tf.train.StopAtStepHook(last_step=last_step),

        # Prints the loss and step every log_steps steps
        tf.train.LoggingTensorHook(tensors={
            "step": global_step,
            "loss": loss,
            "accuracy": accuracy
        },
                                   every_n_iter=FLAGS.log_steps,
                                   formatter=formatter_log),
    ]

    # Horovod: BroadcastGlobalVariablesHook broadcasts
    # initial variable states from rank 0 to all other
    # processes. This is necessary to ensure consistent
    # initialization of all workers when training is
    # started with random weights
    # or restored from a checkpoint.
    if not FLAGS.no_horovod:
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

        # Horovod: save checkpoints only on
        # worker 0 to prevent other workers from
        # corrupting them.
        if hvd.rank() == 0:
            checkpoint_dir = "{}/{}-workers/{}".\
                format(FLAGS.output_path,
                       hvd.size(),
                       datetime.now().strftime("%Y%m%d-%H%M%S"))
        else:
            checkpoint_dir = None

    else:
        checkpoint_dir = "{}/no_hvd/{}".\
            format(FLAGS.output_path,
                   datetime.now().strftime("%Y%m%d-%H%M%S"))

    # `config` is used below but never defined in this snippet; a typical Horovod
    # setup (an assumption, not part of the original) pins one GPU per process:
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if not FLAGS.no_horovod:
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint,
    # and closing when done or an error occurs.
    with tf.train.\
            MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                     hooks=hooks,
                                     save_summaries_steps=FLAGS.log_steps,
                                     log_step_count_steps=FLAGS.log_steps,
                                     config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = mnist.train.next_batch(FLAGS.batch_size)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    stop_time = datetime.now()
    tf.logging.info("Stopping at: {}".format(stop_time))
    tf.logging.info("Elapsed time was: {}".format(stop_time - start_time))
Exemple #29
0
def main(FLAGS):
    if FLAGS.hvd:
        hvd.init()
        if hvd.local_rank() == 0:
            tf.logging.set_verbosity(tf.logging.INFO)
            log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
            os.makedirs(FLAGS.results_dir, exist_ok=True)
            dllogger.init(backends=[
                dllogger.JSONStreamBackend(
                    verbosity=dllogger.Verbosity.VERBOSE, filename=log_path),
                dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
            ])
        else:
            tf.logging.set_verbosity(tf.logging.ERROR)
            dllogger.init(backends=[])
        num_gpus = hvd.size()
    else:
        tf.logging.set_verbosity(tf.logging.INFO)
        log_path = os.path.join(FLAGS.results_dir, FLAGS.log_filename)
        os.makedirs(FLAGS.results_dir, exist_ok=True)
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=log_path),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE)
        ])
        num_gpus = 1

    dllogger.log(data=vars(FLAGS), step='PARAMETER')

    create_batches = FLAGS.batch_size // FLAGS.prebatch_size

    wide_columns, deep_columns = get_feature_columns(
        use_all_columns=FLAGS.use_all_columns)
    tf_transform_output = tft.TFTransformOutput(
        FLAGS.transformed_metadata_path)

    if not FLAGS.hvd or hvd.local_rank() == 0:
        tf.compat.v1.logging.warn('command line arguments: {}'.format(
            json.dumps(vars(FLAGS))))
        if not os.path.exists(FLAGS.results_dir):
            os.mkdir(FLAGS.results_dir)

        with open('{}/args.json'.format(FLAGS.results_dir), 'w') as f:
            json.dump(vars(FLAGS), f, indent=4)

    if FLAGS.gpu:
        session_config = tf.compat.v1.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
    else:
        session_config = tf.compat.v1.ConfigProto(
            device_count={'GPU': 0},
            log_device_placement=FLAGS.log_device_placement)

    if FLAGS.hvd:
        session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    if FLAGS.xla:
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    if FLAGS.benchmark:
        model_dir = None
    else:
        model_dir = FLAGS.model_dir

    if FLAGS.save_checkpoints_steps != 0:
        run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(
            session_config=session_config,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps,
            keep_checkpoint_max=1)
    else:
        run_config = tf.estimator.RunConfig(model_dir=model_dir).replace(
            session_config=session_config,
            save_checkpoints_secs=FLAGS.save_checkpoints_secs,
            keep_checkpoint_max=1)

    wide_optimizer = tf.compat.v1.train.FtrlOptimizer(
        learning_rate=FLAGS.linear_learning_rate,
        l1_regularization_strength=FLAGS.linear_l1_regularization,
        l2_regularization_strength=FLAGS.linear_l2_regularization)

    deep_optimizer = tf.compat.v1.train.ProximalAdagradOptimizer(
        learning_rate=FLAGS.deep_learning_rate,
        initial_accumulator_value=0.1,
        l1_regularization_strength=FLAGS.deep_l1_regularization,
        l2_regularization_strength=FLAGS.deep_l2_regularization,
        use_locking=False)

    if FLAGS.hvd:
        wide_optimizer = hvd.DistributedOptimizer(wide_optimizer)
        deep_optimizer = hvd.DistributedOptimizer(deep_optimizer)

    stats_filename = os.path.join(FLAGS.transformed_metadata_path,
                                  'stats.json')
    embed_columns = None

    # input functions to read data from disk
    train_input_fn = lambda: separate_input_fn(
        tf_transform_output,
        FLAGS.train_data_pattern,
        create_batches,
        tf.estimator.ModeKeys.TRAIN,
        reader_num_threads=FLAGS.reader_num_threads,
        parser_num_threads=FLAGS.parser_num_threads,
        shuffle_buffer_size=int(FLAGS.shuffle_percentage * create_batches),
        prefetch_buffer_size=FLAGS.prefetch_buffer_size,
        print_display_ids=FLAGS.print_display_ids)
    eval_input_fn = lambda: separate_input_fn(
        tf_transform_output,
        FLAGS.eval_data_pattern,
        (FLAGS.eval_batch_size // FLAGS.prebatch_size),
        tf.estimator.ModeKeys.EVAL,
        reader_num_threads=1,
        parser_num_threads=1,
        shuffle_buffer_size=int(FLAGS.shuffle_percentage * create_batches),
        prefetch_buffer_size=FLAGS.prefetch_buffer_size,
        print_display_ids=FLAGS.print_display_ids)

    estimator = construct_estimator(FLAGS.model_type,
                                    not FLAGS.canned_estimator,
                                    run_config,
                                    wide_columns,
                                    wide_optimizer,
                                    deep_columns,
                                    FLAGS.deep_hidden_units,
                                    FLAGS.deep_dropout,
                                    deep_optimizer,
                                    amp=FLAGS.amp)

    estimator = tf.estimator.add_metrics(estimator, map_custom_metric)
    estimator = tf.estimator.add_metrics(estimator,
                                         map_custom_metric_with_leak)

    steps_per_epoch = FLAGS.training_set_size / FLAGS.batch_size

    print('Steps per epoch: {}'.format(steps_per_epoch))
    max_steps = int(FLAGS.num_epochs * steps_per_epoch)

    hooks = []
    if FLAGS.hvd:
        hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if FLAGS.predict or FLAGS.evaluate:  # inference
        if FLAGS.benchmark:
            benchmark_hook = BenchmarkLoggingHook(
                global_batch_size=num_gpus * FLAGS.eval_batch_size,
                warmup_steps=FLAGS.benchmark_warmup_steps)
            hooks.append(benchmark_hook)
            eval_steps = FLAGS.benchmark_steps
        else:
            eval_steps = FLAGS.eval_steps

        predict_result_iter = estimator.predict(input_fn=eval_input_fn,
                                                hooks=hooks,
                                                yield_single_examples=False)

        results = []
        for i, r in enumerate(predict_result_iter):
            print('predicting batch: ', i)
            results.append(r)
            # TODO: use eval_steps
            if i >= eval_steps - 1:
                break

        if FLAGS.benchmark:
            infer_throughput = benchmark_hook.mean_throughput.value()
            dllogger.log(data={'infer_throughput': infer_throughput},
                         step=tuple())
        elif FLAGS.evaluate:
            print(
                'evaluating using estimator.evaluate with eval_batch_size = ',
                FLAGS.eval_batch_size, ' and eval_steps = ', FLAGS.eval_steps)

            result = estimator.evaluate(eval_input_fn,
                                        hooks=hooks,
                                        steps=FLAGS.eval_steps)
            dllogger.log(step=(),
                         data={
                             'map_infer': float(result['map']),
                             'map_with_leak_infer':
                             float(result['map_with_leak'])
                         })
        elif FLAGS.predict:
            scores = [r['probabilities'][:, 1] for r in results]
            scores = np.hstack(scores)
            scores_path = os.path.join(FLAGS.model_dir, 'scores.txt')
            print('saving the numpy scores array to: ', scores_path)
            np.savetxt(scores_path, scores, fmt="%f", delimiter='\n')

    else:  # training

        if FLAGS.benchmark:
            benchmark_hook = BenchmarkLoggingHook(
                global_batch_size=num_gpus * FLAGS.batch_size,
                warmup_steps=FLAGS.benchmark_warmup_steps)
            hooks.append(benchmark_hook)
            estimator.train(train_input_fn,
                            hooks=hooks,
                            steps=FLAGS.benchmark_steps)
            train_throughput = benchmark_hook.mean_throughput.value()
            dllogger.log(data={'train_throughput': train_throughput},
                         step=tuple())
        else:
            train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                                max_steps=max_steps,
                                                hooks=hooks)
            eval_spec = tf.estimator.EvalSpec(
                input_fn=eval_input_fn,
                throttle_secs=FLAGS.eval_throttle_secs,
                steps=FLAGS.eval_steps)
            result = tf.estimator.train_and_evaluate(estimator, train_spec,
                                                     eval_spec)

            if result:
                dllogger.log(step=(),
                             data={
                                 'map': float(result[0]['map']),
                                 'map_with_leak':
                                 float(result[0]['map_with_leak'])
                             })
Exemple #30
0
    def _set_train_or_infer(self, res, reverse_target_vocab_table, hparams):
        """Set up training and inference."""
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.train_loss = res[1]
            self.word_count = tf.reduce_sum(
                self.iterator.source_sequence_length) + tf.reduce_sum(
                    self.iterator.target_sequence_length)
        elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
            self.eval_loss = res[1]
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_logits, _, self.final_context_state, self.sample_id = res
            self.sample_words = reverse_target_vocab_table.lookup(
                tf.to_int64(self.sample_id))

        if self.mode != tf.contrib.learn.ModeKeys.INFER:
            ## Count the number of predicted words for computing ppl.
            self.predict_count = tf.reduce_sum(
                self.iterator.target_sequence_length)

        params = tf.trainable_variables()

        # Gradients and SGD update operation for training the model.
        # Arrange for the embedding vars to appear at the beginning.
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            self.learning_rate = tf.constant(hparams.learning_rate)
            # warm-up
            self.learning_rate = self._get_learning_rate_warmup(hparams)
            # decay
            self.learning_rate = self._get_learning_rate_decay(hparams)

            # Optimizer
            if hparams.optimizer == "sgd":
                opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            elif hparams.optimizer == "adam":
                opt = tf.train.AdamOptimizer(self.learning_rate)
            else:
                raise ValueError("Unknown optimizer type %s" %
                                 hparams.optimizer)

            # Add Horovod Distributed Optimizer
            opt = hvd.DistributedOptimizer(opt)

            # Gradients
            #gradients = tf.gradients(
            #    self.train_loss,
            #    params,
            #    colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)

            # Horovod compute_gradients
            # Allreduce the gradients before returning them
            gradients, variables = zip(*opt.compute_gradients(
                self.train_loss,
                params,
                colocate_gradients_with_ops=hparams.colocate_gradients_with_ops))

            clipped_grads, grad_norm_summary, grad_norm = model_helper.gradient_clip(
                gradients, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm_summary = grad_norm_summary
            self.grad_norm = grad_norm

            self.update = opt.apply_gradients(zip(clipped_grads, params),
                                              global_step=self.global_step)

            # Summary
            self.train_summary = self._get_train_summary()
        elif self.mode == tf.contrib.learn.ModeKeys.INFER:
            self.infer_summary = self._get_infer_summary(hparams)

        # Print trainable variables
        utils.print_out("# Trainable variables")
        utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
        for param in params:
            utils.print_out(
                "  %s, %s, %s" %
                (param.name, str(param.get_shape()), param.op.device))
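Condensed, the Horovod-specific part of the training branch above looks as follows: compute_gradients on the DistributedOptimizer returns already-averaged gradients, so clipping and apply_gradients operate on the synchronized values. A standalone sketch with assumed names (loss, params, max_norm, global_step):

opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer(1e-3))
grads_and_vars = opt.compute_gradients(loss, params)   # allreduce happens here
grads, variables = zip(*grads_and_vars)
clipped_grads, _ = tf.clip_by_global_norm(grads, clip_norm=max_norm)
update = opt.apply_gradients(zip(clipped_grads, variables),
                             global_step=global_step)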