def run_eval():
    """Evaluate on test or validation."""
    with tf.Graph().as_default():
        # Input images and labels.
        features = get_features(False, 5)
        model = f_model.multi_gpu_model
        result = model(features)
        merged = result['summary']
        correct_prediction_sum = result['correct']
        almost_correct_sum = result['almost']
        saver = tf.train.Saver()
        test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test')
        seen_step = -1
        time.sleep(3 * 60)
        paused = 0
        while paused < 360:
            ckpt = tf.train.get_checkpoint_state(FLAGS.summary_dir + '/train/')
            if ckpt and ckpt.model_checkpoint_path:
                # Restore from the checkpoint.
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
            else:
                time.sleep(2 * 60)
                paused += 2
                continue
            while seen_step == int(global_step):
                time.sleep(2 * 60)
                ckpt = tf.train.get_checkpoint_state(FLAGS.summary_dir +
                                                     '/train/')
                global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                    '-')[-1]
                paused += 2
                if paused > 360:
                    test_writer.close()
                    return
            paused = 0

            seen_step = int(global_step)
            print(seen_step)
            sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            saver.restore(sess, ckpt.model_checkpoint_path)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                total_tp = 0
                total_almost = 0
                for i in range(FLAGS.eval_size // 5):
                    summary_j, tp, almost = sess.run(
                        [merged, correct_prediction_sum, almost_correct_sum])
                    total_tp += tp
                    total_almost += almost

                total_false = FLAGS.eval_size - total_tp
                total_almost_false = FLAGS.eval_size - total_almost
                summary_tp = tf.Summary.FromString(summary_j)
                summary_tp.value.add(tag='correct_prediction',
                                     simple_value=total_tp)
                summary_tp.value.add(tag='wrong_prediction',
                                     simple_value=total_false)
                summary_tp.value.add(tag='almost_wrong_prediction',
                                     simple_value=total_almost_false)
                test_writer.add_summary(summary_tp, global_step)
                print('write done')
            except tf.errors.OutOfRangeError:
                print('Done eval for %d steps.' % i)
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)
            sess.close()
        test_writer.close()
Example #2
def _build_session(self):
    sess_config = tf.ConfigProto()
    if self.use_xla:
        sess_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_2)
    return tf.Session(config=sess_config)
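
# A minimal usage sketch for a session builder like the one above (illustrative
# only): the standalone build_session helper, the use_xla flag, and the toy
# matmul workload are assumptions, not part of the original class.
import tensorflow.compat.v1 as tf  # provides the TF 1.x API used in these examples


def build_session(use_xla=False):
    sess_config = tf.ConfigProto()
    if use_xla:
        # ON_2 enables XLA's global JIT compilation for the whole graph.
        sess_config.graph_options.optimizer_options.global_jit_level = (
            tf.OptimizerOptions.ON_2)
    return tf.Session(config=sess_config)


with tf.Graph().as_default():
    x = tf.random_normal([128, 128])
    y = tf.matmul(x, x)
    with build_session(use_xla=True) as sess:
        print(sess.run(y).shape)  # (128, 128)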
Example #3
import os
import pickle as pkl
import random
from os import listdir
from os.path import isfile, join

import numpy as np
import tensorflow as tf

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
random.seed(1111)
np.random.seed(1111)
tf.set_random_seed(1111)

train_batch_size = 128
test_batch_size = 128
predict_batch_size = 1
predict_users_num = 100
predict_ads_num = 99

info = pkl.load(open('ali_test_info_4days.pkl', 'rb'))

tf.reset_default_graph()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
model = Model(info[0], info[1], info[2], info[3], predict_batch_size,
              predict_ads_num)
sess.run(tf.global_variables_initializer())
model.restore_(sess, './save_path_alibaba_new/ckpt')

knn_key = pkl.load(open('knn_table/ali_knn_key.pkl', 'rb'))

mypath = './test_data'
files = listdir(mypath)
csv_list = []
for f in files:
    fullpath = join(mypath, f)
    if isfile(fullpath):
        csv_list.append(fullpath)
Example #4
import facenet
import detect_face
import os
import time
import pickle
import sys

import tensorflow as tf

img_path = 'abc.jpg'
modeldir = './model/20170511-185253.pb'
classifier_filename = './class/classifier.pkl'
npy = './npy'
train_img = "./train_img"

with tf.Graph().as_default():
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            log_device_placement=False))
    with sess.as_default():
        pnet, rnet, onet = detect_face.create_mtcnn(sess, npy)

        minsize = 20  # minimum size of face
        threshold = [0.6, 0.7, 0.7]  # thresholds for the three detection stages
        factor = 0.709  # scale factor
        margin = 44
        frame_interval = 3
        batch_size = 1000
        image_size = 182
        input_image_size = 160

        HumanNames = os.listdir(train_img)
        HumanNames.sort()
Example #5
def main(_):

    # If using update_damping_immediately, resource variables must be enabled.
    # We recommend always enabling them anyway.
    if FLAGS.update_damping_immediately:
        tf.enable_resource_variables()

    if FLAGS.use_control_flow_v2:
        tf.enable_control_flow_v2()

    if not FLAGS.auto_register_layers and FLAGS.use_keras_model:
        raise ValueError('Require auto_register_layers=True when using Keras '
                         'model.')

    tf.set_random_seed(FLAGS.seed)
    (train_op, opt, batch_loss, batch_error, batch_size_schedule,
     batch_size) = construct_train_quants()

    global_step = tf.train.get_or_create_global_step()

    if FLAGS.optimizer == 'kfac':
        # We need to put the control dependency on train_op here so that we are
        # guaranteed to get the up-to-date values of these various quantities.
        # Otherwise there is a race condition and we might get the old values,
        # nondeterministically. Another solution would be to get these values in
        # a separate sess.run call, but this can sometimes cause problems with
        # training frameworks that use hooks (see the comments below).
        with tf.control_dependencies([train_op]):
            learning_rate = opt.learning_rate
            momentum = opt.momentum
            damping = opt.damping
            rho = opt.rho
            qmodel_change = opt.qmodel_change

    # Without setting allow_soft_placement=True there will be problems when
    # the optimizer tries to place certain ops like "mod" on the GPU (which isn't
    # supported).
    config = tf.ConfigProto(allow_soft_placement=True)

    # It's good practice to put everything into a single sess.run call. The
    # reason is that certain "training frameworks" like to run hooks at each
    # sess.run call, and there is an implicit expectation that there will be
    # only one sess.run call per "iteration" of the "optimizer". For example,
    # a framework might try to print the loss at each sess.run call, causing
    # the mini-batch to be advanced and thus completely breaking the "cached
    # batch" mechanism that the damping adaptation method may rely on. (Plus
    # there would also be the extra cost of having to reevaluate the loss
    # twice.) That being said, we don't completely do that here because it's
    # inconvenient.

    # Train model.
    with tf.train.MonitoredTrainingSession(save_checkpoint_secs=30,
                                           config=config) as sess:
        for _ in range(FLAGS.train_steps):
            i = sess.run(global_step)

            if FLAGS.use_batch_size_schedule:
                batch_size_ = batch_size_schedule[min(
                    i,
                    len(batch_size_schedule) - 1)]
            else:
                batch_size_ = FLAGS.batch_size

            if FLAGS.optimizer == 'kfac':
                (_, batch_loss_, batch_error_, learning_rate_, momentum_,
                 damping_, rho_, qmodel_change_) = sess.run(
                     [
                         train_op, batch_loss, batch_error, learning_rate,
                         momentum, damping, rho, qmodel_change
                     ],
                     feed_dict={batch_size: batch_size_})
            else:
                _, batch_loss_, batch_error_ = sess.run(
                    [train_op, batch_loss, batch_error],
                    feed_dict={batch_size: batch_size_})

            # Print training stats.
            tf.logging.info('iteration: %d', i)
            tf.logging.info(
                'mini-batch size: %d | mini-batch loss = %f | mini-batch error = %f ',
                batch_size_, batch_loss_, batch_error_)

            if FLAGS.optimizer == 'kfac':
                tf.logging.info('learning_rate = %f | momentum = %f',
                                learning_rate_, momentum_)
                tf.logging.info('damping = %f | rho = %f | qmodel_change = %f',
                                damping_, rho_, qmodel_change_)

            tf.logging.info('----')
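
# A minimal sketch of the "fetch everything in one sess.run" pattern described
# in the comments above. The toy variable and loss are assumptions; in the real
# code the quantities read under the control dependency are the optimizer's
# learning_rate, damping, etc.
import tensorflow.compat.v1 as tf

with tf.Graph().as_default():
    x = tf.Variable(5.0)
    loss = tf.square(x)
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
    with tf.control_dependencies([train_op]):
        # Read x only after the update has run, so the fetched value is fresh.
        x_after_step = x.read_value()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # A single call advances the step and returns the post-update value.
        _, x_ = sess.run([train_op, x_after_step])
        print(x_)  # 4.0 == 5.0 - 0.1 * d(x**2)/dx at x=5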
Example #6
def main(_):

      
    mesh_shape = [("row", 2), ("col", 2)]
    layout_rules = [("nx_lr", "row"), ("ny_lr", "col"),
                    ("nx", "row"), ("ny", "col"),
                    ("ty_lr", "row"), ("tz_lr", "col"),
                    ("nx_block","row"), ("ny_block","col")]

                    
    mesh_hosts = ["localhost:%d"%(8222+j) for j in range(4)]
    
    # Create a cluster from the mesh hosts.                                                                                                                                         
    cluster = tf.train.ClusterSpec({"mesh": mesh_hosts, "master":["localhost:8488"]})

    # Create a server for local mesh members.
    server = tf.train.Server(cluster,
                             job_name="master",
                             task_index=0)

    mesh_devices = ['/job:mesh/task:%d' % i
                    for i in range(cluster.num_tasks("mesh"))]
    print("List of devices", mesh_devices)
    mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
        mesh_shape, layout_rules, mesh_devices)


    # Build the model
    
    # Create computational graphs and some initializations

    graph = mtf.Graph()
    mesh = mtf.Mesh(graph, "nbody_mesh")

    # Compute a few things first, using simple tensorflow
    a0 = FLAGS.a0
    a = FLAGS.af
    nsteps = FLAGS.nsteps
    bs, nc = FLAGS.box_size, FLAGS.nc
    klin = np.loadtxt('../flowpm/data/Planck15_a1p00.txt').T[0]
    plin = np.loadtxt('../flowpm/data/Planck15_a1p00.txt').T[1]
    ipklin = iuspline(klin, plin)
    stages = np.linspace(a0, a, nsteps, endpoint=True)

    #pt = PerturbationGrowth(cosmology, a=[a], a_normalize=1.0)
    # Generate a batch of 3D initial conditions
    initial_conditions = flowpm.linear_field(FLAGS.nc,          # size of the cube
                                             FLAGS.box_size,         # Physical size of the cube
                                             ipklin,      # Initial power spectrum
                                             batch_size=FLAGS.batch_size)

    state = lpt_init(initial_conditions, a0=a0, order=1)
    final_state = state  # nbody(state, stages, nc)
    tfinal_field = cic_paint(tf.zeros_like(initial_conditions), final_state[0])

    # Compute necessary Fourier kernels
    kvec = flowpm.kernels.fftk((nc, nc, nc), symmetric=False)
    from flowpm.kernels import laplace_kernel, gradient_kernel
    lap = tf.cast(laplace_kernel(kvec), tf.complex64)
    grad_x = gradient_kernel(kvec, 0)
    grad_y = gradient_kernel(kvec, 1)
    grad_z = gradient_kernel(kvec, 2)
    derivs = [lap, grad_x, grad_y, grad_z]
    
    mesh_final_field = lpt_prototype(mesh, initial_conditions, derivs,
                                     bs=FLAGS.box_size, nc=FLAGS.nc,
                                     batch_size=FLAGS.batch_size)
    # Lower mesh computation
    lowering = mtf.Lowering(graph, {mesh:mesh_impl})

    # Retrieve output of computation
    result = lowering.export_to_tf_tensor(mesh_final_field)

    with tf.Session(server.target, config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)) as sess:
        a, b, c = sess.run([initial_conditions, tfinal_field, result])
    np.save('init', a)
    np.save('reference_final', b)
    np.save('mesh_pyramid', c)
    
    
    plt.figure(figsize=(15,3))
    plt.subplot(141)
    plt.imshow(a[0].sum(axis=2))
    plt.title('Initial Conditions')

    plt.subplot(142)
    plt.imshow(b[0].sum(axis=2))
    plt.title('TensorFlow (single GPU)')
    plt.colorbar()

    plt.subplot(143)
    plt.imshow(c[0].sum(axis=2))
    plt.title('Mesh TensorFlow')
    plt.colorbar()

    plt.subplot(144)
    plt.imshow((b[0] - c[0]).sum(axis=2))
    plt.title('Residuals')
    plt.colorbar()

    plt.savefig("comparison.png")

    exit(0)
Example #7
def train(flags):
    """Model training."""

    flags.training = True

    # Set the verbosity based on flags (default is INFO, so we see all messages)
    logging.set_verbosity(flags.verbosity)

    # Start a new TensorFlow session.
    tf.reset_default_graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    tf.keras.backend.set_session(sess)

    audio_processor = input_data.AudioProcessor(flags)

    time_shift_samples = int((flags.time_shift_ms * flags.sample_rate) / 1000)

    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, flags.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, flags.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))
    logging.info(flags)
    model = models.MODELS[flags.model_name](flags)
    logging.info(model.summary())

    # save model summary
    utils.save_model_summary(model, flags.train_dir)

    # save model and data flags
    with open(os.path.join(flags.train_dir, 'flags.txt'), 'wt') as f:
        pprint.pprint(flags, stream=f)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam(epsilon=flags.optimizer_epsilon)

    if flags.optimizer == 'adam':
        optimizer = tf.keras.optimizers.Adam(epsilon=flags.optimizer_epsilon)
    elif flags.optimizer == 'momentum':
        optimizer = tf.keras.optimizers.SGD(momentum=0.9)
    elif flags.optimizer == 'novograd':
        optimizer = tfa.optimizers.NovoGrad(
            lr=0.05,
            beta_1=flags.novograd_beta_1,
            beta_2=flags.novograd_beta_2,
            weight_decay=flags.novograd_weight_decay,
            grad_averaging=bool(flags.novograd_grad_averaging))
    else:
        raise ValueError('Unsupported optimizer:%s' % flags.optimizer)

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    train_writer = tf.summary.FileWriter(flags.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(flags.summaries_dir +
                                              '/validation')

    sess.run(tf.global_variables_initializer())

    start_step = 1

    logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, flags.train_dir, 'graph.pbtxt')

    # Save list of words.
    with tf.io.gfile.GFile(os.path.join(flags.train_dir, 'labels.txt'),
                           'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    best_accuracy = 0.0

    # prepare parameters for exp learning rate decay
    training_steps_max = np.sum(training_steps_list)
    lr_init = learning_rates_list[0]
    exp_rate = -np.log(learning_rates_list[-1] / lr_init) / training_steps_max

    # Training loop.
    for training_step in range(start_step, training_steps_max + 1):
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            flags.batch_size, 0, flags, flags.background_frequency,
            flags.background_volume, time_shift_samples, 'training',
            flags.resample, flags.volume_resample, sess)

        if flags.lr_schedule == 'exp':
            learning_rate_value = lr_init * np.exp(-exp_rate * training_step)
        elif flags.lr_schedule == 'linear':
            # Figure out what the current learning rate is.
            training_steps_sum = 0
            for i in range(len(training_steps_list)):
                training_steps_sum += training_steps_list[i]
                if training_step <= training_steps_sum:
                    learning_rate_value = learning_rates_list[i]
                    break
        else:
            raise ValueError('Wrong lr_schedule: %s' % flags.lr_schedule)

        tf.keras.backend.set_value(model.optimizer.lr, learning_rate_value)
        result = model.train_on_batch(train_fingerprints, train_ground_truth)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag='accuracy', simple_value=result[1]),
        ])
        train_writer.add_summary(summary, training_step)

        logging.info(
            'Step #%d: rate %f, accuracy %.2f%%, cross entropy %f',
            *(training_step, learning_rate_value, result[1] * 100, result[0]))

        is_last_step = (training_step == training_steps_max)
        if (training_step % flags.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            set_size = int(set_size / flags.batch_size) * flags.batch_size
            total_accuracy = 0.0
            count = 0.0
            for i in range(0, set_size, flags.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(flags.batch_size, i, flags, 0.0,
                                             0.0, 0, 'validation', 0.0, 0.0,
                                             sess))

                # Run a validation step and capture training summaries for TensorBoard
                # with the `merged` op.
                result = model.test_on_batch(validation_fingerprints,
                                             validation_ground_truth)

                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='accuracy', simple_value=result[1]),
                ])

                validation_writer.add_summary(summary, training_step)

                total_accuracy += result[1]
                count = count + 1.0

            total_accuracy = total_accuracy / count
            logging.info('Step %d: Validation accuracy = %.2f%% (N=%d)',
                         *(training_step, total_accuracy * 100, set_size))

            model.save_weights(flags.train_dir + 'train/' +
                               str(int(best_accuracy * 10000)) + 'weights_' +
                               str(training_step))

            # Save the model checkpoint when validation accuracy improves
            if total_accuracy >= best_accuracy:
                best_accuracy = total_accuracy
                # overwrite the best model weights
                model.save_weights(flags.train_dir + 'best_weights')
            logging.info('So far the best validation accuracy is %.2f%%',
                         (best_accuracy * 100))

    tf.keras.backend.set_learning_phase(0)
    set_size = audio_processor.set_size('testing')
    set_size = int(set_size / flags.batch_size) * flags.batch_size
    logging.info('set_size=%d', set_size)
    total_accuracy = 0.0
    count = 0.0

    for i in range(0, set_size, flags.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            flags.batch_size, i, flags, 0.0, 0.0, 0, 'testing', 0.0, 0.0, sess)

        result = model.test_on_batch(test_fingerprints, test_ground_truth)

        total_accuracy += result[1]
        count = count + 1.0
    total_accuracy = total_accuracy / count

    logging.info('Final test accuracy = %.2f%% (N=%d)',
                 *(total_accuracy * 100, set_size))
    with open(os.path.join(flags.train_dir, 'accuracy_last.txt'), 'wt') as fd:
        fd.write(str(total_accuracy * 100))
    model.save_weights(flags.train_dir + 'last_weights')
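
# A small numeric check (illustrative numbers) of the exponential learning-rate
# decay used above: with exp_rate chosen this way the schedule starts at the
# first listed rate and reaches the last listed rate at training_steps_max.
import numpy as np

learning_rates_list = [0.001, 0.0001]  # assumed example values
training_steps_max = 13000             # e.g. 10000 + 3000 steps
lr_init = learning_rates_list[0]
exp_rate = -np.log(learning_rates_list[-1] / lr_init) / training_steps_max

for training_step in (1, training_steps_max // 2, training_steps_max):
    lr = lr_init * np.exp(-exp_rate * training_step)
    print(training_step, lr)  # ~0.001 at the start, 0.0001 at the final step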
Example #8
def main(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #print ("args: ", args)
    exps = pd.read_csv('exp.csv')
    for i, row in exps.iterrows():
        gc.collect()
        args['expname'] = row['name']
        args['sessionid'] = row['sessionid']
        args['itemid'] = row['itemid']
        args['data_folder'] = row['path']
        args['valid_data'] = row['test']
        args['train_data'] = row['train']
        args['freq'] = row['freq']

        print('Train:', args['train_data'], ' -- Test:', args['valid_data'],
              ' -- Freq:', args['freq'])
        with open("LOGGER_" + args['expname'] + ".txt", "a") as myfile:
            myfile.write(row['train'] + ", " + row['test'] + "\n")

        # split patterns to train_patterns and test_patterns
        print('Start Data Preprocessing: Training Set')
        train, itemsIDs, freqs, old_new = load_sequence(
            args['data_folder'] + '/' + args['train_data'],
            args['itemid'],
            args['sessionid'],
            itemsIDs=[])
        args['n_items'] = len(itemsIDs) + 1
        print('Start Data Preprocessing: Testing Set')
        valid, _, _, _ = load_sequence(args['data_folder'] + '/' +
                                       args['valid_data'],
                                       args['itemid'],
                                       args['sessionid'],
                                       Train=False,
                                       itemsIDs=itemsIDs,
                                       freq=args['freq'],
                                       old_new=old_new)

        #train, valid, test = data_process.load_data()
        print("%d train examples." % len(train[0]))
        print("%d valid examples." % len(valid[0]))
        keep_probability = np.array(args['keep_probability'])
        no_dropout = np.array(args['no_dropout'])
        result_path = "./save/" + args['dataset']
        # Build model
        tf.reset_default_graph()
        with tf.Session(config=config) as sess:
            model = CSRM(
                sess=sess,
                n_items=args['n_items'],
                dim_proj=int(args['dim_proj']),
                hidden_units=int(args['hidden_units']),
                memory_size=args['memory_size'],
                memory_dim=args['memory_dim'],
                shift_range=args['shift_range'],
                lr=args['lr'],
                controller_layer_numbers=args['controller_layer_numbers'],
                batch_size=args['batch_size'],
                epoch=args['epoch'],
                keep_probability=keep_probability,
                no_dropout=no_dropout,
                display_frequency=args['display_frequency'],
                item_freqs=freqs,
                expname=args['expname'])
            hit, MRR, cov, pop, train_time, test_time = model.train(
                train, valid, valid, result_path)

        print("#########################################################")
        print("NEW_LOGGER_ " + args['expname'])
        print(
            str(hit[0]) + ',' + str(hit[1]) + ',' + str(hit[2]) + ',' +
            str(hit[3]) + ',' + str(hit[4]) + ',' + str(MRR[0]) + ',' +
            str(MRR[1]) + ',' + str(MRR[2]) + ',' + str(MRR[3]) + ',' +
            str(MRR[4]))
        print("\nCOV:" + str(cov[0]) + ',' + str(cov[1]) + ',' + str(cov[2]) +
              ',' + str(cov[3]) + ',' + str(cov[4]))
        print("\nPOP:" + str(pop[0]) + ',' + str(pop[1]) + ',' + str(pop[2]) +
              ',' + str(pop[3]) + ',' + str(pop[4]))
        print("\nTrainTime:" + str(train_time))
        print("\nTestTime:" + str(test_time))

        with open("NEW_LOGGER_" + args['expname'] + ".txt", "a") as myfile:
            myfile.write(
                str(hit[0]) + ',' + str(hit[1]) + ',' + str(hit[2]) + ',' +
                str(hit[3]) + ',' + str(hit[4]) + ',' + str(MRR[0]) + ',' +
                str(MRR[1]) + ',' + str(MRR[2]) + ',' + str(MRR[3]) + ',' +
                str(MRR[4]))
            myfile.write("\nCOV:" + str(cov[0]) + ',' + str(cov[1]) + ',' +
                         str(cov[2]) + ',' + str(cov[3]) + ',' + str(cov[4]))
            myfile.write("\nPOP:" + str(pop[0]) + ',' + str(pop[1]) + ',' +
                         str(pop[2]) + ',' + str(pop[3]) + ',' + str(pop[4]))
            myfile.write("\nTrainTime:" + str(train_time))
            myfile.write("\nTestTime:" + str(test_time))
            myfile.write("\n############################################\n")
Example #9
    def __init__(self,
                 model_fn,
                 params,
                 tpu_cluster_resolver=None,
                 keep_checkpoint_max=5):
        self._model_dir = params.model_dir
        self._params = params
        self._tpu_job_name = params.tpu_job_name
        self._evaluator = None
        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._keep_checkpoint_max = keep_checkpoint_max

        input_partition_dims = None
        num_cores_per_replica = None

        if params.use_tpu or self._tpu_cluster_resolver:
            if not self._tpu_cluster_resolver:
                self._tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                    params.platform.tpu,
                    zone=params.platform.tpu_zone,
                    project=params.platform.gcp_project)
            tpu_grpc_url = self._tpu_cluster_resolver.get_master()
            tf.Session.reset(tpu_grpc_url)

            # If the input image is transposed (from NHWC to HWCN), the partition
            # dimensions also need to be transposed the same way.
            def _maybe_transpose(input_partition_dims):
                if input_partition_dims and params.train.transpose_input:
                    return [input_partition_dims[i] for i in [1, 2, 3, 0]]
                else:
                    return input_partition_dims

            if params.train.input_partition_dims is not None:
                num_cores_per_replica = params.train.num_cores_per_replica
                input_partition_dims = params.train.input_partition_dims
                # Parse 'None' into None.
                input_partition_dims = [
                    None if x == 'None' else _maybe_transpose(x)
                    for x in input_partition_dims
                ]

            # Sets up config for TPUEstimator.
            tpu_config = tf.estimator.tpu.TPUConfig(
                params.train.iterations_per_loop,
                num_cores_per_replica=num_cores_per_replica,
                input_partition_dims=input_partition_dims,
                tpu_job_name=self._tpu_job_name,
                per_host_input_for_training=tf.estimator.tpu.
                InputPipelineConfig.PER_HOST_V2  # pylint: disable=line-too-long
            )

            run_config = tf.estimator.tpu.RunConfig(
                session_config=tf.ConfigProto(
                    isolate_session_state=params.isolate_session_state),
                cluster=self._tpu_cluster_resolver,
                evaluation_master=params.platform.eval_master,
                model_dir=params.model_dir,
                log_step_count_steps=params.train.iterations_per_loop,
                tpu_config=tpu_config,
                keep_checkpoint_max=self._keep_checkpoint_max,
            )
            self._estimator = tf.estimator.tpu.TPUEstimator(
                model_fn=model_fn,
                use_tpu=params.use_tpu,
                train_batch_size=params.train.train_batch_size,
                eval_batch_size=params.eval.eval_batch_size,
                predict_batch_size=params.predict.predict_batch_size,
                config=run_config,
                params=params.as_dict())
        else:
            model_params = params.as_dict()

            # Uses `train_batch_size` as the `batch_size` for GPU train.
            model_params.update({'batch_size': params.train.train_batch_size})

            gpu_devices = tf.config.experimental.list_physical_devices('GPU')
            tf.logging.info('gpu devices: %s', gpu_devices)
            devices = [
                'device:GPU:{}'.format(i) for i in range(len(gpu_devices))
            ]
            strategy = tf.distribute.MirroredStrategy(devices=devices)
            tf.logging.info('Number of devices: %s',
                            strategy.num_replicas_in_sync)
            run_config = tf.estimator.RunConfig(train_distribute=strategy,
                                                model_dir=params.model_dir)
            self._estimator = tf.estimator.Estimator(model_fn=model_fn,
                                                     config=run_config,
                                                     params=model_params)
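
# A small illustration (made-up dims) of the _maybe_transpose helper above:
# when images are transposed from NHWC to HWCN, the per-core input partition
# dims must be permuted the same way.
nhwc_partition_dims = [1, 2, 2, 1]  # [N, H, W, C]
hwcn_partition_dims = [nhwc_partition_dims[i] for i in [1, 2, 3, 0]]
print(hwcn_partition_dims)  # [2, 2, 1, 1], i.e. [H, W, C, N]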
Example #10
def main(args):
    # Build nominal dataset
    classes = cfg.ml_classes + [
        n + '_ss' for n in cfg.ml_classes if n not in ['ggh', 'qqh']
    ] + ['data_ss']
    x, y, w = build_dataset(os.path.join(args.workdir,
                                         'fold{}.root'.format(args.fold)),
                            classes,
                            args.fold,
                            use_class_weights=False,
                            make_categorical=False)
    x_train, x_val, y_train, y_val, w_train, w_val = train_test_split(
        x, y, w, test_size=0.25, random_state=1234)
    logger.info(
        'Number of train/val events in nominal dataset: {} / {}'.format(
            x_train.shape[0], x_val.shape[0]))

    # Scale to expectation in the full dataset
    scale_train = 4.0 / 3.0 * 2.0  # train/test split + two fold
    scale_val = 4.0 * 2.0
    w_train = w_train * scale_train
    w_val = w_val * scale_val
    for i, name in enumerate(classes):
        s_train = np.sum(w_train[y_train == i])
        s_val = np.sum(w_val[y_val == i])
        logger.debug('Class / train / val: {} / {} / {}'.format(
            name, s_train, s_val))

    # Build dataset for systematic shifts
    """
    x_sys, y_sys, w_sys = build_dataset(os.path.join(args.workdir, 'fold{}.root'.format(args.fold)),
            ['htt', 'htt_jecUncRelativeSampleYearUp', 'htt_jecUncRelativeSampleYearDown'], args.fold,
            make_categorical=False, use_class_weights=True)
    x_sys_train, x_sys_val, w_sys_train, w_sys_val = train_test_split(x_sys, w_sys, test_size=0.25, random_state=1234)
    logger.info('Number of train/val events in varied datasets: {} / {}'.format(x_sys_train.shape[0], x_sys_val.shape[0]))
    logger.debug('Sum of weights for nominal/up/down: {} / {} / {}'.format(
        np.sum(w_sys[y_sys == 0]), np.sum(w_sys[y_sys == 1]), np.sum(w_sys[y_sys == 2])))
    """

    # Preprocessing
    preproc = StandardScaler()
    preproc.fit(x_train)
    pickle.dump(
        preproc,
        open(
            os.path.join(args.workdir,
                         'preproc_fold{}.pickle'.format(args.fold)), 'wb'))
    x_train_preproc = preproc.transform(x_train)
    x_val_preproc = preproc.transform(x_val)
    for i, (var, mean, std) in enumerate(
            zip(cfg.ml_variables, preproc.mean_, preproc.scale_)):
        logger.info('Variable: %s', var)
        logger.info('Preprocessing parameter (mean, std): %s, %s', mean, std)
        logger.info('Preprocessed data (mean, std): %s, %s',
                    np.mean(x_train_preproc[:, i]),
                    np.std(x_train_preproc[:, i]))

    # Create model
    x_ph = tf.placeholder(tf.float64, shape=(None, len(cfg.ml_variables)))
    logits, f, w_vars = model(x_ph, len(cfg.ml_variables), 1, args.fold)

    # Build NLL loss
    y_ph = tf.placeholder(tf.float64, shape=(None, ))
    w_ph = tf.placeholder(tf.float64, shape=(None, ))

    nll = 0.0
    bins = np.array(cfg.analysis_binning)
    mu = tf.constant(1.0, tf.float64)
    nuisances = {}
    epsilon = tf.constant(1e-9, tf.float64)
    for i, (up, down) in enumerate(zip(bins[1:], bins[:-1])):
        logger.debug('Add NLL for bin {} with boundaries [{}, {}]'.format(
            i, down, up))
        up = tf.constant(up, tf.float64)
        down = tf.constant(down, tf.float64)

        # Processes
        mask = count_masking(f, up, down)
        procs = {}
        for j, name in enumerate(classes):
            proc_w = mask * tf.cast(tf.equal(y_ph, tf.constant(j, tf.float64)),
                                    tf.float64) * w_ph
            procs[name] = tf.reduce_sum(proc_w)

        # QCD estimation
        procs['qcd'] = procs['data_ss']
        for p in [n for n in cfg.ml_classes if n not in ['ggh', 'qqh']]:
            procs['qcd'] -= procs[p + '_ss']
        procs['qcd'] = tf.maximum(procs['qcd'], 0)

        # Nominal signal and background
        sig = 0
        for p in ['ggh', 'qqh']:
            sig += procs[p]

        bkg = 0
        for p in ['ztt', 'zl', 'w', 'tt', 'vv', 'qcd']:
            bkg += procs[p]

        # Normalization uncertainties
        sys = 0.0
        for n in nuisances:
            pass

        # Expectations
        obs = sig + bkg
        exp = mu * sig + bkg + sys

        # Likelihood
        nll -= tfp.distributions.Poisson(tf.maximum(exp, epsilon)).log_prob(
            tf.maximum(obs, epsilon))

    # Nuisance constraints
    for n in nuisances:
        nll -= tfp.distributions.Normal(
            loc=tf.constant(0.0, dtype=tf.float64),
            scale=tf.constant(1.0, dtype=tf.float64)).log_prob(nuisances[n])

    # Compute constraint of mu
    def get_constraint(nll, params):
        hessian = [
            tf.gradients(g, params)
            for g in tf.unstack(tf.gradients(nll, params))
        ]
        inverse = tf.matrix_inverse(hessian)
        covariance_poi = inverse[0][0]
        constraint = tf.sqrt(covariance_poi)
        return constraint

    loss_fullnll = get_constraint(nll,
                                  [mu] + [nuisances[n] for n in nuisances])
    loss_statsonly = get_constraint(nll, [mu])

    # Add minimization ops
    def get_minimize_op(loss):
        optimizer = tf.train.AdamOptimizer()
        return optimizer.minimize(loss, var_list=w_vars)

    minimize_fullnll = get_minimize_op(loss_fullnll)
    minimize_statsonly = get_minimize_op(loss_statsonly)

    # Train
    config = tf.ConfigProto(intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    session = tf.Session(config=config)
    session.run([tf.global_variables_initializer()])
    saver = tf.train.Saver(max_to_keep=1)

    patience = 10
    patience_count = patience
    min_loss = 1e9
    tolerance = 0.001
    step = 0
    validation_steps = 20
    warmup_steps = 100
    while True:
        if step < warmup_steps:
            loss = loss_statsonly
            minimize = minimize_statsonly
            is_warmup = True
        else:
            loss = loss_fullnll
            minimize = minimize_fullnll
            is_warmup = False

        loss_train, _ = session.run([loss, minimize],
                                    feed_dict={
                                        x_ph: x_train_preproc,
                                        y_ph: y_train,
                                        w_ph: w_train
                                    })

        if step % validation_steps == 0:
            logger.info('Step / patience: {} / {}'.format(
                step, patience_count))
            logger.info('Train loss: {:.5f}'.format(loss_train))
            loss_val = session.run(loss,
                                   feed_dict={
                                       x_ph: x_val_preproc,
                                       y_ph: y_val,
                                       w_ph: w_val
                                   })
            logger.info('Validation loss: {:.5f}'.format(loss_val))

            if is_warmup:
                logger.info('Warmup: {} / {}'.format(step, warmup_steps))
            else:
                if min_loss > loss_val and np.abs(
                        min_loss - loss_val) / min_loss > tolerance:
                    min_loss = loss_val
                    patience_count = patience
                    path = saver.save(session,
                                      os.path.join(
                                          args.workdir,
                                          'model_fold{}/model.ckpt'.format(
                                              args.fold)),
                                      global_step=step)
                    logger.info('Save model to {}'.format(path))
                else:
                    patience_count -= 1

                if patience_count == 0:
                    logger.info('Stop training')
                    break

        step += 1
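
# A toy check (assumed Gaussian model, illustrative numbers) of the
# Hessian-based uncertainty computed by get_constraint above: for a Gaussian
# likelihood with known sigma and n observations, the constraint on mu should
# come out as sigma / sqrt(n).
import numpy as np
import tensorflow.compat.v1 as tf


def get_constraint(nll, params):
    hessian = [
        tf.gradients(g, params)
        for g in tf.unstack(tf.gradients(nll, params))
    ]
    covariance_poi = tf.matrix_inverse(hessian)[0][0]
    return tf.sqrt(covariance_poi)


with tf.Graph().as_default():
    sigma, n = 2.0, 100.0
    mu = tf.constant(1.0, tf.float64)
    # Negative log-likelihood of n Gaussian observations with mean mu.
    nll = 0.5 * n * tf.square(mu - 1.0) / sigma**2
    constraint = get_constraint(nll, [mu])
    with tf.Session() as sess:
        print(sess.run(constraint))  # ~0.2
        print(sigma / np.sqrt(n))    # 0.2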
Example #11
def main(unused_argv):
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')

    if FLAGS.gpu_device is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(FLAGS.gpu_device)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        session_config=session_config,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs)

    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=config,
        pipeline_config_path=FLAGS.pipeline_config_path,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples))
    estimator = train_and_eval_dict['estimator']
    train_input_fn = train_and_eval_dict['train_input_fn']
    eval_input_fns = train_and_eval_dict['eval_input_fns']
    eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
    predict_input_fn = train_and_eval_dict['predict_input_fn']
    train_steps = train_and_eval_dict['train_steps']

    if FLAGS.checkpoint_dir:
        if FLAGS.eval_training_data:
            name = 'training_data'
            input_fn = eval_on_train_input_fn
        else:
            name = 'validation_data'
            # The first eval input will be evaluated.
            input_fn = eval_input_fns[0]
        if FLAGS.run_once:
            estimator.evaluate(input_fn,
                               steps=None,
                               checkpoint_path=tf.train.latest_checkpoint(
                                   FLAGS.checkpoint_dir))
        else:
            model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir,
                                      input_fn, train_steps, name,
                                      FLAGS.max_eval_retries)
    else:
        train_spec, eval_specs = model_lib.create_train_and_eval_specs(
            train_input_fn,
            eval_input_fns,
            eval_on_train_input_fn,
            predict_input_fn,
            train_steps,
            eval_on_train_data=False)

        # Multiple Eval Specs allowed.
        # TODO: Fix name of saving_listeners
        saving_listeners = [
            EvalCheckpointSaverListener(estimator, eval_specs[0].input_fn,
                                        'validation')
        ]
        if len(eval_specs) > 1:
            saving_listeners.append(
                EvalCheckpointSaverListener(estimator, eval_specs[1].input_fn,
                                            'training'))

        estimator.train(input_fn=train_spec.input_fn,
                        max_steps=train_spec.max_steps,
                        saving_listeners=saving_listeners)
Example #12
    def benchmark_model(self,
                        warmup_runs,
                        bm_runs,
                        num_threads,
                        trace_filename=None):
        """Benchmark model."""
        if self.tensorrt:
            print('Using tensorrt ', self.tensorrt)
            self.build_and_save_model()
            graphdef = self.freeze_model()

        if num_threads > 0:
            print('num_threads for benchmarking: {}'.format(num_threads))
            sess_config = tf.ConfigProto(
                intra_op_parallelism_threads=num_threads,
                inter_op_parallelism_threads=1)
        else:
            sess_config = tf.ConfigProto()

        # 2 == rewriter_config_pb2.RewriterConfig.OFF: turn off dependency optimization.
        sess_config.graph_options.rewrite_options.dependency_optimization = 2
        if self.use_xla:
            sess_config.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_2)

        with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
            inputs = tf.placeholder(tf.float32,
                                    name='input',
                                    shape=self.inputs_shape)
            output = self.build_model(inputs, is_training=False)

            img = np.random.uniform(size=self.inputs_shape)

            sess.run(tf.global_variables_initializer())
            if self.tensorrt:
                fetches = [inputs.name] + [i.name for i in output]
                goutput = self.convert_tr(graphdef, fetches)
                inputs, output = goutput[0], goutput[1:]

            if not self.use_xla:
                # Don't use tf.group because XLA removes the whole graph for tf.group.
                output = tf.group(*output)

            output_name = [output.name]
            input_name = inputs.name
            graphdef = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_name)

        with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
            tf.import_graph_def(graphdef, name='')

            for i in range(warmup_runs):
                start_time = time.time()
                sess.run(output_name, feed_dict={input_name: img})
                print('Warm up: {} {:.4f}s'.format(i,
                                                   time.time() - start_time))

            print('Start benchmark runs total={}'.format(bm_runs))
            start = time.perf_counter()
            for i in range(bm_runs):
                sess.run(output_name, feed_dict={input_name: img})
            end = time.perf_counter()
            inference_time = (end - start) / bm_runs
            print('Per batch inference time: ', inference_time)
            print('FPS: ', self.batch_size / inference_time)

            if trace_filename:
                run_options = tf.RunOptions()
                run_options.trace_level = tf.RunOptions.FULL_TRACE
                run_metadata = tf.RunMetadata()
                sess.run(output_name,
                         feed_dict={input_name: img},
                         options=run_options,
                         run_metadata=run_metadata)
                logging.info('Dumping trace to %s', trace_filename)
                trace_dir = os.path.dirname(trace_filename)
                if not tf.io.gfile.exists(trace_dir):
                    tf.io.gfile.makedirs(trace_dir)
                with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
                    from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
                    trace = timeline.Timeline(
                        step_stats=run_metadata.step_stats)
                    trace_file.write(
                        trace.generate_chrome_trace_format(show_memory=True))
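
# A minimal timing sketch (toy graph, assumed run counts) of the benchmarking
# loop above: average the wall-clock time of repeated sess.run calls after a
# few warm-up iterations.
import time

import numpy as np
import tensorflow.compat.v1 as tf

warmup_runs, bm_runs = 3, 10
with tf.Graph().as_default(), tf.Session() as sess:
    inputs = tf.placeholder(tf.float32, shape=(1, 256, 256, 3), name='input')
    output = tf.reduce_mean(tf.nn.relu(inputs))
    img = np.random.uniform(size=(1, 256, 256, 3)).astype(np.float32)

    for _ in range(warmup_runs):
        sess.run(output, feed_dict={inputs: img})

    start = time.perf_counter()
    for _ in range(bm_runs):
        sess.run(output, feed_dict={inputs: img})
    per_batch = (time.perf_counter() - start) / bm_runs
    print('Per batch inference time: %.6fs, FPS: %.1f' %
          (per_batch, 1.0 / per_batch))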
Example #13
def eval_once(ckpnt):
    """Evaluate on one checkpoint once."""
    ptches = np.zeros((14, 14, 32, 32))
    for i in range(14):
        for j in range(14):
            ind_x = i * 2
            ind_y = j * 2
            for k in range(5):
                for h in range(5):
                    ptches[i, j, ind_x + k, ind_y + h] = 1
    ptches = np.reshape(ptches, (14 * 14, 32, 32))

    with tf.Graph().as_default():
        features = get_features(False, 1)[0]
        if FLAGS.patching:
            features['images'] = features['cc_images']
            features['recons_label'] = features['cc_recons_label']
            features['labels'] = features['cc_labels']
        model = f_model.multi_gpu_model
        result = model([features])
        # merged = result['summary']
        correct_prediction_sum = result['correct']
        # almost_correct_sum = result['almost']
        # mid_act = result['mid_act']
        logits = result['logits']

        saver = tf.train.Saver()
        test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test_once')
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.3
        sess = tf.Session(config=config)
        # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpnt))
        saver.restore(sess, ckpnt)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        i = 0
        try:
            total_tp = 0
            for i in range(FLAGS.eval_size):
                #, g_ac, ac
                lb, tp, lg = sess.run([
                    features['recons_label'],
                    correct_prediction_sum,
                    logits,
                ])
                if FLAGS.patching:
                    batched_lg = np.sum(lg / np.sum(lg, axis=1, keepdims=True),
                                        axis=0)
                    batch_pred = np.argmax(batched_lg)
                    tp = np.equal(batch_pred, lb[0])

                total_tp += tp
            total_false = FLAGS.eval_size - total_tp
            print('false:{}, true:{}'.format(total_false, total_tp))
            # summary_tp = tf.Summary.FromString(summary_j)
            # summary_tp.value.add(tag='correct_prediction', simple_value=total_tp)
            # summary_tp.value.add(tag='wrong_prediction', simple_value=total_false)
            # summary_tp.value.add(
            #     tag='almost_wrong_prediction', simple_value=total_almost_false)
            # test_writer.add_summary(summary_tp, i + 1)
        except tf.errors.OutOfRangeError:
            print('Done eval for %d steps.' % i)
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
        # Wait for threads to finish.
        coord.join(threads)
        sess.close()
        test_writer.close()
Example #14
def eval_ensemble(ckpnts):
    """Evaluate on an ensemble of checkpoints."""
    with tf.Graph().as_default():
        first_features = get_features(False, 100)[0]
        h = first_features['height']
        d = first_features['depth']
        features = {
            'images': tf.placeholder(tf.float32, shape=(100, d, h, h)),
            'labels': tf.placeholder(tf.float32, shape=(100, 10)),
            'recons_image': tf.placeholder(tf.float32, shape=(100, d, h, h)),
            'recons_label': tf.placeholder(tf.int32, shape=(100)),
            'height': first_features['height'],
            'depth': first_features['depth']
        }

        model = f_model.multi_gpu_model
        result = model([features])
        logits = result['logits']
        config = tf.ConfigProto(allow_soft_placement=True)
        # saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpnt))
        batch_logits = np.zeros((FLAGS.eval_size // 100, 100, 10),
                                dtype=np.float32)
        batch_recons_label = np.zeros((FLAGS.eval_size // 100, 100),
                                      dtype=np.float32)
        batch_labels = np.zeros((FLAGS.eval_size // 100, 100, 10),
                                dtype=np.float32)
        batch_images = np.zeros((FLAGS.eval_size // 100, 100, d, h, h),
                                dtype=np.float32)
        batch_recons_image = np.zeros((FLAGS.eval_size // 100, 100, d, h, h),
                                      dtype=np.float32)
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        try:
            for i in range(FLAGS.eval_size // 100):
                (batch_recons_label[i, Ellipsis], batch_labels[i, Ellipsis],
                 batch_images[i, Ellipsis],
                 batch_recons_image[i, Ellipsis]) = sess.run([
                     first_features['recons_label'], first_features['labels'],
                     first_features['images'], first_features['recons_image']
                 ])
            for ckpnt in ckpnts:
                saver.restore(sess, ckpnt)
                for i in range(FLAGS.eval_size // 100):
                    logits_i = sess.run(logits,
                                        feed_dict={
                                            features['recons_label']:
                                            batch_recons_label[i, Ellipsis],
                                            features['labels']:
                                            batch_labels[i, Ellipsis],
                                            features['images']:
                                            batch_images[i, Ellipsis],
                                            features['recons_image']:
                                            batch_recons_image[i, Ellipsis]
                                        })
                    # batch_logits[i, ...] += softmax(logits_i)
                    batch_logits[i, Ellipsis] += logits_i
        except tf.errors.OutOfRangeError:
            print('Done eval for %d steps.' % i)
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
        coord.join(threads)
        sess.close()
        batch_pred = np.argmax(batch_logits, axis=2)
        total_wrong = np.sum(np.not_equal(batch_pred, batch_recons_label))
        print(total_wrong)
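
# A tiny numpy sketch (made-up numbers) of the ensembling done above: taking
# the argmax of the summed per-checkpoint logits gives the same prediction as
# the argmax of their mean.
import numpy as np

logits_ckpt_a = np.array([[2.0, 0.5], [0.2, 1.0]])  # 2 examples, 2 classes
logits_ckpt_b = np.array([[0.1, 0.3], [0.4, 2.0]])
labels = np.array([0, 1])

ensemble_logits = logits_ckpt_a + logits_ckpt_b
pred = np.argmax(ensemble_logits, axis=1)
print('wrong predictions:', np.sum(np.not_equal(pred, labels)))  # 0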
Example #15
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.data_type == "onehop":
        dataset_class = input_fns.OneHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif FLAGS.data_type == "twohop":
        dataset_class = input_fns.TwoHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif FLAGS.data_type == "threehop":
        dataset_class = input_fns.ThreeHopDataset
        eval_fn = evaluate.multihop_eval_fn
    elif (FLAGS.data_type == "wikimovie" or FLAGS.data_type == "wikimovie-2hop"
          or FLAGS.data_type == "wikimovie-3hop"):
        dataset_class = input_fns.WikiMovieDataset
        eval_fn = evaluate.wikimovie_eval_fn
    elif FLAGS.data_type == "hotpotqa":
        dataset_class = input_fns.HotpotQADataset
        eval_fn = evaluate.hotpot_eval_fn
    if FLAGS.model_type == "onehop":
        create_model_fn = model_fns.create_onehop_model
    elif FLAGS.model_type == "twohop":
        create_model_fn = model_fns.create_twohop_model
    elif FLAGS.model_type == "twohop-cascaded":
        create_model_fn = model_fns.create_twohopcascade_model
    elif FLAGS.model_type == "threehop":
        create_model_fn = functools.partial(model_fns.create_twohop_model,
                                            num_hops=3)
    elif FLAGS.model_type == "threehop-cascaded":
        create_model_fn = functools.partial(
            model_fns.create_twohopcascade_model, num_hops=3)
    elif FLAGS.model_type == "wikimovie":
        create_model_fn = model_fns.create_wikimovie_model
    elif FLAGS.model_type == "wikimovie-2hop":
        create_model_fn = functools.partial(model_fns.create_wikimovie_model,
                                            num_hops=2)
    elif FLAGS.model_type == "wikimovie-3hop":
        create_model_fn = functools.partial(model_fns.create_wikimovie_model,
                                            num_hops=3)
    elif FLAGS.model_type == "hotpotqa":
        create_model_fn = functools.partial(model_fns.create_hotpotqa_model,
                                            num_hops=FLAGS.num_hops)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Load mention and entity files.
    mention2text = json.load(
        tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "mention2text.json")))
    tf.logging.info("Loading metadata about entities and mentions...")
    entity2id, entity2name = json.load(
        tf.gfile.Open(os.path.join(FLAGS.train_data_dir, "entities.json")))
    entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()}
    # all_paragraphs = json.load(tf.gfile.Open(os.path.join(
    #     FLAGS.train_data_dir, "subparas.json")))
    # all_mentions = np.load(tf.gfile.Open(os.path.join(
    #     FLAGS.train_data_dir, "mentions.npy")))
    all_paragraphs = None
    all_mentions = None

    qa_config = QAConfig(
        qry_layers_to_use=FLAGS.qry_layers_to_use,
        qry_aggregation_fn=FLAGS.qry_aggregation_fn,
        dropout=FLAGS.question_dropout,
        qry_num_layers=FLAGS.question_num_layers,
        projection_dim=FLAGS.projection_dim,
        load_only_bert=FLAGS.load_only_bert,
        num_entities=len(entity2id),
        max_entity_len=FLAGS.max_entity_len,
        ensure_answer_sparse=FLAGS.ensure_answer_sparse,
        ensure_answer_dense=FLAGS.ensure_answer_dense,
        train_with_sparse=FLAGS.train_with_sparse,
        predict_with_sparse=FLAGS.predict_with_sparse,
        fix_sparse_to_one=FLAGS.fix_sparse_to_one,
        supervision=FLAGS.supervision,
        l2_normalize_db=FLAGS.l2_normalize_db,
        entity_score_aggregation_fn=FLAGS.entity_score_aggregation_fn,
        entity_score_threshold=FLAGS.entity_score_threshold,
        softmax_temperature=FLAGS.softmax_temperature,
        sparse_reduce_fn=FLAGS.sparse_reduce_fn,
        intermediate_loss=FLAGS.intermediate_loss,
        light=FLAGS.light,
        sparse_strategy=FLAGS.sparse_strategy,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    mips_config = MIPSConfig(ckpt_path=os.path.join(FLAGS.train_data_dir,
                                                    "mention_feats"),
                             ckpt_var_name="db_emb",
                             num_mentions=len(mention2text),
                             emb_size=FLAGS.projection_dim * 2,
                             num_neighbors=FLAGS.num_mips_neighbors)

    validate_flags_or_throw()

    tf.gfile.MakeDirs(FLAGS.output_dir)

    if FLAGS.do_train:
        json.dump(
            tf.app.flags.FLAGS.flag_values_dict(),
            tf.gfile.Open(os.path.join(FLAGS.output_dir, "flags.json"), "w"))

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=8,
        tpu_config=tf.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        session_config=tf.ConfigProto(log_device_placement=False))

    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_dataset = dataset_class(
            in_file=FLAGS.train_file,
            tokenizer=tokenizer,
            subject_mention_probability=FLAGS.subject_mention_probability,
            max_qry_length=FLAGS.max_query_length,
            is_training=True,
            entity2id=entity2id,
            tfrecord_filename=os.path.join(FLAGS.output_dir,
                                           "train.tf_record"))
        num_train_steps = int(train_dataset.num_examples /
                              FLAGS.train_batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
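        # Illustrative arithmetic (not from the original): with, say, 80,000
        # training examples, a batch size of 32 and 3 epochs this yields
        # 80000 / 32 * 3 = 7500 training steps, and a warmup_proportion of
        # 0.1 would then give 750 linear warmup steps.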

    summary_obj = None
    model_fn = model_fn_builder(
        bert_config=bert_config,
        qa_config=qa_config,
        mips_config=mips_config,
        init_checkpoint=FLAGS.init_checkpoint,
        e2m_checkpoint=os.path.join(FLAGS.train_data_dir, "ent2ment.npz"),
        m2e_checkpoint=os.path.join(FLAGS.train_data_dir, "coref.npz"),
        entity_id_checkpoint=os.path.join(FLAGS.train_data_dir, "entity_ids"),
        entity_mask_checkpoint=os.path.join(FLAGS.train_data_dir,
                                            "entity_mask"),
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu,
        create_model_fn=create_model_fn,
        summary_obj=summary_obj)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num orig examples = %d", train_dataset.num_examples)
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train(train_dataset, estimator, num_train_steps)

    if FLAGS.do_predict:
        eval_dataset = dataset_class(in_file=FLAGS.predict_file,
                                     tokenizer=tokenizer,
                                     subject_mention_probability=0.0,
                                     max_qry_length=FLAGS.max_query_length,
                                     is_training=False,
                                     entity2id=entity2id,
                                     tfrecord_filename=os.path.join(
                                         FLAGS.output_dir, "eval.tf_record"))

        continuous_eval(eval_dataset,
                        estimator,
                        mention2text,
                        entityid2name,
                        qa_config.supervision,
                        eval_fn,
                        paragraphs=all_paragraphs,
                        mentions=all_mentions)

    if FLAGS.do_test:
        # Load mention and entity files.
        mention2text = json.load(
            tf.gfile.Open(
                os.path.join(FLAGS.test_data_dir, "mention2text.json")))
        entity2id, entity2name = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "entities.json")))
        entityid2name = {str(i): entity2name[e] for e, i in entity2id.items()}
        all_paragraphs = json.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "subparas.json")))
        all_mentions = np.load(
            tf.gfile.Open(os.path.join(FLAGS.test_data_dir, "mentions.npy")))

        qa_config.num_entities = len(entity2id)
        mips_config = MIPSConfig(ckpt_path=os.path.join(
            FLAGS.test_data_dir, "mention_feats"),
                                 ckpt_var_name="db_emb",
                                 num_mentions=len(mention2text),
                                 emb_size=FLAGS.projection_dim * 2,
                                 num_neighbors=FLAGS.num_mips_neighbors)

        model_fn = model_fn_builder(
            bert_config=bert_config,
            qa_config=qa_config,
            mips_config=mips_config,
            init_checkpoint=FLAGS.init_checkpoint,
            e2m_checkpoint=os.path.join(FLAGS.test_data_dir, "ent2ment.npz"),
            m2e_checkpoint=os.path.join(FLAGS.test_data_dir, "coref.npz"),
            entity_id_checkpoint=os.path.join(FLAGS.test_data_dir,
                                              "entity_ids"),
            entity_mask_checkpoint=os.path.join(FLAGS.test_data_dir,
                                                "entity_mask"),
            learning_rate=FLAGS.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=FLAGS.use_tpu,
            use_one_hot_embeddings=FLAGS.use_tpu,
            create_model_fn=create_model_fn,
            summary_obj=summary_obj)
        estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=FLAGS.train_batch_size,
            predict_batch_size=FLAGS.predict_batch_size)

        eval_dataset = dataset_class(in_file=FLAGS.test_file,
                                     tokenizer=tokenizer,
                                     subject_mention_probability=0.0,
                                     max_qry_length=FLAGS.max_query_length,
                                     is_training=False,
                                     entity2id=entity2id,
                                     tfrecord_filename=os.path.join(
                                         FLAGS.output_dir, "test.tf_record"))

        if tf.gfile.Exists(os.path.join(FLAGS.output_dir, "best_model.meta")):
            ckpt_path = os.path.join(FLAGS.output_dir, "best_model")
        else:
            ckpt_path = None
        output_prediction_file = os.path.join(FLAGS.output_dir,
                                              "test_predictions.json")
        metrics = single_eval(eval_dataset,
                              estimator,
                              ckpt_path,
                              mention2text,
                              entityid2name,
                              qa_config.supervision,
                              output_prediction_file,
                              eval_fn,
                              paragraphs=all_paragraphs,
                              mentions=all_mentions)
        with tf.gfile.Open(os.path.join(FLAGS.output_dir, "test_metrics.txt"),
                           "w") as fo:
            for metric, value in metrics.items():
                tf.logging.info("%s: %.4f", metric, value)
                fo.write("%s %.4f\n" % (metric, value))
Beispiel #16
0
    def __init__(self, **kwargs):
        np.random.seed(0)
        tf.set_random_seed(0)

        self.batch_size = kwargs.pop('batch_size')
        self.data_sets = kwargs.pop('data_sets')
        self.train_dir = kwargs.pop('train_dir', 'output')
        log_dir = kwargs.pop('log_dir', 'log')
        self.model_name = kwargs.pop('model_name')
        self.num_classes = kwargs.pop('num_classes')
        self.initial_learning_rate = kwargs.pop('initial_learning_rate')
        self.decay_epochs = kwargs.pop('decay_epochs')
        self.avextol = kwargs.pop('avextol')

        self.keep_probs = kwargs.pop('keep_probs', None)
        self.mini_batch = kwargs.pop('mini_batch', True)
        self.damping = kwargs.pop('damping', 0.0)

        if not os.path.exists(self.train_dir):
            os.makedirs(self.train_dir)

        # Initialize session
        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
        gpu_options = tf.GPUOptions(allow_growth=True)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        # config = tf.ConfigProto()
        # self.sess = tf.Session(config=config)
        # K.set_session(self.sess)

        # Setup input
        self.input_placeholder, self.labels_placeholder = self.placeholder_inputs(
        )
        self.num_train_examples = self.data_sets.train.labels.shape[0]
        self.num_test_examples = self.data_sets.test.labels.shape[0]

        # Setup inference and training
        if self.keep_probs is not None:
            self.keep_probs_placeholder = tf.placeholder(tf.float32, shape=(2))
            self.logits = self.inference(self.input_placeholder,
                                         self.keep_probs_placeholder)
        elif hasattr(self, 'inference_needs_labels'):
            self.logits = self.inference(self.input_placeholder,
                                         self.labels_placeholder)
        else:
            self.logits = self.inference(self.input_placeholder)

        self.total_loss, self.loss_no_reg, self.indiv_loss_no_reg = self.loss(
            self.logits, self.labels_placeholder)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.learning_rate = tf.Variable(self.initial_learning_rate,
                                         name='learning_rate',
                                         trainable=False)
        self.learning_rate_placeholder = tf.placeholder(tf.float32)
        self.update_learning_rate_op = tf.assign(
            self.learning_rate, self.learning_rate_placeholder)

        # self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        # self.train_op = self.optimizer.minimize(self.total_loss, global_step=self.global_step)

        self.train_op, self.reset_optimizer_op = self.get_train_op(
            self.total_loss, self.global_step, self.learning_rate)
        self.train_sgd_op = self.get_train_sgd_op(self.total_loss,
                                                  self.global_step,
                                                  self.learning_rate * 10)
        # self.train_op=self.train_sgd_op
        self.accuracy_op = self.get_accuracy_op(self.logits,
                                                self.labels_placeholder)
        self.preds = self.predictions(self.logits)

        # Setup misc
        self.saver = tf.train.Saver()

        # Setup gradients and Hessians
        self.params = self.get_all_params()
        self.grad_total_loss_op = tf.gradients(self.total_loss, self.params)
        self.grad_loss_no_reg_op = tf.gradients(self.loss_no_reg, self.params)
        self.grad_loss_r = tf.gradients(tf.squeeze(self.logits), self.params)
        self.v_placeholder = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in self.params
        ]
        self.u_placeholder = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in self.params
        ]

        self.hessian_vector = hessian_vector_product(self.total_loss,
                                                     self.params,
                                                     self.v_placeholder)
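        # hessian_vector_product is expected to compute the Hessian-vector
        # product H.v without materializing H, typically via double backprop
        # (Pearlmutter's trick): differentiate the scalar grad(loss).v, so
        # each evaluation costs roughly two backward passes.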

        self.grad_loss_wrt_input_op = tf.gradients(self.total_loss,
                                                   self.input_placeholder)

        # Because tf.gradients auto accumulates, we probably don't need the add_n (or even reduce_sum)
        self.influence_op = tf.add_n([
            tf.reduce_sum(tf.multiply(a, array_ops.stop_gradient(b)))
            for a, b in zip(self.grad_total_loss_op, self.v_placeholder)
        ])
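        # influence_op is the scalar inner product grad_theta(total_loss) . v
        # summed over all parameter tensors. In the influence-function recipe
        # v is typically fed with s_test = H^{-1} grad_theta L(z_test), so
        # this dot product gives (up to sign) the influence of the current
        # training example on the test loss.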

        self.grad_influence_wrt_input_op = tf.gradients(
            self.influence_op, self.input_placeholder)

        self.checkpoint_file = os.path.join(self.train_dir,
                                            "%s-checkpoint" % self.model_name)

        self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(
            self.data_sets.train)
        self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(
            self.data_sets.test)

        init = tf.global_variables_initializer()
        self.sess.run(init)

        self.vec_to_list = self.get_vec_to_list_fn()
        self.adversarial_loss, self.indiv_adversarial_loss = self.adversarial_loss(
            self.logits, self.labels_placeholder)
        if self.adversarial_loss is not None:
            self.grad_adversarial_loss_op = tf.gradients(
                self.adversarial_loss, self.params)
def main(args):
    print(args)
    tf.disable_eager_execution()
    if args.memory_limit:
        physical_devices = tf.config.list_physical_devices('GPU')
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[0], [
                tf.config.experimental.VirtualDeviceConfiguration(
                    memory_limit=args.memory_limit)
            ])

    assert args.microbatches is None
    args.microbatches = args.batch_size

    data_fn = data.data_fn_dict[args.experiment][int(args.dummy_data)]
    kwargs = {
        'max_features': args.max_features,
        'max_len': args.max_len,
        'format': 'NHWC',
    }
    if args.dummy_data:
        kwargs['num_examples'] = args.batch_size * 2
    (train_data, train_labels), _ = data_fn(**kwargs)
    num_train_eg = train_data.shape[0]

    loss_fn = tf.nn.sparse_softmax_cross_entropy_with_logits
    if args.experiment == 'logreg':
        loss_fn = lambda labels, logits: tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels, logits=tf.squeeze(logits))
        train_labels = train_labels.astype('float32')
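        # sigmoid_cross_entropy_with_logits requires float labels with the
        # same shape as the (squeezed, single-unit) logits, hence the cast.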

    model = partial(model_dict[args.experiment],
                    features=train_data,
                    max_features=args.max_features,
                    args=args)

    if args.use_xla:
        # Enable XLA auto-clustering both globally (via TF_XLA_FLAGS) and at
        # the session level (ConfigProto JIT level); either alone may suffice.
        assert os.environ['TF_XLA_FLAGS'] == '--tf_xla_auto_jit=2'
        session_config = tf.ConfigProto()
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2
        run_config = tf.estimator.RunConfig(session_config=session_config)
        print('Using XLA!')
    else:
        run_config = None
        print('NOT using XLA!')

    model_obj = tf.estimator.Estimator(model_fn=partial(
        nn_model_fn, model, loss_fn, args),
                                       config=run_config)
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={'x': train_data},
        y=train_labels,
        batch_size=args.batch_size,
        num_epochs=args.epochs,
        shuffle=True)

    steps_per_epoch = num_train_eg // args.batch_size
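    # Each estimator.train() call below runs exactly one epoch's worth of
    # batches, so every entry appended to `timings` measures one full epoch.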
    timings = []
    for epoch in range(1, args.epochs + 1):
        start = time.perf_counter()
        model_obj.train(input_fn=train_input_fn, steps=steps_per_epoch)
        duration = time.perf_counter() - start
        print("Time Taken: ", duration)
        timings.append(duration)

        if args.dpsgd:
            # eps = compute_epsilon(epoch, num_train_eg, args)
            # print('For delta=1e-5, the current epsilon is: %.2f' % eps)
            print('Trained with DPSGD optimizer')
        else:
            print('Trained with vanilla non-private SGD optimizer')

    if not args.no_save:
        utils.save_runtimes(__file__.split('.')[0], args, timings)
    else:
        print('Not saving!')
    print('Done!')
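
# A minimal sketch (not part of the original script) of what the elided
# compute_epsilon helper referenced above might look like. It assumes the RDP
# accountant from the tensorflow_privacy package, a hypothetical
# args.noise_multiplier flag, and a fixed delta of 1e-5; module paths and flag
# names may differ in the real codebase.
def compute_epsilon_sketch(epoch, num_train_eg, args):
    from tensorflow_privacy.privacy.analysis.rdp_accountant import (
        compute_rdp, get_privacy_spent)
    if args.noise_multiplier == 0.0:
        return float('inf')
    orders = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))
    sampling_probability = args.batch_size / num_train_eg
    steps = epoch * (num_train_eg // args.batch_size)
    rdp = compute_rdp(q=sampling_probability,
                      noise_multiplier=args.noise_multiplier,
                      steps=steps,
                      orders=orders)
    # Take the epsilon component of the returned privacy spending.
    return get_privacy_spent(orders, rdp, target_delta=1e-5)[0]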
Beispiel #18
0
def evaluate(test_list, checkpoint_dir):
    print('Running PRLNet -Evaluation!')
    save_dir_test = os.path.join("./output/results")
    exists_or_mkdir(save_dir_test)
    # --------------------------------- set model ---------------------------------
    # data fetched within range: [-1,1]
    input_imgs, target_imgs, num = input_producer(test_list,
                                                  in_channels,
                                                  batch_size,
                                                  need_shuffle=False)
    contents, details, pred_imgs = gen_PRLNet(input_imgs,
                                              out_channels,
                                              is_train=False,
                                              reuse=False)

    # --------------------------------- evaluation ---------------------------------
    # set GPU resources
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.gpu_options.per_process_gpu_memory_fraction = 0.45

    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Restore model weights from previously saved model
        check_pt = tf.train.get_checkpoint_state(checkpoint_dir)
        if check_pt and check_pt.model_checkpoint_path:
            saver.restore(sess, check_pt.model_checkpoint_path)
            print('model is loaded successfully.')
        else:
            print('# error: loading checkpoint failed.')
            return None

        cnt = 0
        psnr_list = []
        ssim_list = []
        start_time = time.time()
        while not coord.should_stop():
            tm = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
            print('%s evaluating: [%d - %d]' % (tm, cnt, cnt + batch_size))
            pd_images, gt_images = sess.run([pred_imgs, target_imgs])
            save_images_from_batch(pd_images, save_dir_test, cnt)
            psnr, ssim = measure_quality(pd_images, gt_images)
            psnr_list.append(psnr)
            ssim_list.append(ssim)
            cnt += batch_size
            if cnt >= num:
                coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)
        sess.close()
        print("Testing finished! consumes %f sec" % (time.time() - start_time))
        print("Numerical accuracy computing ...")
        # numerical evaluation
        mean_psnr = np.mean(np.array(psnr_list))
        stde_psnr = np.std(np.array(psnr_list))
        mean_ssim = np.mean(np.array(ssim_list))
        stde_ssim = np.std(np.array(ssim_list))
        save_path = os.path.join("./output/", "accuracy.txt")
        with open(save_path, 'w') as f:
            f.write('mean psnr:' + str(mean_psnr) + '\n')
            f.write('stde psnr:' + str(stde_psnr) + '\n\n')
            f.write('mean ssim:' + str(mean_ssim) + '\n')
            f.write('stde ssim:' + str(stde_ssim) + '\n')
        print("Done!")
Beispiel #19
0
def train(replication_factor, batch_size, batch_per_step, profile, num_iter,
          time_steps):
    """Launch training."""

    # Set up in-feeds for the data
    with tf.device('cpu'):
        data_generator = EnvGenerator(batch_size, time_steps)
        items = next(data_generator)
        output_types = tuple((tf.dtypes.as_dtype(i.dtype) for i in items))
        output_shapes = tuple((tf.TensorShape(i.shape) for i in items))
        total_bytes = 0
        for i in items:
            total_bytes += i.nbytes
        print(f'Input data size = {total_bytes/1000000} MB/batch')
        dataset = tf.data.Dataset.from_generator(data_generator,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        infeed_queue = ipu_infeed_queue.IPUInfeedQueue(
            dataset, "InfeedQueue", replication_factor=replication_factor)
        data_init = infeed_queue.initializer

    # Compile loss op
    with ipu_scope("/device:IPU:0"):
        total_loss = ipu_compiler.compile(
            lambda: loops.repeat(batch_per_step,
                                 build_train_op,
                                 infeed_queue=infeed_queue,
                                 inputs=[tf.constant(0.0, dtype=DTYPE)]))
    # Set up report op optionally.
    if profile:
        with tf.device('cpu'):
            report = gen_ipu_ops.ipu_event_trace()

    # Set up session on IPU
    opts = utils.create_ipu_config(
        profiling=profile,
        use_poplar_text_report=use_poplar_text_report,
        profile_execution=profile,
        merge_infeed_io_copies=True)
    opts = utils.set_optimization_options(
        opts, max_cross_replica_sum_buffer_size=10000000)
    opts = utils.auto_select_ipus(opts, [replication_factor])
    utils.configure_ipu_system(opts)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=True))

    # Initialize variables
    utils.move_variable_initialization_to_cpu()
    sess.run([tf.global_variables_initializer(), data_init])

    # Run training and time
    total_time = 0.0
    total_samples = 0
    # Skip the first few iterations when measuring items/sec: the infeed may
    # still be buffering extra input data and the first IPU run includes XLA
    # compilation.
    skip_iterations = 5
    for iters in range(num_iter):
        data_generator.reset_counter()
        t0 = time.perf_counter()
        sess.run(total_loss)
        t1 = time.perf_counter()

        if profile:
            raw_reports = sess.run(report)
            if use_poplar_text_report:
                # extract the report
                rep = utils.extract_all_strings_from_event_trace(raw_reports)
                print("Writing profiling report to %s" % report_dest)
                with open(report_dest, "w") as f:
                    f.write(rep)
            else:
                os.makedirs('profile_rl', exist_ok=True)
                save_tf_report(raw_reports, log_dir='profile_rl')
                print("Writing profiling report to profile_rl")
            break

        if iters > skip_iterations:
            total_time += (t1 - t0)
            total_samples += (batch_size * batch_per_step * replication_factor)
            print("Average %.1f items/sec" % (total_samples / total_time))
Beispiel #20
0
def train(train_list, val_list, debug_mode=True):
    print('Running PRLNet -Training!')
    # create folders to save trained model and results
    graph_dir = './graph'
    checkpt_dir = './model'
    ouput_dir = './output'
    exists_or_mkdir(graph_dir, need_remove=True)
    exists_or_mkdir(ouput_dir)
    exists_or_mkdir(checkpt_dir)

    # --------------------------------- load data ---------------------------------
    # data fetched at range: [-1,1]
    input_imgs, target_imgs, num = input_producer(train_list,
                                                  in_channels,
                                                  batch_size,
                                                  need_shuffle=True)
    if debug_mode:
        input_val, target_val, num_val = input_producer(val_list,
                                                        in_channels,
                                                        batch_size,
                                                        need_shuffle=False)

    pred_content, pred_detail, pred_imgs = gen_PRLNet(input_imgs,
                                                      out_channels,
                                                      is_train=True,
                                                      reuse=False)
    if debug_mode:
        _, _, pred_val = gen_PRLNet(input_val,
                                    out_channels,
                                    is_train=False,
                                    reuse=True)

    # --------------------------------- loss terms ---------------------------------
    with tf.name_scope('Loss') as loss_scp:
        target_224 = tf.image.resize_images(target_imgs,
                                            size=[224, 224],
                                            method=0,
                                            align_corners=False)
        predict_224 = tf.image.resize_images(pred_imgs,
                                             size=[224, 224],
                                             method=0,
                                             align_corners=False)
        vgg19_api = VGG19("../vgg19.npy")
        vgg_map_targets = vgg19_api.build((target_224 + 1) / 2,
                                          is_rgb=(in_channels == 3))
        vgg_map_predict = vgg19_api.build((predict_224 + 1) / 2,
                                          is_rgb=(in_channels == 3))

        content_loss = tf.losses.mean_squared_error(target_imgs, pred_content)
        vgg_loss = 2e-6 * tf.losses.mean_squared_error(vgg_map_targets,
                                                       vgg_map_predict)
        l1_loss = tf.reduce_mean(tf.abs(target_imgs - pred_imgs))
        mse_loss = tf.losses.mean_squared_error(target_imgs, pred_imgs)

        loss_op = content_loss + 2 * vgg_loss + l1_loss
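        # Final objective = pixel-space content loss + 2x perceptual (VGG)
        # loss + L1 reconstruction term; mse_loss is computed above but not
        # added to loss_op, so it only serves as a monitoring metric here.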

    # --------------------------------- solver definition ---------------------------------
    global_step = tf.Variable(0, name='global_step', trainable=False)
    iters_per_epoch = np.floor_divide(num, batch_size)
    lr_decay = tf.train.polynomial_decay(
        learning_rate=learning_rate,
        global_step=global_step,
        decay_steps=iters_per_epoch * n_epochs,
        end_learning_rate=learning_rate / 100.0,
        power=0.9)
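    # polynomial_decay follows
    #   lr(t) = (lr0 - lr_end) * (1 - t / decay_steps)^power + lr_end,
    # so with power=0.9 the rate falls smoothly from learning_rate to
    # learning_rate / 100 over the full training schedule.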

    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.name_scope('optimizer'):
        with tf.control_dependencies(update_ops):
            gen_vars = [
                var for var in tf.trainable_variables()
                if var.name.startswith("PRLNet")
            ]
            gen_optim = tf.train.AdamOptimizer(lr_decay, beta1)
            gen_grads_and_vars = gen_optim.compute_gradients(loss_op,
                                                             var_list=gen_vars)
            train_op = gen_optim.apply_gradients(gen_grads_and_vars,
                                                 global_step=global_step)

    # --------------------------------- model training ---------------------------------
    '''
    if debug_mode:
        with tf.name_scope('summarise') as sum_scope:
            tf.summary.scalar('loss', loss_op)
            tf.summary.scalar('learning rate', lr_decay)
            tf.summary.image('predicts', pred_imgs, max_outputs=9)
            summary_op = tf.summary.merge_all()
    '''

    with tf.name_scope("parameter_count"):
        num_parameters = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    # set GPU resources
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.gpu_options.per_process_gpu_memory_fraction = 0.45

    saver = tf.train.Saver(max_to_keep=1)
    loss_list = []
    psnr_list = []
    with tf.Session(config=config) as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        sess.run(tf.global_variables_initializer())
        print(">>------------>>> [Training_Num] =%d" % num)
        print(">>------------>>> [Parameter_Num] =%d" %
              sess.run(num_parameters))
        '''
        if debug_mode:
            with tf.name_scope(sum_scope):
                summary_writer = tf.summary.FileWriter(graph_dir, graph=sess.graph)
        '''
        for epoch in range(0, n_epochs):
            start_time = time.time()
            epoch_loss, n_iters = 0, 0
            for step in range(0, num, batch_size):
                _, loss = sess.run([train_op, loss_op])
                epoch_loss += loss
                n_iters += 1
                # iteration information
                if n_iters % display_steps == 0:
                    tm = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S.%f')
                    print("%s >> [%d/%d] iter: %d  loss: %4.4f" %
                          (tm, epoch, n_epochs, n_iters, loss))
                    '''
                    if debug_mode:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)
                    '''

            # epoch information
            epoch_loss = epoch_loss / n_iters
            loss_list.append(epoch_loss)
            print(
                "[*] ----- Epoch: %d/%d | Loss: %4.4f | Time-consumed: %4.3f -----"
                % (epoch, n_epochs, epoch_loss, (time.time() - start_time)))

            if (epoch + 1) % save_epochs == 0:
                if debug_mode:
                    print("----- validating model ...")
                    mean_psnr, nn = 0, 0
                    for idx in range(0, num_val, batch_size):
                        predicts, groundtruths = sess.run(
                            [pred_val, target_val])
                        save_images_from_batch(predicts, ouput_dir, idx)
                        psnr = measure_psnr(predicts, groundtruths)
                        mean_psnr += psnr
                        nn += 1
                    psnr_list.append(mean_psnr / nn)
                    print("----- psnr:%4.4f" % (mean_psnr / nn))

                print("----- saving model  ...")
                saver.save(sess,
                           os.path.join(checkpt_dir, "model.ckpt"),
                           global_step=global_step)
                save_list(os.path.join(ouput_dir, "loss"), loss_list)
                save_list(os.path.join(ouput_dir, "psnr"), psnr_list)

        # stop data queue
        coord.request_stop()
        coord.join(threads)
        # write out the loss list
        save_list(os.path.join(ouput_dir, "loss"), loss_list)
        save_list(os.path.join(ouput_dir, "psnr"), psnr_list)
        print("Training finished!")

    return None
Beispiel #21
0
  def export(self,
             output_dir: Text,
             tflite_path: Text = None,
             tensorrt: Text = None):
    """Export a saved model, frozen graph, and potential tflite/tensorrt model.

    Args:
      output_dir: the output folder for saved model.
      tflite_path: the path for saved tflite file.
      tensorrt: If not None, must be {'FP32', 'FP16', 'INT8'}.
    """
    signitures = self.signitures
    signature_def_map = {
        'serving_default':
            tf.saved_model.predict_signature_def(
                {signitures['image_arrays'].name: signitures['image_arrays']},
                {signitures['prediction'].name: signitures['prediction']}),
    }
    b = tf.saved_model.Builder(output_dir)
    b.add_meta_graph_and_variables(
        self.sess,
        tags=['serve'],
        signature_def_map=signature_def_map,
        assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
        clear_devices=True)
    b.save()
    logging.info('Model saved at %s', output_dir)

    # also save freeze pb file.
    graphdef = self.freeze()
    pb_path = os.path.join(output_dir, self.model_name + '_frozen.pb')
    tf.io.gfile.GFile(pb_path, 'wb').write(graphdef.SerializeToString())
    logging.info('Frozen graph saved at %s', pb_path)

    if tflite_path:
      height, width = utils.parse_image_size(self.params['image_size'])
      input_name = signitures['image_arrays'].op.name
      input_shapes = {input_name: [None, height, width, 3]}
      converter = tf.lite.TFLiteConverter.from_saved_model(
          output_dir,
          input_arrays=[input_name],
          input_shapes=input_shapes,
          output_arrays=[signitures['prediction'].op.name])
      converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]
      tflite_model = converter.convert()

      tf.io.gfile.GFile(tflite_path, 'wb').write(tflite_model)
      logging.info('TFLite is saved at %s', tflite_path)

    if tensorrt:
      from tensorflow.python.compiler.tensorrt import trt  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
      sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
      trt_path = os.path.join(output_dir, 'tensorrt_' + tensorrt.lower())
      trt.create_inference_graph(
          None,
          None,
          precision_mode=tensorrt,
          input_saved_model_dir=output_dir,
          output_saved_model_dir=trt_path,
          session_config=sess_config)
      logging.info('TensorRT model is saved at %s', trt_path)
Beispiel #22
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('checkpoint_path', help='Path to checkpoint to load')
    parser.add_argument('--input-size',
                        type=int,
                        default=256,
                        help='Shape of input to use (depends on checkpoint)')
    parser.add_argument('--inter',
                        nargs='+',
                        type=int,
                        help='Interpolate between the 4 style given')
    arguments = parser.parse_args()

    style_control = []
    style_inter = arguments.inter
    if not style_inter:
        for style_index in range(16):
            style_control.append([0.0] * 16)
            style_control[-1][style_index] = 1
    else:
        for col in range(4):
            for row in range(4):
                style_index = (col % 4) + (row * 4)
                style_control.append([0.0] * 16)
                # top left style
                style_control[-1][style_inter[0]] = ((3 - row) / 3) * (
                    (3 - col) / 3)
                # top right style
                style_control[-1][style_inter[1]] = (row / 3) * ((3 - col) / 3)
                # bottom left style
                style_control[-1][style_inter[2]] = ((3 - row) / 3) * (col / 3)
                # bottom right style
                style_control[-1][style_inter[3]] = (row / 3) * (col / 3)
    style_control = np.asarray(style_control, dtype=float)
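    # In interpolation mode each of the 16 grid cells blends the four chosen
    # styles with bilinear weights ((3 - row)/3 vs row/3 on one axis,
    # (3 - col)/3 vs col/3 on the other); the four weights sum to 1 in every
    # cell.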

    capture = cv2.VideoCapture(-1)

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    import tensorflow as tf
    if tf.__version__.split('.')[0] == '2':
        import tensorflow.compat.v1 as tf
        tf.disable_v2_behavior()
    import tensorflow.compat.v1 as tf1
    from engine_multi import EngineMultiStyle

    gpu_options = tf1.GPUOptions(allow_growth=True)
    session_config = tf1.ConfigProto(gpu_options=gpu_options)
    with tf1.Session(config=session_config).as_default() as session:
        input_size = arguments.input_size
        engine = EngineMultiStyle(session, input_size,
                                  arguments.checkpoint_path)

        mosaic = np.zeros((4 * input_size, 4 * input_size, 3), dtype=np.uint8)
        while (True):
            # Capture frame-by-frame
            _, frame = capture.read()

            frame = cv2.resize(frame,
                               (arguments.input_size, arguments.input_size))
            input_image = np.asarray(frame)

            outputs = engine.predict([input_image] * 16, style_control)
            for row in range(4):
                for col in range(4):
                    mosaic[col * input_size:(col + 1) * input_size,
                           row * input_size:(row + 1) *
                           input_size] = outputs[(4 * col) + row]

            # Display the resulting frame
            cv2.imshow('original', input_image)
            cv2.imshow('style', mosaic)
            key_pressed = cv2.waitKey(1) & 0xFF
            if key_pressed == ord('q'):
                break

    # When everything done, release the capture
    capture.release()
    cv2.destroyAllWindows()
Beispiel #23
0
def train(config):
    Model_cls = HandwritingVRNNGmmModel
    Dataset_cls = HandWritingDatasetConditionalTF

    # Dataset
    training_dataset = Dataset_cls(config['training_data'],
                                   use_bow_labels=config['use_bow_labels'])

    num_training_iterations = int(training_dataset.num_samples /
                                  config['batch_size'])
    print("# training steps per epoch: " + str(num_training_iterations))

    # Create a tensorflow sub-graph that loads batches of samples.
    if config.get('use_bucket_feeder', True) and training_dataset.is_dynamic:
        bucket_edges = training_dataset.get_seq_len_histogram(
            num_bins=15, collapse_first_and_last_bins=[2, -2])
        data_feeder = DataFeederTF(training_dataset,
                                   config['num_epochs'],
                                   config['batch_size'],
                                   queue_capacity=1024)

        sequence_length, inputs, targets = data_feeder.batch_queue_bucket(
            bucket_edges,
            dynamic_pad=training_dataset.is_dynamic,
            queue_capacity=300,
            queue_threads=4)
    else:
        # Training data
        data_feeder = DataFeederTF(training_dataset,
                                   config['num_epochs'],
                                   config['batch_size'],
                                   queue_capacity=1024)
        sequence_length, inputs, targets = data_feeder.batch_queue(
            dynamic_pad=training_dataset.is_dynamic,
            queue_capacity=512,
            queue_threads=4)

    if config.get('use_staging_area', False):
        staging_area = TFStagingArea([sequence_length, inputs, targets],
                                     device_name="/gpu:0")
        sequence_length, inputs, targets = staging_area.tensors

    # Create step counter (used by optimization routine and learning rate function.)
    global_step = tf.compat.v1.get_variable(name='global_step',
                                            trainable=False,
                                            initializer=1)

    # Annealing KL-divergence loss.
    kld_loss_weight_backup = config['loss_weights']['kld_loss']
    if isinstance(config['loss_weights']['kld_loss'], np.ndarray):
        # Create a piecewise increasing kld loss weight.
        num_steps = len(config['loss_weights']['kld_loss'])
        values = np.linspace(0, 1, num_steps + 1).tolist()
        boundaries = (config['loss_weights']['kld_loss'] *
                      num_training_iterations).tolist()

        config['loss_weights']['kld_loss'] = tf.train.piecewise_constant(
            global_step, boundaries=boundaries, values=values)
        tf.summary.scalar('training/kld_loss_weight',
                          config['loss_weights']['kld_loss'],
                          collections=["training_status"])

    # Create training graph.
    with tf.name_scope("training"):
        model = Model_cls(config,
                          reuse=False,
                          input_op=inputs,
                          target_op=targets,
                          input_seq_length_op=sequence_length,
                          input_dims=training_dataset.input_dims,
                          target_dims=training_dataset.target_dims,
                          mode="training",
                          data_processor=training_dataset)

        model.build_graph()
        model.create_image_summary(training_dataset.prepare_for_visualization)

    # Create sampling graph.
    with tf.name_scope("sampling"):
        sampling_input_op = tf.compat.v1.placeholder(
            tf.float32,
            shape=[
                1, training_dataset.sequence_length,
                sum(training_dataset.input_dims)
            ])
        sampling_sequence_length_op = tf.compat.v1.placeholder(tf.int32,
                                                               shape=[1])
        sampling_model = Model_cls(
            config,
            reuse=True,
            input_op=sampling_input_op,
            target_op=None,
            input_seq_length_op=sampling_sequence_length_op,
            input_dims=training_dataset.input_dims,
            target_dims=training_dataset.target_dims,
            batch_size=1,
            mode="sampling",
            data_processor=training_dataset)
        sampling_model.build_graph()
        sampling_model.create_image_summary(
            training_dataset.prepare_for_visualization)

    # Validation model.
    if config.get('validate_model', False):
        validation_dataset = Dataset_cls(
            config['validation_data'], use_bow_labels=config['use_bow_labels'])

        num_validation_iterations = int(validation_dataset.num_samples /
                                        config['batch_size'])
        print("# validation steps per epoch: " +
              str(num_validation_iterations))

        valid_data_feeder = DataFeederTF(validation_dataset,
                                         config['num_epochs'],
                                         config['batch_size'],
                                         queue_capacity=1024,
                                         shuffle=False)
        valid_sequence_length, valid_inputs, valid_targets = valid_data_feeder.batch_queue(
            dynamic_pad=validation_dataset.is_dynamic,
            queue_capacity=512,
            queue_threads=4)

        if 'use_staging_area' in config and config['use_staging_area']:
            valid_staging_area = TFStagingArea(
                [valid_sequence_length, valid_inputs, valid_targets],
                device_name="/gpu:0")
            valid_sequence_length, valid_inputs, valid_targets = valid_staging_area.tensors

        with tf.name_scope("validation"):
            valid_model = Model_cls(config,
                                    reuse=True,
                                    input_op=valid_inputs,
                                    target_op=valid_targets,
                                    input_seq_length_op=valid_sequence_length,
                                    input_dims=validation_dataset.input_dims,
                                    target_dims=validation_dataset.target_dims,
                                    mode="training",
                                    data_processor=validation_dataset)
            valid_model.build_graph()

    # Create a session object and initialize parameters.
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            allow_soft_placement=True))

    if config['learning_rate_type'] == 'exponential':
        learning_rate = tf.train.exponential_decay(
            config['learning_rate'],
            global_step=global_step,
            decay_steps=config['learning_rate_decay_steps'],
            decay_rate=config['learning_rate_decay_rate'],
            staircase=False)
        tf.summary.scalar('training/learning_rate',
                          learning_rate,
                          collections=["training_status"])
    elif config['learning_rate_type'] == 'fixed':
        learning_rate = config['learning_rate']
    else:
        raise Exception("Invalid learning rate type")

    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Gradient clipping and a sanity check.
    grads = list(
        zip(tf.gradients(model.loss, tf.trainable_variables()),
            tf.trainable_variables()))
    grads_clipped = []
    with tf.name_scope("grad_clipping"):
        for grad, var in grads:
            if grad is not None:
                if config['grad_clip_by_norm'] > 0:
                    grads_clipped.append(
                        (tf.clip_by_norm(grad,
                                         config['grad_clip_by_norm']), var))
                elif config['grad_clip_by_value'] > 0:
                    # Clip each gradient element to [-clip_value, +clip_value].
                    grads_clipped.append(
                        (tf.clip_by_value(grad, -config['grad_clip_by_value'],
                                          config['grad_clip_by_value']), var))
                else:
                    grads_clipped.append((grad, var))
    train_op = optimizer.apply_gradients(grads_and_vars=grads_clipped,
                                         global_step=global_step)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)

    run_opts = None
    run_opts_metadata = None
    if config.get('create_timeline', False):
        run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
                                 timeout_in_ms=100000)
        run_opts_metadata = tf.RunMetadata()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
    if config['model_dir']:
        # If model directory already exists, continue training by restoring computation graph.
        # Restore variables.
        if config['checkpoint_id'] is None:
            checkpoint_path = tf.train.latest_checkpoint(config['model_dir'])
        else:
            checkpoint_path = os.path.join(config['model_dir'],
                                           config['checkpoint_id'])

        print("Continue training with model " + checkpoint_path)
        saver.restore(sess, checkpoint_path)

        step = tf.train.global_step(sess, global_step)
        start_epoch = round(
            step / (training_dataset.num_samples / config['batch_size']))
    else:
        # Fresh start
        # Create a unique output directory for this experiment.
        config['model_dir'] = get_model_dir_timestamp(
            base_path=config['model_save_dir'],
            prefix="tf",
            suffix=config['experiment_name'],
            connector="-")
        print("Saving to {}\n".format(config['model_dir']))
        start_epoch = 1
        step = 1

    coord = tf.train.Coordinator()
    data_feeder.init(
        sess, coord
    )  # Enqueue threads must be initialized after definition of train_op.
    if config.get('validate_model', False):
        valid_data_feeder.init(sess, coord)
    queue_threads = tf.train.start_queue_runners(coord=coord, sess=sess)
    queue_threads.append(data_feeder.enqueue_threads)

    # Register and create summary ops.
    summary_dir = os.path.join(config['model_dir'], "summary")
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Create summaries to visualize weights and gradients.
    if config['tensorboard_verbose'] > 1:
        for grad, var in grads:
            tf.summary.histogram(var.name,
                                 var,
                                 collections=["training_status"])
            tf.summary.histogram(var.name + '/gradient',
                                 grad,
                                 collections=["training_status"])

    if config['tensorboard_verbose'] > 1:
        tf.summary.scalar(
            "training/queue",
            math_ops.cast(data_feeder.input_queue.size(), dtypes.float32) *
            (1. / data_feeder.queue_capacity),
            collections=["training_status"])

    # Save configuration
    config['loss_weights']['kld_loss'] = kld_loss_weight_backup
    try:
        # Pickle and json dump.
        pickle.dump(
            config, open(os.path.join(config['model_dir'], 'config.pkl'),
                         'wb'))
        json.dump(config,
                  open(os.path.join(config['model_dir'], 'config.json'), 'w'),
                  indent=4,
                  sort_keys=True)
    except Exception:
        # Best effort only: the config may contain values that cannot be
        # pickled or JSON-serialized.
        pass

    training_summary = tf.compat.v1.summary.merge_all('training_status')
    training_run_ops = [
        model.loss_summary, training_summary, model.ops_loss, train_op
    ]
    training_run_ops_with_img_summary = [
        model.loss_summary, training_summary, model.ops_loss,
        model.ops_img_summary, train_op
    ]

    if config.get('validate_model', False):
        validation_run_ops = [valid_model.ops_loss]

    if config.get('use_staging_area', False):
        training_run_ops.append(staging_area.preload_op)
        training_run_ops_with_img_summary.append(staging_area.preload_op)
        # Fill staging area first.
        for i in range(256):
            _ = sess.run(staging_area.preload_op,
                         feed_dict={},
                         options=run_opts,
                         run_metadata=run_opts_metadata)

        if config.get('validate_model', False):
            validation_run_ops.append(valid_staging_area.preload_op)
            # Fill staging area first.
            for i in range(256):
                _ = sess.run(valid_staging_area.preload_op,
                             feed_dict={},
                             options=run_opts,
                             run_metadata=run_opts_metadata)

    for epoch in range(start_epoch, config['num_epochs'] + 1):
        for epoch_step in range(num_training_iterations):
            start_time = time.perf_counter()
            step = tf.train.global_step(sess, global_step)

            if (step % config['checkpoint_every_step']) == 0:
                ckpt_save_path = saver.save(
                    sess, os.path.join(config['model_dir'], 'model'),
                    global_step)
                print("Model saved in file: %s" % ckpt_save_path)

            if config['img_summary_every_step'] > 0 and step % config[
                    'img_summary_every_step'] == 0:
                run_training_output = sess.run(
                    training_run_ops_with_img_summary,
                    feed_dict={},
                    options=run_opts,
                    run_metadata=run_opts_metadata)

                img_summary = model.get_image_summary(
                    sess,
                    ops_img_summary_evaluated=run_training_output[3],
                    seq_len=500)
                summary_writer.add_summary(img_summary, step)
            else:
                run_training_output = sess.run(training_run_ops,
                                               feed_dict={},
                                               options=run_opts,
                                               run_metadata=run_opts_metadata)

            summary_writer.add_summary(run_training_output[0],
                                       step)  # Loss summary
            summary_writer.add_summary(run_training_output[1],
                                       step)  # Training status summary.

            if step % config['print_every_step'] == 0:
                time_elapsed = (time.perf_counter() -
                                start_time) / config['print_every_step']
                model.log_loss(run_training_output[2], step, epoch,
                               time_elapsed)

            if config['img_summary_every_step'] > 0 and step % config[
                    'img_summary_every_step'] == 0:
                sampling_img_summary = sampling_model.get_image_summary(
                    sess, ops_img_summary_evaluated=None, seq_len=500)
                summary_writer.add_summary(sampling_img_summary, step)

            if config.get('validate_model',
                          False) and step % config['validate_every_step'] == 0:
                start_time = time.perf_counter()
                for i in range(num_validation_iterations):
                    run_validation_output = sess.run(
                        validation_run_ops,
                        feed_dict={},
                        options=run_opts,
                        run_metadata=run_opts_metadata)
                    valid_model.update_validation_loss(
                        run_validation_output[0])

                valid_summary, valid_eval_loss = valid_model.get_validation_summary(
                    session=sess)
                summary_writer.add_summary(valid_summary,
                                           step)  # Validation loss summary

                time_elapsed = (time.perf_counter() -
                                start_time) / num_validation_iterations
                valid_model.log_loss(valid_eval_loss,
                                     step,
                                     epoch,
                                     time_elapsed,
                                     prefix="VALID: ")
                valid_model.reset_validation_loss()

            if config.get('create_timeline', False):
                create_tf_timeline(config['model_dir'], run_opts_metadata)

    print("End-of-Training.")
    ckpt_save_path = saver.save(sess, os.path.join(config['model_dir'],
                                                   'model'), global_step)
    print("Model saved in file: %s" % ckpt_save_path)
    print('Model is trained for %d epochs, %d steps.' %
          (config['num_epochs'], step))

    try:
        sess.run(data_feeder.input_queue.close(cancel_pending_enqueues=True))
        coord.request_stop()
        coord.join(queue_threads, stop_grace_period_secs=5)
    except Exception:
        # The input queue and threads may already have shut down.
        pass

    sess.close()
    def __init__(self):
        # Loss curve plot
        def plotloss():
            plt.figure()
            ax = plt.gca()
            y1 = R_variable['loss_test']
            y2 = R_variable['loss_train']
            plt.plot(y1, 'ro', label='Test')
            plt.plot(y2, 'g*', label='Train')
            # ax.set_xscale('log')
            ax.set_yscale('log')
            plt.legend(fontsize=18)
            plt.xlabel('Epoch', fontsize=15)
            plt.title('loss', fontsize=15)
            fntmp = '%sloss' % (self.FolderName)
            mySaveFig(plt, fntmp, ax=ax, isax=1, iseps=0)

        def plotacc():
            plt.figure()
            ax = plt.gca()
            y1 = R_variable['acc_test']
            y2 = R_variable['acc_train']
            plt.plot(y1, 'ro', label='Test')
            plt.plot(y2, 'g*', label='Train')
            # ax.set_xscale('log')
            # ax.set_yscale('log')
            plt.legend(fontsize=18)
            plt.xlabel('Epoch', fontsize=15)
            plt.title('accuracy', fontsize=15)
            fntmp = '%saccuracy' % (self.FolderName)
            mySaveFig(plt, fntmp, ax=ax, isax=1, iseps=0)

        # Save results to files
        def savefile():
            # Serialize the R_variable dict so it can be reloaded later if needed
            with open('%s/objs.pkl' % (self.FolderName),
                      'wb') as f:  # Python 3: open(..., 'wb')
                pickle.dump(R_variable, f, protocol=4)

            # Write out the R_variable entries whose size is at most 20
            text_file = open("%s/Output.txt" % (self.FolderName), "w")
            for para in R_variable:
                if np.size(R_variable[para]) > 20:
                    continue
                text_file.write('%s: %s\n' % (para, R_variable[para]))
            text_file.close()

            # Save the loss and accuracy curves to CSV files
            da = pd.DataFrame(R_variable['loss_train'])
            da.to_csv(self.FolderName + "loss_train" + ".csv",
                      header=False,
                      columns=None)
            db = pd.DataFrame(R_variable['loss_test'])
            db.to_csv(self.FolderName + "loss_test" + ".csv",
                      header=False,
                      columns=None)
            dc = pd.DataFrame(R_variable['acc_train'])
            dc.to_csv(self.FolderName + "acc_train" + ".csv",
                      header=False,
                      columns=None)
            dd = pd.DataFrame(R_variable['acc_test'])
            dd.to_csv(self.FolderName + "acc_test" + ".csv",
                      header=False,
                      columns=None)

        # Record the final (L2) loss and accuracy values; refreshed before each plot in case the run stops midway
        def gapReocord():
            R_variable['final_train_loss'] = R_variable['loss_train'][-1]
            R_variable['final_test_loss'] = R_variable['loss_test'][-1]
            R_variable['final_train_acc'] = R_variable['acc_train'][-1]
            R_variable['final_test_acc'] = R_variable['acc_test'][-1]

        # Containers for the loss and accuracy histories
        R_variable['loss_test'] = []
        R_variable['loss_train'] = []
        R_variable['acc_test'] = []
        R_variable['acc_train'] = []

        # Start the timer and create a fresh output directory
        self.t0 = time.time()
        self.FolderName = mk_newfolder()
        # Note: [None].extend(...) returns None, so build the placeholder
        # shapes with list concatenation instead.
        self.x = tf.placeholder(tf.float32,
                                [None] + list(R_variable['graph_shape']),
                                name='x')
        self.y0 = tf.placeholder(tf.float32,
                                 shape=[None] + list(R_variable['label_shape']),
                                 name='y0')

        dataset = tf.data.Dataset.from_tensor_slices((self.x, self.y0))
        dataset = dataset.shuffle(20).batch(R_variable['batch_size']).repeat()
        iterator = dataset.make_initializable_iterator()
        data_element = iterator.get_next()

        def weight_variable(shape, name=None):
            initial = tf.truncated_normal(shape, stddev=0.1)
            return tf.Variable(initial, name=name)

        def bias_variable(shape, name=None):
            initial = tf.constant(0.1, shape=shape)
            return tf.Variable(initial, name=name)

        def activation_fun(x, name=None):
            if R_variable['ActFuc'] == 'relu':
                return tf.nn.relu(x, name=name)
            if R_variable['ActFuc'] == 'tanh':
                return tf.nn.tanh(x, name=name)
            if R_variable['ActFuc'] == 'srelu':
                # Product of two ReLUs; non-zero only on (0, 1).
                return tf.multiply(tf.nn.relu(-(x - 1)), tf.nn.relu(x),
                                   name=name)
            raise ValueError("Unsupported ActFuc: %s" % R_variable['ActFuc'])

        # Fully connected layer 1 (flattened input -> 800)
        x_flat = tf.reshape(
            self.x,
            [-1, R_variable['graph_shape'][0] * R_variable['graph_shape'][1]],
            name='x_flat')
        W_fc1 = weight_variable(
            [R_variable['graph_shape'][0] * R_variable['graph_shape'][1], 800],
            name='W_fc1')
        b_fc1 = bias_variable([800], name='b_fc1')
        h_fc1 = activation_fun(tf.matmul(x_flat, W_fc1) + b_fc1, name='h_fc1')

        # Fully connected layer 2: 800 -> 800
        W_fc2 = weight_variable([800, 800], name='W_fc2')
        b_fc2 = bias_variable([800], name='b_fc2')
        h_fc2 = activation_fun(tf.matmul(h_fc1, W_fc2) + b_fc2, name='h_fc2')

        # Fully connected layer 3: 800 -> 512
        W_fc3 = weight_variable([800, 512], name='W_fc3')
        b_fc3 = bias_variable([512], name='b_fc3')
        h_fc3 = activation_fun(tf.matmul(h_fc2, W_fc3) + b_fc3, name='h_fc3')

        # Fully connected layer 4: 512 -> 64
        W_fc4 = weight_variable([512, 64], name='W_fc4')
        b_fc4 = bias_variable([64], name='b_fc4')
        h_fc4 = activation_fun(tf.matmul(h_fc3, W_fc4) + b_fc4, name='h_fc4')

        # Output layer: 64 -> 10 logits, then softmax
        W_fc5 = weight_variable([64, 10], name='W_fc5')
        b_fc5 = bias_variable([10], name='b_fc5')
        y_pre = tf.add(tf.matmul(h_fc4, W_fc5), b_fc5, name='y_pre')
        self.y = tf.nn.softmax(y_pre, name='y')

        # Loss: softmax cross-entropy computed on the pre-softmax logits y_pre
        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=y_pre,
                                                    labels=self.y0))
        # Training objective
        self.train = tf.train.AdamOptimizer(
            learning_rate=R_variable['learning_rate']).minimize(
                self.cross_entropy)
        # accuracy
        self.result = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.y0, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.result, tf.float32))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True  # when running on a server; can be ignored
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
        self.sess.run(iterator.initializer,
                      feed_dict={
                          self.x: R_variable['train_inputs'],
                          self.y0: R_variable['y_true_train']
                      })

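        # Training loop: each epoch runs batch_num mini-batches; test accuracy
        # is checked after every step against the early-stopping threshold
        # ('breakstandard'), and metrics, files and plots are refreshed at the
        # end of every epoch.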
        for e in range(R_variable['epoch']):
            for s in range(R_variable['batch_num']):
                finished_batch = e * R_variable['batch_num'] + s + 1
                # training
                x_batch, y_batch = self.sess.run(data_element)
                self.sess.run(self.train,
                              feed_dict={
                                  self.x: x_batch,
                                  self.y0: y_batch
                              })
                acc_Test, loss_Test = self.sess.run(
                    [self.accuracy, self.cross_entropy],
                    feed_dict={
                        self.x: R_variable['test_inputs'],
                        self.y0: R_variable['y_true_test']
                    })
                if acc_Test >= R_variable['breakstandard']:
                    R_variable['used batch'] = finished_batch
                    R_variable['used time'] = time.time() - self.t0
                    R_variable['flag'] = 1
                    break

                if s % 1000 == 0:
                    acc_Train, loss_Train = self.sess.run(
                        [self.accuracy, self.cross_entropy],
                        feed_dict={
                            self.x: x_batch,
                            self.y0: y_batch
                        })
                    acc_Test, loss_Test = self.sess.run(
                        [self.accuracy, self.cross_entropy],
                        feed_dict={
                            self.x: R_variable['test_inputs'],
                            self.y0: R_variable['y_true_test']
                        })
                    # R_variable['loss_train'].append(loss_Train)
                    # R_variable['loss_test'].append(loss_Test)
                    batch_needed = R_variable['epoch'] * R_variable[
                        'batch_num'] - finished_batch
                    round_time = time.time()
                    R_variable['use_time'] = round_time - self.t0
                    time_needed = (round_time -
                                   self.t0) / finished_batch * batch_needed
                    print(
                        "In epoch: %d, step: %d, Train accuracy is: %3.3f, Train loss is: %3.3f"
                        % (e + 1, s, acc_Train, loss_Train))
                    print("Test accuracy is: %3.3f, Test loss is: %3.3f" %
                          (acc_Test, loss_Test))
                    print(
                        "The program has been running for %ds; about %ds remaining"
                        % (round_time - self.t0, time_needed))
                    # savefile()
                    # gapReocord()

            acc_Train, loss_Train = self.sess.run(
                [self.accuracy, self.cross_entropy],
                feed_dict={
                    self.x: R_variable['train_inputs'],
                    self.y0: R_variable['y_true_train']
                })
            acc_Test, loss_Test = self.sess.run(
                [self.accuracy, self.cross_entropy],
                feed_dict={
                    self.x: R_variable['test_inputs'],
                    self.y0: R_variable['y_true_test']
                })
            R_variable['loss_train'].append(loss_Train)
            R_variable['loss_test'].append(loss_Test)
            R_variable['acc_train'].append(acc_Train)
            R_variable['acc_test'].append(acc_Test)
            savefile()
            gapReocord()
            plotloss()
            plotacc()
            if (R_variable['flag'] == 1):
                break

        print("Program ends. ")
        print("Train accuracy is: %3.3f, Train loss is: %3.3f" %
              (acc_Train, loss_Train))
        print("Test accuracy is: %3.3f, Test loss is: %3.3f" %
              (acc_Test, loss_Test))
        print("The program have been running for %ds." %
              (round_time - self.t0))
from get_coordinates import get_coordinates
from PIL import Image
import pandas as pd
from show_image import show_image_objects
import os
import pathlib
from kerasretinanet.keras_retinanet import models
from kerasretinanet.keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image
from kerasretinanet.keras_retinanet.utils.visualization import draw_box, draw_caption
from kerasretinanet.keras_retinanet.utils.colors import label_color
import cv2
import matplotlib.pyplot as plt
import numpy as np

# Some fixes so the model can be trained (TF1-compatible session with GPU memory growth)
import tensorflow.compat.v1 as tf1
config = tf1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf1.InteractiveSession(config=config)

# Prepare test pictures and annotations
pic_list = [
    p for p in pathlib.Path('AWEForSegmentation/testannot_rect').iterdir()
    if p.is_file()
]
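# Accumulate one row per annotation: image name, bounding-box corners and
# class name (presumably converted into a pandas DataFrame further down).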
dataset = dict()
dataset['img_name'] = list()
dataset['x_min'] = list()
dataset['y_min'] = list()
dataset['x_max'] = list()
dataset['y_max'] = list()
dataset['class_name'] = list()
Beispiel #26
def train(params):
    """Entry point for training."""
    with gfile.GFile(params.data_path, 'rb') as finp:
        x_train, x_valid, x_test, _, _ = pickle.load(finp)
        print('-' * 80)
        print('train_size: {0}'.format(np.size(x_train)))
        print('valid_size: {0}'.format(np.size(x_valid)))
        print(' test_size: {0}'.format(np.size(x_test)))

    g = tf.Graph()
    with g.as_default():
        tf.random.set_random_seed(2126)
        ops = get_ops(params, x_train, x_valid, x_test)
        run_ops = [
            ops['train_loss'],
            ops['grad_norm'],
            ops['learning_rate'],
            ops['should_reset'],
            ops['moving_avg_started'],
            ops['train_op'],
        ]

        saver = tf.train.Saver(max_to_keep=2)
        checkpoint_saver_hook = tf.train.CheckpointSaverHook(
            params.output_dir,
            save_steps=params.num_train_batches,
            saver=saver)
        hooks = [checkpoint_saver_hook]
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.train.SingularMonitoredSession(
            config=config, hooks=hooks, checkpoint_dir=params.output_dir)
        accum_loss = 0.
        accum_step = 0
        best_valid_ppl = []
        start_time = time.time()
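        # Main loop: run one training step per iteration, log every
        # params.log_every steps, evaluate on the validation set whenever the
        # input pipeline signals a reset, and roll back to the latest
        # checkpoint if an InvalidArgumentError (e.g. a numerical failure)
        # occurs.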
        while True:
            try:
                loss, gn, lr, should_reset, moving_avg_started, _ = sess.run(
                    run_ops)
                accum_loss += loss
                accum_step += 1
                step = sess.run(ops['global_step'])
                if step % params.log_every == 0:
                    epoch = step // params.num_train_batches
                    train_ppl = np.exp(accum_loss / accum_step)
                    mins_so_far = (time.time() - start_time) / 60.
                    log_string = 'epoch={0:<5d}'.format(epoch)
                    log_string += ' step={0:<7d}'.format(step)
                    log_string += ' ppl={0:<10.2f}'.format(train_ppl)
                    log_string += ' lr={0:<6.3f}'.format(lr)
                    log_string += ' |g|={0:<6.3f}'.format(gn)
                    log_string += ' avg={0:<2d}'.format(moving_avg_started)
                    log_string += ' mins={0:<.2f}'.format(mins_so_far)
                    print(log_string)

                if moving_avg_started:
                    sess.run(ops['update_moving_avg'])

                # if step % params.num_train_batches == 0:
                if should_reset:
                    sess.run(ops['reset_batch_states'])
                    accum_loss = 0
                    accum_step = 0
                    valid_ppl = ops['eval_valid'](
                        sess, use_moving_avg=moving_avg_started)
                    sess.run(
                        [ops['reset_batch_states'], ops['reset_start_idx']])
                    if (not moving_avg_started
                            and len(best_valid_ppl) > params.best_valid_ppl_threshold
                            and valid_ppl > min(
                                best_valid_ppl[:-params.best_valid_ppl_threshold])):
                        print('Starting moving_avg')
                        sess.run(ops['start_moving_avg'])
                    best_valid_ppl.append(valid_ppl)

                if step >= params.num_train_steps:
                    ops['eval_test'](sess, use_moving_avg=moving_avg_started)
                    break
            except tf.errors.InvalidArgumentError:
                last_checkpoint = tf.train.latest_checkpoint(params.output_dir)
                print('rolling back to previous checkpoint {0}'.format(
                    last_checkpoint))
                saver.restore(sess, last_checkpoint)
                accum_loss, accum_step = 0., 0
        sess.close()
Beispiel #27
def build_graph(bert_config,
                opts,
                iterations_per_step=1,
                is_training=True,
                feed_name=None):
    """Build the graph for training.

    Args:
        bert_config: configuration for the BERT model.
        opts: a dictionary containing all global options.
        iterations_per_step: number of iterations per step
        is_training (bool): if true return a graph with trainable variables.
        feed_name: name of the IPU infeed.

    Returns:
        a GraphOps containing a BERT graph and session prepared for inference or training.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():

        placeholders = dict()
        placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                       shape=[])
        learning_rate = placeholders['learning_rate']

        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])

        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out", replication_factor=opts['replicas'])

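        # Build the training step on the IPU; batches are streamed through the
        # infeed/outfeed queues so that a single session call can run several
        # device-side iterations (iterations_per_step).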
        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                bert_config,
                train_iterator,
                outfeed_queue,
                opts,
                learning_rate,
                iterations_per_step,
                is_training=is_training)

        outfeed = outfeed_queue.dequeue()

        bert_logging.print_trainable_variables(opts['logs_path'])

        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        model_and_optimiser_variables = tf.global_variables()

        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_ckpt'] else model_variables)

        # We store two savers: one for the standard training and another one for the best checkpoint
        savers = {
            "train_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='latest',
                           max_to_keep=5),
            "best_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='best',
                           max_to_keep=1)
        }

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate number of IPUs required for pretraining pipeline.
    num_embedding_ipu = {
        'two_ipus': 2,
        'same_ipu': 1,
        'same_as_hidden_layers': 0
    }[opts['embeddings_placement']]

    num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage)
    num_ipus_required = opts['replicas'] * next_power_of_two(
        num_hidden_layer_stages + num_embedding_ipu)
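    # For example, with 2 replicas, 3 hidden-layer stages and the embeddings
    # on their own IPU this requests 2 * next_power_of_two(3 + 1) = 8 IPUs
    # (assuming next_power_of_two(n) returns n when n is already a power of
    # two).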

    # Configure the IPU options.
    ipu_options = get_ipu_config(
        fp_exceptions=opts["fp_exceptions"],
        stochastic_rounding=opts['stochastic_rounding'],
        xla_recompute=opts["xla_recompute"],
        available_memory_proportion=opts['available_memory_proportion'],
        disable_graph_outlining=opts["disable_graph_outlining"],
        num_ipus_required=num_ipus_required,
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        partials_type=opts['partials_type'])
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, savers, restore, tvars)
Beispiel #28
def main(_):
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)
    np.set_printoptions(threshold=np.inf, linewidth=10000)

    flags = vars(FLAGS)
    for key in sorted(flags.keys()):
        tf.logging.info('%s = %s', key, flags[key])

    # Start a new TensorFlow session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    #config.log_device_placement = False
    sess = tf.InteractiveSession(config=config)

    label_file = os.path.join(os.path.dirname(FLAGS.start_checkpoint),
                              "vgg_labels.txt")
    fid = open(label_file)
    labels = []
    for line in fid:
        labels.append(line.rstrip())
    label_count = len(labels)
    fid.close()

    model_settings = models.prepare_model_settings(
        label_count, FLAGS.sample_rate, FLAGS.nchannels,
        FLAGS.clip_duration_ms, FLAGS.representation, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, 1, FLAGS.dct_coefficient_count,
        FLAGS.filterbank_channel_count,
        [int(x) for x in FLAGS.filter_counts.split(',')],
        [int(x)
         for x in FLAGS.filter_sizes.split(',')], FLAGS.final_filter_len,
        FLAGS.dropout_prob, FLAGS.batch_size, FLAGS.dilate_after_layer,
        FLAGS.stride_after_layer, FLAGS.connection_type)

    fingerprint_size = model_settings['fingerprint_size']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    hidden, logits = models.create_model(fingerprint_input,
                                         model_settings,
                                         FLAGS.model_architecture,
                                         is_training=False)

    tf.global_variables_initializer().run()

    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)

    total_parameters = 0
    for variable in tf.trainable_variables():
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= int(dim)
        total_parameters += variable_parameters
    tf.logging.info('number of trainable parameters: %d', total_parameters)

    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
        FLAGS.labels_touse.split(','),
        FLAGS.validation_percentage, FLAGS.validation_offset_percentage,
        FLAGS.validation_files.split(','), FLAGS.testing_percentage,
        FLAGS.testing_files.split(','), FLAGS.subsample_skip,
        FLAGS.subsample_word, FLAGS.partition_word, FLAGS.partition_n,
        FLAGS.partition_training_files.split(','),
        FLAGS.partition_validation_files.split(','), -1,
        FLAGS.testing_equalize_ratio, FLAGS.testing_max_samples,
        model_settings)

    testing_set_size = audio_processor.set_size('testing')

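    # Evaluate the test set in fixed-size batches. A short final batch is
    # padded by repeating its first sample so the graph always receives
    # FLAGS.batch_size inputs; only the `obtained` real outputs are kept.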
    for isample in range(0, testing_set_size, FLAGS.batch_size):
        fingerprints, _, samples = (audio_processor.get_data(
            FLAGS.batch_size, isample, model_settings, 0.0, 0.0,
            0.0 if FLAGS.time_shift_random else time_shift_samples,
            FLAGS.time_shift_random, 'testing', sess))
        needed = FLAGS.batch_size - fingerprints.shape[0]
        if needed > 0:
            fingerprints = np.append(fingerprints,
                                     np.repeat(fingerprints[[0], :],
                                               needed,
                                               axis=0),
                                     axis=0)
            for _ in range(needed):
                samples.append(samples[0])
        logit_vals, hidden_vals = sess.run([logits, hidden],
                                           feed_dict={
                                               fingerprint_input: fingerprints,
                                           })
        batch_size = min(FLAGS.batch_size, testing_set_size - isample)
        obtained = FLAGS.batch_size - needed
        if isample == 0:
            samples_data = [None] * testing_set_size
        samples_data[isample:isample + obtained] = samples[:obtained]
        if FLAGS.save_activations:
            if isample == 0:
                activations = []
                for ihidden in range(len(hidden_vals)):
                    nHWC = np.shape(hidden_vals[ihidden])[1:]
                    activations.append(np.empty((testing_set_size, *nHWC)))
                activations.append(
                    np.empty((testing_set_size, np.shape(logit_vals)[1])))
            for ihidden in range(len(hidden_vals)):
                activations[ihidden][isample:isample+obtained,:,:] = \
                      hidden_vals[ihidden][:obtained,:,:,:]
            activations[-1][isample:isample +
                            obtained, :] = logit_vals[:obtained, :]
        if FLAGS.save_fingerprints:
            if isample == 0:
                nW = round((FLAGS.clip_duration_ms - FLAGS.window_size_ms) / \
                           FLAGS.window_stride_ms + 1)
                nH = round(np.shape(fingerprints)[1] / nW)
                input_layer = np.empty((testing_set_size, nW, nH))
            input_layer[isample:isample+obtained,:,:] = \
                  np.reshape(fingerprints[:obtained,:],(obtained,nW,nH))
    if FLAGS.save_activations:
        np.savez(os.path.join(FLAGS.data_dir,'activations.npz'), \
                 *activations, samples=samples_data, labels=labels)
    if FLAGS.save_fingerprints:
        np.save(os.path.join(FLAGS.data_dir, 'fingerprints.npy'), input_layer)
def main():
    args = parser.parse_args()
    enc = encoder.get_encoder(args.model_name)
    hparams = model.default_hparams()
    with open(os.path.join('..//models', args.model_name,
                           'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if args.sample_length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         hparams.n_ctx)

    if args.model_name == '345M':
        args.memory_saving_gradients = True
        if args.optimizer == 'adam':
            args.only_train_transformer_layers = True

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF
    with tf.Session(config=config) as sess:
        context = tf.placeholder(tf.int32, [args.batch_size, None])
        context_in = randomize(context, hparams, args.noise)
        output = model.model(hparams=hparams, X=context_in)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=context[:, 1:], logits=output['logits'][:, :-1]))
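        # Standard language-model objective: each position predicts the next
        # token, so the labels are the context shifted left by one and the
        # logits drop their final position.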

        if args.val_every > 0:
            val_context = tf.placeholder(tf.int32, [args.val_batch_size, None])
            val_output = model.model(hparams=hparams, X=val_context)
            val_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=val_context[:, 1:],
                    logits=val_output['logits'][:, :-1]))
            val_loss_summary = tf.summary.scalar('val_loss', val_loss)

        tf_sample = sample.sample_sequence(hparams=hparams,
                                           length=args.sample_length,
                                           context=context,
                                           batch_size=args.batch_size,
                                           temperature=1.0,
                                           top_k=args.top_k,
                                           top_p=args.top_p)

        all_vars = [v for v in tf.trainable_variables() if 'model' in v.name]
        train_vars = [v for v in all_vars if '/h' in v.name
                      ] if args.only_train_transformer_layers else all_vars

        if args.optimizer == 'adam':
            opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
        elif args.optimizer == 'sgd':
            opt = tf.train.GradientDescentOptimizer(
                learning_rate=args.learning_rate)
        else:
            exit('Bad optimizer: {}'.format(args.optimizer))

        if args.accumulate_gradients > 1:
            if args.memory_saving_gradients:
                exit(
                    "Memory saving gradients are not implemented for gradient accumulation yet."
                )
            opt = AccumulatingOptimizer(opt=opt, var_list=train_vars)
            opt_reset = opt.reset()
            opt_compute = opt.compute_gradients(loss)
            opt_apply = opt.apply_gradients()
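            # apply_gradients() on the AccumulatingOptimizer is expected to
            # return the averaged accumulated loss, which is why it is logged
            # directly as the 'loss' summary below.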
            summary_loss = tf.summary.scalar('loss', opt_apply)
        else:
            if args.memory_saving_gradients:
                opt_grads = memory_saving_gradients.gradients(loss, train_vars)
            else:
                opt_grads = tf.gradients(loss, train_vars)
            opt_grads = list(zip(opt_grads, train_vars))
            opt_apply = opt.apply_gradients(opt_grads)
            summary_loss = tf.summary.scalar('loss', loss)

        summary_lr = tf.summary.scalar('learning_rate', args.learning_rate)
        summaries = tf.summary.merge([summary_lr, summary_loss])

        summary_log = tf.summary.FileWriter(
            os.path.join(CHECKPOINT_DIR, args.run_name))

        saver = tf.train.Saver(var_list=all_vars,
                               max_to_keep=5,
                               keep_checkpoint_every_n_hours=2)
        sess.run(tf.global_variables_initializer())

        if args.restore_from == 'latest':
            ckpt = tf.train.latest_checkpoint(
                os.path.join(CHECKPOINT_DIR, args.run_name))
            if ckpt is None:
                # Get fresh GPT weights if new run.
                ckpt = tf.train.latest_checkpoint(
                    os.path.join('..//models', args.model_name))
        elif args.restore_from == 'fresh':
            ckpt = tf.train.latest_checkpoint(
                os.path.join('..//models', args.model_name))
        else:
            ckpt = tf.train.latest_checkpoint(args.restore_from)
        print('Loading checkpoint', ckpt)
        saver.restore(sess, ckpt)

        print('Loading dataset...')
        chunks = load_dataset(enc,
                              args.dataset,
                              args.combine,
                              encoding=args.encoding)
        data_sampler = Sampler(chunks)
        if args.val_every > 0:
            if args.val_dataset:
                val_chunks = load_dataset(enc,
                                          args.val_dataset,
                                          args.combine,
                                          encoding=args.encoding)
            else:
                val_chunks = chunks
        print('dataset has', data_sampler.total_size, 'tokens')
        print('Training...')

        if args.val_every > 0:
            # Sample from validation set once with fixed seed to make
            # it deterministic during training as well as across runs.
            val_data_sampler = Sampler(val_chunks, seed=1)
            val_batches = [[
                val_data_sampler.sample(1024)
                for _ in range(args.val_batch_size)
            ] for _ in range(args.val_batch_count)]

        counter = 1
        counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter')
        if os.path.exists(counter_path):
            # Load the step number if we're resuming a run
            # Add 1 so we don't immediately try to save again
            with open(counter_path, 'r') as fp:
                counter = int(fp.read()) + 1

        def save():
            maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
            print(
                'Saving',
                os.path.join(CHECKPOINT_DIR, args.run_name,
                             'model-{}').format(counter))
            saver.save(sess,
                       os.path.join(CHECKPOINT_DIR, args.run_name, 'model'),
                       global_step=counter)
            with open(counter_path, 'w') as fp:
                fp.write(str(counter) + '\n')

        def generate_samples():
            print('Generating samples...')
            context_tokens = data_sampler.sample(1)
            all_text = []
            index = 0
            while index < args.sample_num:
                out = sess.run(
                    tf_sample,
                    feed_dict={context: args.batch_size * [context_tokens]})
                for i in range(min(args.sample_num - index, args.batch_size)):
                    text = enc.decode(out[i])
                    text = '======== SAMPLE {} ========\n{}\n'.format(
                        index + 1, text)
                    all_text.append(text)
                    index += 1
            print(text)
            maketree(os.path.join(SAMPLE_DIR, args.run_name))
            with open(os.path.join(SAMPLE_DIR, args.run_name,
                                   'samples-{}').format(counter),
                      'w',
                      encoding=args.encoding) as fp:
                fp.write('\n'.join(all_text))

        def validation():
            print('Calculating validation loss...')
            losses = []
            for batch in tqdm.tqdm(val_batches):
                losses.append(
                    sess.run(val_loss, feed_dict={val_context: batch}))
            v_val_loss = np.mean(losses)
            v_summary = sess.run(val_loss_summary,
                                 feed_dict={val_loss: v_val_loss})
            summary_log.add_summary(v_summary, counter)
            summary_log.flush()
            print('[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'.
                  format(counter=counter,
                         time=time.time() - start_time,
                         loss=v_val_loss))

        def sample_batch():
            return [data_sampler.sample(1024) for _ in range(args.batch_size)]

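        # Running average of the loss kept as a (weighted sum, weight) pair;
        # both components decay by 0.99 each step, giving an exponentially
        # weighted mean avg_loss[0] / avg_loss[1].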
        avg_loss = (0.0, 0.0)
        start_time = time.time()

        try:
            while True:
                if counter % args.save_every == 0:
                    save()
                if counter % args.sample_every == 0:
                    generate_samples()
                if args.val_every > 0 and (counter % args.val_every == 0
                                           or counter == 1):
                    validation()

                if args.accumulate_gradients > 1:
                    sess.run(opt_reset)
                    for _ in range(args.accumulate_gradients):
                        sess.run(opt_compute,
                                 feed_dict={context: sample_batch()})
                    (v_loss, v_summary) = sess.run((opt_apply, summaries))
                else:
                    (_, v_loss, v_summary) = sess.run(
                        (opt_apply, loss, summaries),
                        feed_dict={context: sample_batch()})

                summary_log.add_summary(v_summary, counter)

                avg_loss = (avg_loss[0] * 0.99 + v_loss,
                            avg_loss[1] * 0.99 + 1.0)

                print(
                    '[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
                    .format(counter=counter,
                            time=time.time() - start_time,
                            loss=v_loss,
                            avg=avg_loss[0] / avg_loss[1]))

                counter += 1
        except KeyboardInterrupt:
            print('interrupted')
            save()
def run_training():
    """Train."""
    with tf.Graph().as_default():
        # Input images and labels.
        features = get_features(True, FLAGS.batch_size)
        model = f_model.multi_gpu_model
        print('Input pipeline and model function ready.')
        result = model(features)

        # TODO(sasabour): merge jit scopes after jit scopes where enabled.
        merged = result['summary']
        train_step = result['train']
        # test_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/test')

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        saver = tf.train.Saver(max_to_keep=FLAGS.keep_ckpt)
        if tf.gfile.Exists(FLAGS.summary_dir + '/train'):
            ckpt = tf.train.get_checkpoint_state(FLAGS.summary_dir + '/train/')
            print(ckpt)
            if (not FLAGS.restart) and ckpt and ckpt.model_checkpoint_path:
                print('Restoring from existing checkpoint.')
                saver.restore(sess, ckpt.model_checkpoint_path)
                prev_step = int(
                    ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
            else:
                print('No usable checkpoint found; starting from scratch.')
                tf.gfile.DeleteRecursively(FLAGS.summary_dir + '/train')
                tf.gfile.MakeDirs(FLAGS.summary_dir + '/train')
                prev_step = 0
        else:
            tf.gfile.MakeDirs(FLAGS.summary_dir + '/train')
            prev_step = 0
        train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train',
                                             sess.graph)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

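        # Resume from prev_step (if a checkpoint was restored above) and write
        # a checkpoint every FLAGS.checkpoint_steps steps.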
        try:
            step = 0
            for i in range(prev_step, FLAGS.max_steps):
                step += 1
                summary, _ = sess.run([merged, train_step])
                train_writer.add_summary(summary, i)
                if (i + 1) % FLAGS.checkpoint_steps == 0:
                    saver.save(sess,
                               os.path.join(FLAGS.summary_dir + '/train',
                                            'model.ckpt'),
                               global_step=i + 1)
        except tf.errors.OutOfRangeError:
            print('Done training for %d steps.' % step)
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
        train_writer.close()
        # Wait for threads to finish.
        coord.join(threads)
        sess.close()