def run_experiment(option, use_basic_dataset):
    """Build a baseline VAE graph for `option` and train it under a Supervisor.

    Args:
        option: model selector. ``'simple_<N>'`` builds
            `build_simple_network2` with a latent size of N (the integer
            after the last underscore); ``'conv'`` builds
            `build_special_conv4_final` with a latent size of 128. Any other
            value prints 'INVALID OPTION' and exits with status 1.
        use_basic_dataset: when truthy, sequences are capped at 56 tokens and
            `BASIC_DATASET_ARGS` is forwarded to the batcher; otherwise the
            cap is 130 and no extra batcher arguments are passed.
    """
    TOKEN_EMB_SIZE = 54
    BATCH_SIZE = 128
    NUMBER_BATCHES = 1000

    sequence_cap = 56 if use_basic_dataset else 130
    X_SHAPE = (sequence_cap, TOKEN_EMB_SIZE)

    # set up pipeline
    print('Setting up data pipeline')
    huzzer_kwargs = BASIC_DATASET_ARGS if use_basic_dataset else {}
    dataset_tag = 'basic' if use_basic_dataset else 'standard'
    cache_path = 'simple_models_{}_{}_{}'.format(
        dataset_tag, NUMBER_BATCHES, BATCH_SIZE)
    datasource = one_hot_token_random_batcher(
        BATCH_SIZE,
        NUMBER_BATCHES,
        length=sequence_cap,
        cache_path=cache_path,
        huzzer_kwargs=huzzer_kwargs)

    # The queue carries uint8 batches; the networks are fed float32.
    queue = build_single_output_queue(
        datasource,
        output_shape=(BATCH_SIZE, sequence_cap, TOKEN_EMB_SIZE),
        type=tf.uint8)
    raw_input_sequences = queue.dequeue(name='encoder_input')
    input_sequences = tf.cast(raw_input_sequences, tf.float32)

    if option == 'conv':
        z_size = 128
        build_special_conv4_final(
            input_sequences,
            X_SHAPE,
            z_size,
            filter_length=3,
            num_filters=128,
        )
    elif option.startswith('simple_'):
        # The latent size is encoded as the last '_'-separated field.
        z_size = int(option.split('_')[-1])
        build_simple_network2(
            input_sequences, X_SHAPE, latent_dim=z_size, kl_limit=0.0)
    else:
        print('INVALID OPTION')
        exit(1)

    dataset_prefix = 'basic_' if use_basic_dataset else ''
    logdir = 'experiments/VAE_baseline/{}{}'.format(dataset_prefix, option)
    sv = Supervisor(logdir=logdir, save_summaries_secs=10, save_model_secs=120)

    # Get a TensorFlow session managed by the supervisor.
    with sv.managed_session() as sess:
        # Run at most 20000 steps of the op named 'train_on_batch'; leave the
        # process as soon as the supervisor asks to stop.
        for _ in range(20000):
            if sv.should_stop():
                exit()
            sess.run('train_on_batch')
def run_experiment(option, use_basic_dataset):
    """Build the 'attention1' sequence VAE and train it until the Supervisor stops.

    Args:
        option: model selector; must start with ``'attention1'``. The integer
            after the last underscore is used as the latent size. Any other
            value prints 'INVALID OPTION' and exits with status 1.
        use_basic_dataset: when truthy, sequences are capped at 56 tokens and
            `BASIC_DATASET_ARGS` is forwarded to the batcher; otherwise the
            cap is 130 and no extra batcher arguments are passed.
    """
    sequence_cap = 56 if use_basic_dataset else 130

    print('Setting up data pipeline...')
    huzzer_kwargs = BASIC_DATASET_ARGS if use_basic_dataset else {}
    dataset_tag = 'basic' if use_basic_dataset else 'standard'
    cache_path = 'attention_models_{}_{}_{}'.format(
        dataset_tag, NUMBER_BATCHES, BATCH_SIZE)
    datasource = one_hot_token_random_batcher(
        BATCH_SIZE,
        NUMBER_BATCHES,
        length=sequence_cap,
        cache_path=cache_path,
        huzzer_kwargs=huzzer_kwargs)
    queue = build_single_output_queue(
        datasource,
        output_shape=(BATCH_SIZE, sequence_cap, TOKEN_EMB_SIZE),
        type=tf.uint8)

    # The same dequeued uint8 batch is viewed as int32 (to derive lengths)
    # and as float32 (model input / reconstruction target).
    raw_input_sequences = queue.dequeue(name='input_sequence')
    sequence_lengths = get_sequence_lengths(
        tf.cast(raw_input_sequences, tf.int32))
    input_sequences = tf.cast(raw_input_sequences, tf.float32)

    print('Building model..')
    if not option.startswith('attention1'):
        print('INVALID OPTION')
        exit(1)

    z_size = int(option.split('_')[-1])
    encoder_output = build_single_program_encoder(
        input_sequences, sequence_lengths, z_size)
    z_resampled = resampling(encoder_output)
    decoder_output, _ = build_attention1_decoder(
        z_resampled, sequence_lengths, sequence_cap, TOKEN_EMB_SIZE)

    per_example_ce = ce_loss_for_sequence_batch(
        decoder_output, input_sequences, sequence_lengths, sequence_cap)
    cross_entropy_loss = tf.reduce_mean(per_example_ce)
    kl_loss = tf.reduce_mean(kl_divergence(encoder_output))

    total_loss_op = kl_loss + cross_entropy_loss
    tf.summary.scalar('cross_entropy_loss', cross_entropy_loss)
    tf.summary.scalar('kl_loss', kl_loss)
    tf.summary.scalar('total_loss', total_loss_op)

    run_name = ('basic_' if use_basic_dataset else '') + option
    logdir = os.path.join(BASEDIR, run_name)

    optimizer = tf.train.AdamOptimizer(1e-3)
    print('creating train op...')
    train_op = slim.learning.create_train_op(total_loss_op, optimizer)

    print('starting supervisor...')
    sv = Supervisor(logdir=logdir, save_model_secs=300, save_summaries_secs=60)

    print('training...')
    with sv.managed_session() as sess:
        while not sv.should_stop():
            # The fetched loss value is not used; the step is run for its
            # training side effect.
            sess.run([total_loss_op, train_op])
def run_experiment(option):
    """Build one of the early VAE baselines and train it under a Supervisor.

    Args:
        option: one of 'simple', 'conv1', 'conv2', 'conv3' or 'conv4'. Any
            other value prints 'INVALID OPTION' and exits with status 1.
    """
    BATCH_SIZE = 128
    X_SHAPE = (128, 54)

    # set up pipeline
    print('Setting up data pipeline')
    data_pipeline = one_hot_token_pipeline(for_cnn=False, length=128)

    # Function to pass into queue. Each call consumes the next BATCH_SIZE
    # consecutive integer seeds (as strings) from the pipeline.
    batch_index = 0

    def get_batch():
        nonlocal batch_index
        first_seed = batch_index * BATCH_SIZE
        if batch_index % 100 == 0:
            logging.info('{} examples used'.format(first_seed))
        code_seeds = [
            str(seed) for seed in range(first_seed, first_seed + BATCH_SIZE)
        ]
        batch = np.array(data_pipeline[code_seeds])
        batch_index += 1
        return batch

    # use the queue for training
    queue = build_single_output_queue(get_batch, (BATCH_SIZE,) + X_SHAPE)
    x = queue.dequeue(name='encoder_input')

    # Builders are wrapped in lambdas so only the selected one is looked up
    # and called.
    conv_builders = {
        'conv1': lambda: build_conv1(x, (128, 54)),
        'conv2': lambda: build_conv2(x, (128, 54), 32),
        'conv3': lambda: build_conv3(x, (128, 54), 64),
        'conv4': lambda: build_conv4(x, (128, 54), 64),
    }
    if option == 'simple':
        print('this no longer works: - a small refactor would work')
        tensor_names = build_simple_network(x, BATCH_SIZE, (256, 54))
    elif option in conv_builders:
        tensor_names = conv_builders[option]()
    else:
        print('INVALID OPTION')
        exit(1)

    logdir = 'experiments/VAE_baseline/{}'.format(option)
    sv = Supervisor(logdir=logdir, save_summaries_secs=20, save_model_secs=120)

    # Get a TensorFlow session managed by the supervisor.
    with sv.managed_session() as sess:
        # Keep running the op named 'train_on_batch' until asked to stop.
        while not sv.should_stop():
            sess.run('train_on_batch')
def run_experiment(option):
    """Train the MNIST GAN baseline under a Supervisor.

    Args:
        option: must be 'mnist_digits'. Any other value prints
            'INVALID OPTION' and exits with status 1.

    The generator op is run on every step. The discriminator op is run only
    while its last fetched value exceeds the generator's, or unconditionally
    once more than 6000 steps have been taken.
    """
    BATCH_SIZE = 32

    if option == 'mnist_digits':
        gen = mnist_unlabeled_generator(BATCH_SIZE, for_cnn=True)
        # One batch is drawn up front only to discover the batch shape.
        batch_shape = gen()[0].shape
        print('batch shape is : {}'.format(batch_shape))

        def get_batch():
            return gen()[0]

        queue = build_single_output_queue(get_batch, batch_shape)
        x = queue.dequeue(name='real_input')
        training_ops = build_mnist_gan_for_training(x)
    else:
        print('INVALID OPTION')
        exit(1)

    logdir = 'experiments/GAN_baseline/{}'.format(option)
    sv = Supervisor(logdir=logdir, save_summaries_secs=20, save_model_secs=120)

    # Get a TensorFlow session managed by the supervisor.
    with sv.managed_session() as sess:
        # Seed values chosen so that d_loss > g_loss holds on the first step,
        # i.e. the discriminator is trained first.
        d_loss = 5
        g_loss = 4
        i = 0
        while not sv.should_stop():
            if d_loss > g_loss or i > 6000:
                # presumably this fetch yields the discriminator loss -- it is
                # compared against the generator fetch below; TODO confirm.
                d_loss = sess.run(training_ops['train_discriminator'])
            g_loss = sess.run(training_ops['train_generator'])
            # Advance the step counter so the `i > 6000` clause above can
            # actually take effect (it was previously never incremented).
            i += 1
def run_experiment(option):
    """Build the VAE variant named by `option` and train it under a Supervisor.

    Args:
        option: name of the network variant; see the dispatch table in the
            body for the accepted names and the builder call each one maps
            to. Any other value prints 'INVALID OPTION' and exits with
            status 1.
    """
    BATCH_SIZE = 128
    X_SHAPE = (128, 54)

    # set up pipeline
    print('Setting up data pipeline')
    NUMBER_BATCHES = 500
    cache_path = 'one_hot_token_haskell_batch{}_number{}'.format(
        BATCH_SIZE, NUMBER_BATCHES)
    dataset = one_hot_token_dataset(
        BATCH_SIZE,
        NUMBER_BATCHES,
        length=128,
        cache_path=cache_path,
    )

    def get_batch():
        # Only the first element of what the dataset returns is queued.
        first, *_ = dataset()[:1]
        return first

    # use the queue for training
    queue = build_single_output_queue(get_batch, (BATCH_SIZE,) + X_SHAPE)
    x = queue.dequeue(name='encoder_input')

    # Each entry is a thunk so that only the selected builder is looked up
    # and called.
    builders = {
        'simple':
            lambda: build_simple_network2(x, X_SHAPE, 32),
        'simple_double_latent':
            lambda: build_simple_network2(x, X_SHAPE, 64),
        'simple_256':
            lambda: build_simple_network2(x, X_SHAPE, 256),
        'simple_1024':
            lambda: build_simple_network2(x, X_SHAPE, 1024),
        'simple_8192':
            lambda: build_simple_network2(x, X_SHAPE, 8192),
        'conv_special':
            lambda: build_special_conv(x, X_SHAPE, 64),
        'conv_special_low_kl':
            lambda: build_special_conv_low_kl(x, X_SHAPE, 64),
        'conv_special2':
            lambda: build_special_conv2(x, X_SHAPE, 64),
        'conv_special2_l1':
            lambda: build_special_conv2_l1(x, X_SHAPE, 64),
        'conv_special2_l1_128':
            lambda: build_special_conv2_l1(x, X_SHAPE, 128),
        # conv3 is conv2 but with initial filter length of 5 instead of 1
        'conv_special3_l1_128':
            lambda: build_special_conv2_l1(x, X_SHAPE, 128, filter_length=5),
        'conv_special3_l1_256':
            lambda: build_special_conv2_l1(x, X_SHAPE, 256, filter_length=5),
        'conv_special3_l1_128f_256':
            lambda: build_special_conv2_l1(
                x, X_SHAPE, 256, filter_length=5, num_filters=128),
        'conv_special3_big_l1_512':
            lambda: build_special_conv2_l1(x, X_SHAPE, 512, filter_length=10),
        'conv_special4_l1_1024':
            lambda: build_special_conv4_l1(
                x, X_SHAPE, 1024, filter_length=3, num_filters=256),
        # NOTE(review): the option name says 2048 but 1024 is passed --
        # confirm which one is intended.
        'conv_special4_l1_2048_f5':
            lambda: build_special_conv4_l1(
                x, X_SHAPE, 1024, filter_length=5, num_filters=256),
    }

    build = builders.get(option)
    if build is None:
        print('INVALID OPTION')
        exit(1)
    tensor_names = build()

    logdir = 'experiments/VAE_baseline/{}_sss'.format(option)
    sv = Supervisor(
        logdir=logdir,
        save_summaries_secs=20,
        save_model_secs=120,
    )

    # Get a TensorFlow session managed by the supervisor.
    with sv.managed_session() as sess:
        # Keep running the op named 'train_on_batch' until asked to stop.
        while not sv.should_stop():
            sess.run('train_on_batch')
def run(hps):
    """Train a CVAE1 model on the images returned by `images(hps)`.

    Args:
        hps: hyper-parameter object. `batch_size` is read from it;
            `image_size` and `num_gpus` are overwritten here before the
            model is built.

    Training runs until the Supervisor requests a stop or the fetched
    bits-per-dim value becomes NaN. Summaries are written manually every 20
    local steps, and a checkpoint is saved every 3000 local steps (including
    local step 0).
    """
    train_images, _ = images(hps)
    hps.image_size = validate_and_get_image_size(train_images)

    # To avoid error due to GraphDef being over 2GB
    # (https://www.tensorflow.org/guide/datasets#consuming_numpy_arrays):
    images_placeholder = tf.placeholder(train_images.dtype, train_images.shape)
    dataset = tf.data.Dataset.from_tensor_slices(images_placeholder)
    dataset = dataset.shuffle(10000, reshuffle_each_iteration=True)
    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size=hps.batch_size)
    iterator = dataset.make_initializable_iterator()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        x = tf.reshape(iterator.get_next(), (-1, 3, *hps.image_size))

        # Data-dependent initialization causes freeze during cycle detection (TF bug?):
        # hps.num_gpus = 1
        # init_x = x[:hps.batch_size, :, :, :]
        # init_model = CVAE1(hps, "init", init_x)
        # vs.reuse_variables()

        hps.num_gpus = FLAGS.num_gpus
        model = CVAE1(hps, "train", x)

    saver = tf.train.Saver(max_to_keep=2)

    # Report the total number of scalar parameters across trainable variables.
    total_size = 0
    for variable in tf.trainable_variables():
        dims = [int(dim) for dim in variable.get_shape()]
        total_size += np.prod(dims)
    print("Num trainable variables: %d" % total_size)

    init_op = tf.global_variables_initializer()

    def init_fn(ses):
        print("Initializing parameters.")
        # The dataset iterator is placeholder-backed, so it must be
        # initialized with the actual image array.
        ses.run(iterator.initializer,
                feed_dict={images_placeholder: train_images})
        # XXX(rafal): TensorFlow bug?? Default initializer should handle things well..
        # ses.run(init_model.h_top.initializer)
        ses.run(init_op)
        print("Initialized!")

    run_logdir = FLAGS.logdir + "/train/{}_{}".format(
        strftime('%Y%m%d-%H%M%S'), FLAGS.hpconfig)
    sv = Supervisor(
        is_chief=True,
        logdir=run_logdir,
        summary_op=None,  # Automatic summaries don't work with placeholders.
        saver=saver,
        global_step=model.global_step,
        save_summaries_secs=120,
        save_model_secs=0,
        init_op=None,
        init_fn=init_fn)

    print("starting training")
    local_step = 0
    begin = time.time()

    config = tf.ConfigProto(allow_soft_placement=True)
    with sv.managed_session(config=config) as sess:
        print("Running first iteration!")
        while not sv.should_stop():
            fetches = {
                "bits_per_dim": model.bits_per_dim,
                "global_step": model.global_step,
                "dec_log_stdv": model.dec_log_stdv,
                "train_op": model.train_op,
            }
            should_compute_summary = (local_step % 20 == 19)
            if should_compute_summary:
                fetches["summary"] = model.summary_op

            fetched = sess.run(fetches)

            if should_compute_summary:
                sv.summary_computed(sess, fetched["summary"])

            if local_step < 10 or should_compute_summary:
                print(
                    "Iteration %d, time = %.2fs, train bits_per_dim = %.4f, dec_log_stdv = %.4f"
                    % (fetched["global_step"], time.time() - begin,
                       fetched["bits_per_dim"], fetched["dec_log_stdv"]))
                # The timer restarts after each report, so the printed time
                # covers the interval since the previous report.
                begin = time.time()

            if np.isnan(fetched["bits_per_dim"]):
                print("NAN detected!")
                break

            if local_step % 3000 == 0:
                saver.save(sess, sv.save_path, global_step=sv.global_step,
                           write_meta_graph=False)

            local_step += 1
        sv.stop()
def run_experiment(option, use_basic_dataset):
    """Build and train the attention-based sequence GAN under a Supervisor.

    The discriminator is an encoder/decoder pair that reconstructs both
    generated and real sequences; the losses are the reconstruction
    cross-entropies, balanced through the non-trainable scalar `k_t`.

    Args:
        option: model selector; must start with
            ``'attention1_gan_no_pretrain'``. The integer after the last
            underscore is used as the latent size. Any other value prints
            'INVALID OPTION' and exits with status 1.
        use_basic_dataset: when truthy, sequences are capped at 56 tokens and
            `BASIC_DATASET_ARGS` is forwarded to the batcher; otherwise the
            cap is 130 and no extra batcher arguments are passed.

    Raises:
        AssertionError: if ``BASEDIR/pretrained_weights`` is not a directory.
    """
    assert os.path.isdir(os.path.join(BASEDIR, 'pretrained_weights')), \
        'weights files are missing'

    sequence_cap = 56 if use_basic_dataset else 130

    print('Setting up data pipeline...')
    huzzer_kwargs = BASIC_DATASET_ARGS if use_basic_dataset else {}
    datasource = one_hot_token_random_batcher(
        BATCH_SIZE,
        NUMBER_BATCHES,
        length=sequence_cap,
        cache_path='attention_models_{}_{}_{}'.format(
            'basic' if use_basic_dataset else 'standard',
            NUMBER_BATCHES,
            BATCH_SIZE
        ),
        huzzer_kwargs=huzzer_kwargs
    )
    queue = build_single_output_queue(
        datasource,
        output_shape=(BATCH_SIZE, sequence_cap, TOKEN_EMB_SIZE),
        type=tf.uint8
    )
    raw_input_sequences = queue.dequeue(name='input_sequence')
    real_sequence_lengths = get_sequence_lengths(
        tf.cast(raw_input_sequences, tf.int32)
    )
    real_input_sequences = tf.cast(raw_input_sequences, tf.float32)

    print('Building model..')
    if option.startswith('attention1_gan_no_pretrain'):
        z_size = int(option.split('_')[-1])
        random_vector = tf.random_normal(
            dtype=tf.float32,
            shape=[BATCH_SIZE, z_size],
            mean=0,
            stddev=0.1  # because that is what we will use when generating
        )
        # we do not know the length of the generated code beforehand, so we
        # pass in sequence lengths of `sequence_cap`
        full_lengths = tf.constant(
            [sequence_cap] * BATCH_SIZE,
            dtype=tf.float32,
            name='generator_lengths'
        )

        # create the scaling const. k_t
        k_t = tf.Variable(0., trainable=False, name='k_t')

        # The generator and the discriminator each live in their own
        # variable scope.
        with tf.variable_scope('generator'):
            unnormalized_generated_programs, _ = build_attention1_decoder(
                random_vector, full_lengths, sequence_cap, TOKEN_EMB_SIZE
            )
            generated_programs = tf.nn.softmax(
                unnormalized_generated_programs, dim=-1, name='generated_programs'
            )
            generated_lengths = get_sequence_lengths(generated_programs, epsilon=0.01)

        with tf.variable_scope('discriminator'):
            # Generated and real batches are concatenated (generated first)
            # and pushed through the same encoder/decoder in one pass.
            sequence_lengths = tf.concat(
                [generated_lengths, real_sequence_lengths], axis=0)
            encoder_output = build_single_program_encoder(
                tf.concat([generated_programs, real_input_sequences], axis=0),
                sequence_lengths,
                z_size
            )
            # get the values corresponding to mus from the encoder output
            assert encoder_output.get_shape()[1].value == 2 * z_size
            encoded_v = encoder_output[:, :z_size]

            reconstructed, _ = build_attention1_decoder(
                encoded_v, sequence_lengths, sequence_cap, TOKEN_EMB_SIZE
            )
            # these are the unnormalized_token_probs for g and d
            generated_reconstructed = reconstructed[:BATCH_SIZE]
            real_reconstructed = reconstructed[BATCH_SIZE:]

        generator_loss = tf.reduce_mean(
            ce_loss_for_sequence_batch(
                unnormalized_token_probs=generated_reconstructed,
                input_sequences=generated_programs,
                sequence_lengths=generated_lengths,
                max_length=sequence_cap
            )
        )
        # The real batch is scored against its own lengths. This previously
        # passed `generated_lengths`, pairing the real sequences with the
        # lengths of the generated ones.
        real_loss = tf.reduce_mean(
            ce_loss_for_sequence_batch(
                unnormalized_token_probs=real_reconstructed,
                input_sequences=real_input_sequences,
                sequence_lengths=real_sequence_lengths,
                max_length=sequence_cap
            )
        )

        discriminator_loss = real_loss - (k_t * generator_loss)

        # NOTE(review): both train ops are created without
        # `variables_to_train`; confirm whether each op is meant to update
        # only the variables of its own scope.
        optimizer = tf.train.AdamOptimizer(1e-5)
        print('creating discriminator train op...')
        d_train_op = slim.learning.create_train_op(discriminator_loss, optimizer)

        optimizer = tf.train.AdamOptimizer(1e-5)
        print('creating generator train op...')
        g_train_op = slim.learning.create_train_op(generator_loss, optimizer)

        balance = GAMMA * real_loss - generator_loss
        measure = real_loss + tf.abs(balance)

        # update k_t only after both train ops have run
        with tf.control_dependencies([d_train_op, g_train_op]):
            k_update = tf.assign(
                k_t, tf.clip_by_value(k_t + LAMBDA * balance, 0, 1))

        # example_summary_op = tf.summary.merge([
        #     tf.summary.image("G", tf.expand_dims(generated_programs, -1)),
        #     tf.summary.image("AE_G", tf.expand_dims(
        #         tf.nn.softmax(generated_reconstructed, dim=-1), axis=-1
        #     )),
        #     tf.summary.image("AE_x", tf.expand_dims(
        #         tf.nn.softmax(real_reconstructed, dim=-1), axis=-1
        #     ))
        # ])

        perf_summary_op = tf.summary.merge([
            tf.summary.scalar("loss/discriminator_loss", discriminator_loss),
            tf.summary.scalar("loss/real_loss", real_loss),
            tf.summary.scalar("loss/generator_loss", generator_loss),
            tf.summary.scalar("misc/measure", measure),
            tf.summary.scalar("misc/k_t", k_t),
            tf.summary.scalar("misc/balance", balance),
        ])
    else:
        print('INVALID OPTION')
        exit(1)

    logdir = os.path.join(
        BASEDIR, ('basic_' if use_basic_dataset else '') + option + '_gan')

    # build the model and initialise weights so supervisor can start where we left off
    # if not os.path.isdir(logdir):
    #     mkdir_p(logdir)
    #     with tf.Session() as sess:
    #         print('saving initial pretrained weights')
    #         with tf.variable_scope('', reuse=True):
    #             discriminator_vars = [
    #                 tf.get_variable('discriminator/decoder_fully_connected/bias'),
    #                 tf.get_variable('discriminator/decoder_fully_connected/weights'),
    #                 tf.get_variable('discriminator/decoder_rnn/lstm_cell/biases'),
    #                 tf.get_variable('discriminator/decoder_rnn/lstm_cell/weights'),
    #                 tf.get_variable('discriminator/rnn/lstm_cell/biases'),
    #                 tf.get_variable('discriminator/rnn/lstm_cell/weights'),
    #                 tf.get_variable('discriminator/simple_attention/bias'),
    #                 tf.get_variable('discriminator/simple_attention/weights'),
    #             ]
    #             generator_vars = [
    #                 tf.get_variable('generator/decoder_fully_connected/bias'),
    #                 tf.get_variable('generator/decoder_fully_connected/weights'),
    #                 tf.get_variable('generator/decoder_rnn/lstm_cell/biases'),
    #                 tf.get_variable('generator/decoder_rnn/lstm_cell/weights'),
    #                 tf.get_variable('generator/simple_attention/bias'),
    #                 tf.get_variable('generator/simple_attention/weights'),
    #             ]
    #
    #         discriminator_saver = tf.train.Saver(
    #             discriminator_vars
    #         )
    #         generator_saver = tf.train.Saver(
    #             generator_vars
    #         )
    #         sess.run(tf.global_variables_initializer())
    #         discriminator_saver.restore(
    #             sess,
    #             os.path.join(BASEDIR, 'pretrained_weights', 'discriminator_weights.cpkt')
    #         )
    #         generator_saver.restore(
    #             sess,
    #             os.path.join(BASEDIR, 'pretrained_weights', 'generator_weights.cpkt')
    #         )
    #
    #         saver = tf.train.Saver()
    #         saver.save(sess, os.path.join(logdir, 'model.cpkt-0'))

    print('starting supervisor...')
    sv = Supervisor(
        logdir=logdir,
        save_model_secs=300,
        save_summaries_secs=60,
        summary_op=perf_summary_op
    )

    print('training...')
    with sv.managed_session() as sess:
        global_step = -1
        while not sv.should_stop():
            ops = {
                'k_update': k_update,
                'measure': measure,
                'd_train_op': d_train_op,
                'g_train_op': g_train_op,
                'global_step': sv.global_step
            }
            # if global_step % 200 == 0:
            #     ops.update({'images': example_summary_op})
            results = sess.run(ops)
def __run(build_model):
    """Build the train/validation graphs and run training or validation only.

    All configuration is read from `gflags.cfg`. Several attributes are also
    written back onto it: `global_step`, `Optimizer` (rebound from the
    optimizer class to an instance), `sv` and `sess`.

    Args:
        build_model: model-building callable, forwarded unchanged to
            `build_graph` for the training graph and for each validation
            graph.

    Returns:
        The result of `main_loop` when `cfg.do_validation_only` is falsy.
        In validation-only mode nothing is returned explicitly.

    Raises:
        NotImplementedError: if `cfg.lr_decay` is not one of None, 'exp',
            'piecewise', 'polynomial', 'natural_exp' or 'inverse_time'.
    """
    cfg = gflags.cfg

    # ============ Class balance
    # assert class_balance in [None, 'median_freq_cost', 'rare_freq_cost'], (
    #     'The balance class method is not implemented')

    # if class_balance in ['median_freq_cost', 'rare_freq_cost']:
    #     if not hasattr(Dataset, 'class_freqs'):
    #         raise RuntimeError('class_freqs is missing for dataset '
    #                            '{}'.format(Dataset.name))
    #     freqs = Dataset.class_freqs
    #     if class_balance == 'median_freq_cost':
    #         w_freq = np.median(freqs) / freqs
    #     elif class_balance == 'rare_freq_cost':
    #         w_freq = 1 / (cfg.nclasses * freqs)
    #     tf.logging.info("Class balance weights", w_freq)
    #     cfg.class_balance = w_freq

    # ============ Train/validation
    # Load data
    # init_epoch = 0
    # prev_history = None
    # best_loss = np.Inf
    # best_val = np.Inf if early_stop_strategy == 'min' else -np.Inf
    # val_metrics_ext = ['val_' + m for m in val_metrics]
    # history_path = tmp_path + save_name + '.npy'
    # if cfg.reload_weights:
    #     # Reload weights
    #     pass

    # BUILD GRAPH
    tf_config = tf.ConfigProto(allow_soft_placement=True)

    tf.logging.info("Building the model ...")
    # with graph:
    with tf.Graph().as_default() as graph:
        cfg.global_step = tf.Variable(0, trainable=False, name='global_step',
                                      dtype='int32')

        # Create a list of input placeholders for each GPU.
        # When the batchsize is not big enough to fill all of them we
        # would want to use a subset of the placeholders, but TF raises
        # a 'negative shape error' if a placeholder is not fed. Instead,
        # we provide all of them with values but we use num_splits to
        # select which of the inputs to process (and perform gradient
        # descent on) and which to ignore.
        # At runtime, we replicate the input data to feed all the
        # placeholders (even if it's internally ignored). We could use
        # placeholder_with_default to assign a value to its input but
        # the batch_size might change dynamically, so we rather
        # replicate the input at runtime.
        inputs_per_gpu = []
        val_inputs_per_gpu = []
        labels_per_gpu = []
        num_splits = tf.placeholder(np.int32, shape=None, name='num_splits')
        num_batches = tf.placeholder(np.int32, shape=None, name='num_batches')
        for i in range(cfg.num_splits):
            inputs_per_gpu.append(tf.placeholder(
                dtype=cfg._FLOATX,
                shape=cfg.input_shape,
                name='inputs_per_gpu_%i' % i))
            val_inputs_per_gpu.append(tf.placeholder(
                dtype=cfg._FLOATX,
                shape=cfg.val_input_shape,
                name='val_inputs_per_gpu_%i' % i))
            labels_per_gpu.append(tf.placeholder(
                dtype=np.int32,
                shape=[None],  # flattened
                name='labels_per_gpu_%i' % i))
        prev_err = tf.placeholder(shape=(), dtype=cfg._FLOATX, name='prev_err')
        placeholders = [inputs_per_gpu, labels_per_gpu, num_splits,
                        num_batches, prev_err]
        # Validation shares the label/num_splits/num_batches placeholders
        # with training but has its own input placeholders.
        val_placeholders = [val_inputs_per_gpu, labels_per_gpu, num_splits,
                            num_batches]

        # Learning rate schedule
        if cfg.lr_decay is None:
            lr = cfg.lr
        elif cfg.lr_decay == 'exp':
            lr = tf.train.exponential_decay(cfg.lr,
                                            cfg.global_step,
                                            cfg.decay_steps,
                                            cfg.decay_rate,
                                            staircase=cfg.staircase)
        elif cfg.lr_decay == 'piecewise':
            lr = tf.train.piecewise_constant(cfg.global_step,
                                             cfg.lr_boundaries,
                                             cfg.lr_values)
        elif cfg.lr_decay == 'polynomial':
            # NOTE(review): `cycle` is fed from cfg.staircase -- confirm this
            # reuse of the flag is intended.
            lr = tf.train.polynomial_decay(cfg.lr,
                                           cfg.global_step,
                                           cfg.decay_steps,
                                           end_learning_rate=cfg.end_lr,
                                           power=cfg.power,
                                           cycle=cfg.staircase)
        elif cfg.lr_decay == 'natural_exp':
            lr = tf.train.natural_exp_decay(cfg.lr,
                                            cfg.global_step,
                                            cfg.decay_steps,
                                            cfg.decay_rate,
                                            staircase=cfg.staircase)
        elif cfg.lr_decay == 'inverse_time':
            lr = tf.train.inverse_time_decay(cfg.lr,
                                             cfg.global_step,
                                             cfg.decay_steps,
                                             cfg.decay_rate,
                                             staircase=cfg.staircase)
        else:
            raise NotImplementedError()

        # cfg.Optimizer is rebound from the optimizer class to an instance.
        cfg.Optimizer = cfg.Optimizer(learning_rate=lr,
                                      **cfg.optimizer_params)

        # Model compilation
        # -----------------
        # Model parameters on the FIRST device specified in cfg.devices
        # Gradient Average and the rest of the operations are on CPU
        with tf.device('/cpu:0'):
            # Build the training graph
            train_outs, train_summary_ops, _ = build_graph(
                placeholders,
                cfg.input_shape,
                build_model,
                'train')

            # Build the validation graphs (reusing variables)
            val_outs = {}
            val_summary_ops = {}
            val_reset_cm_ops = {}
            for s in ['eval_' + v for v in cfg.val_on_sets]:
                ret = build_graph(
                    val_placeholders,
                    cfg.val_input_shape,
                    build_model,
                    s)
                val_outs[s], val_summary_ops[s], val_reset_cm_ops[s] = ret

        # Add the hyperparameters summaries
        if cfg.hyperparams_summaries is not None:
            sum_text = []
            # `.items()` is used instead of the Python-2-only `.iteritems()`
            # so this works on both Python 2 and Python 3.
            for (key_header,
                 list_value) in cfg.hyperparams_summaries.items():
                header_list = []
                text_list = []
                for v in list_value:
                    header_list.append('**' + v + '**')
                    text_list.append(str(getattr(cfg, v)))
                header_tensor = tf.constant(header_list)
                text_tensor = tf.constant(text_list)

                # Headers and values are stacked into a 2-row text table.
                sum_text.append(tf.summary.text(
                    key_header,
                    tf.reshape(
                        tf.concat([header_tensor, text_tensor], axis=0),
                        [2, -1])))
            sum_text_op = tf.summary.merge(sum_text)

        # Group global and local init into one op. Could be split into
        # two different ops and passed to `init_op` and `local_init_op`
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        saver = tf.train.Saver(max_to_keep=cfg.checkpoints_to_keep)

        # Start the session
        # ------------------
        sv = Supervisor(
            graph=graph,
            init_op=init_op,
            summary_op=None,
            global_step=cfg.global_step,
            # TODO add option to restore best rather than last?
            logdir=cfg.checkpoints_dir,
            checkpoint_basename=cfg.model_name,
            saver=saver,
            # session_manager
            # summary_writer
            save_model_secs=300)
        cfg.sv = sv

        with sv.managed_session(cfg.supervisor_master, tf_config) as sess:
            cfg.sess = sess
            if cfg.debug:
                from tensorflow.python import debug as tf_debug
                # Only the local name `sess` is rebound to the debug
                # wrapper; cfg.sess keeps the session assigned above.
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
                sess.add_tensor_filter("has_inf_or_nan",
                                       tf_debug.has_inf_or_nan)

            if cfg.hyperparams_summaries is not None:
                # write Hyper parameters text summaries
                summary_str = cfg.sess.run(sum_text_op)
                sv.summary_computed(cfg.sess, summary_str)

            if not cfg.do_validation_only:
                # Start training loop
                main_loop_kwargs = {'placeholders': placeholders,
                                    'val_placeholders': val_placeholders,
                                    'train_outs': train_outs,
                                    'train_summary_ops': train_summary_ops,
                                    'val_outs': val_outs,
                                    'val_summary_ops': val_summary_ops,
                                    'val_reset_cm_ops': val_reset_cm_ops,
                                    'loss_fn': cfg.loss_fn,
                                    'Dataset': cfg.Dataset,
                                    'dataset_params': cfg.dataset_params,
                                    'valid_params': cfg.valid_params,
                                    'sv': sv,
                                    'saver': saver}
                return main_loop(**main_loop_kwargs)
            else:
                # Perform validation only
                mean_iou = {}
                for s in cfg.val_on_sets:
                    from validate import validate
                    mean_iou[s] = validate(
                        val_placeholders,
                        val_outs['eval_' + s],
                        val_summary_ops['eval_' + s],
                        val_reset_cm_ops['eval_' + s],
                        which_set='eval_' + s)
def run_experiment(option, use_basic_dataset):
    """Build a token-level recurrent VAE and train it with early stopping.

    Args:
        option: model selector.
            ``'single_layer_gru_blind_<N>'`` builds the network with N GRUs
            and a look-behind of 0.
            ``'single_layer_gru_look_behind_<L>_<N>'`` builds it with N GRUs
            and a look-behind of L.
            Any other value prints 'INVALID OPTION' and exits with status 1.
        use_basic_dataset: when truthy, `BASIC_DATASET_ARGS` is forwarded to
            the dataset and the log directory gets a 'basic_' prefix.

    Training exits the process once the total loss has failed to improve for
    `NUM_STEPS_TO_STOP_IF_NO_IMPROVEMENT` consecutive steps.
    """
    BATCH_SIZE = 128
    NUMBER_BATCHES = 1000

    print('Building model..')
    if option.startswith('single_layer_gru_blind_'):
        look_behind = 0
        num_grus = int(option.split('_')[-1])
        network_block = build_token_level_RVAE(
            num_grus, TOKEN_EMB_SIZE, look_behind_length=0)
        train_block = build_train_graph_for_RVAE(network_block)
    elif option.startswith('single_layer_gru_look_behind_'):
        # The last two '_'-separated fields are <look_behind>_<num_grus>.
        num_grus = int(option.split('_')[-1])
        look_behind = int(option.split('_')[-2])
        network_block = build_token_level_RVAE(
            num_grus, TOKEN_EMB_SIZE, look_behind)
        train_block = build_train_graph_for_RVAE(network_block, look_behind)
    else:
        print('INVALID OPTION')
        exit(1)

    print('Setting up data pipeline...')
    huzzer_kwargs = BASIC_DATASET_ARGS if use_basic_dataset else {}
    # the generator for fold needs one example at a time, so the dataset is
    # built with batch_size=1 and BATCH_SIZE * NUMBER_BATCHES batches.
    dataset = one_hot_variable_length_token_dataset(
        batch_size=1,
        number_of_batches=BATCH_SIZE * NUMBER_BATCHES,
        cache_path=
        'one_hot_token_variable_length_haskell_batch{}_number{}_lookbehind{}'.
        format(1, NUMBER_BATCHES * BATCH_SIZE, look_behind),
        zero_front_pad=look_behind,
        huzzer_kwargs=huzzer_kwargs)

    # Generator that gets examples
    def get_example():
        while True:
            # Drop the leading size-1 batch axis of each fetched example.
            yield np.squeeze(dataset()[0], axis=0)

    logdir = 'experiments/Recurrent_VAE_baseline/{}{}'.format(
        'basic_' if use_basic_dataset else '', option)

    # compile and build the train op
    compiler = td.Compiler.create(train_block)
    metrics = compiler.metric_tensors
    kl_loss = tf.reduce_mean(metrics['kl_loss'])
    cross_entropy_loss = tf.reduce_mean(metrics['cross_entropy_loss'])
    total_loss_op = kl_loss + cross_entropy_loss
    tf.summary.scalar('cross_entropy_loss', cross_entropy_loss)
    tf.summary.scalar('kl_loss', kl_loss)
    tf.summary.scalar('total_loss', total_loss_op)

    optimizer = tf.train.AdamOptimizer(1e-3)
    train_op = slim.learning.create_train_op(total_loss_op, optimizer)

    # Summaries are written manually in the loop below, so the Supervisor
    # gets summary_op=None.
    summary_op = tf.summary.merge_all()
    sv = Supervisor(
        logdir=logdir,
        save_model_secs=60,
        summary_op=None,
    )

    print('training...')
    with sv.managed_session() as sess:
        batcher = compiler.build_loom_input_batched(get_example(), BATCH_SIZE)
        steps_per_summary = 10
        best_loss_so_far = 100
        num_steps_until_best = 0

        # These lookups do not depend on the batch, so do them once.
        encoder_sequence_length_t = compiler.metric_tensors[
            'encoder_sequence_length']
        decoder_sequence_length_t = compiler.metric_tensors[
            'decoder_sequence_length']
        fetches = [
            encoder_sequence_length_t, decoder_sequence_length_t,
            summary_op, sv.global_step, total_loss_op, train_op
        ]

        for i, batch in enumerate(batcher):
            if sv.should_stop():
                break

            le, ld, summary, global_step, total_loss, _ = sess.run(
                fetches,
                feed_dict={compiler.loom_input_tensor: batch})

            # Sanity check: encoder and decoder must see equal lengths.
            assert all(le == ld), \
                'the encoder is folding over a different length sequence to the decoder'

            if i % steps_per_summary == 0:
                sv.summary_computed(sess, summary, global_step)

            # Stop if loss does not improve after some steps
            if total_loss < best_loss_so_far:
                best_loss_so_far = total_loss
                num_steps_until_best = 0
            else:
                num_steps_until_best += 1
                if num_steps_until_best == NUM_STEPS_TO_STOP_IF_NO_IMPROVEMENT:
                    exit()