def tower_ensemble_graph(eval_features, total_graphs, total_params): default_params = total_params[0] # define multi-gpu inferring graph def _tower_infer_graph(features): infer_fns = [] for midx, (graph, params) in enumerate(zip(total_graphs, total_params)): params = copy.copy(params) params.scope_name = params.scope_name + "_ensembler_%d" % midx infer_fns.append(graph.infer_fn(params)) total_encoding_fns, total_decoding_fns = list(zip(*infer_fns)) def _encoding_fn(source): model_state = {} for _midx in range(len(total_encoding_fns)): current_model_state = total_encoding_fns[_midx](source) model_state['ensembler_%d' % _midx] = current_model_state return model_state def _decoding_fn(target, model_state, time): pred_logits = [] for _midx in range(len(total_decoding_fns)): state_describ = "ensembler_%d" % _midx if default_params.search_mode == "cache": current_output = total_decoding_fns[_midx]( target, model_state[state_describ], time) else: current_output = total_decoding_fns[_midx](target, model_state, time) step_logits, step_state = current_output pred_logits.append(step_logits) if default_params.search_mode == "cache": model_state[state_describ] = step_state model_logits = tf.add_n( [tf.nn.softmax(logits) for logits in pred_logits]) / len(pred_logits) return tf.log(model_logits), model_state beam_output = beam_search(features, _encoding_fn, _decoding_fn, default_params) return beam_output # feed model to multiple gpus eval_outputs = parallel.parallel_model(_tower_infer_graph, eval_features, default_params.gpus, use_cpu=(len( default_params.gpus) == 0)) eval_seqs, eval_scores = eval_outputs['seq'], eval_outputs['score'] return eval_seqs, eval_scores
def tower_train_graph(train_features, optimizer, graph, params):
    """Build a multi-GPU training graph with loss scaling.

    Args:
        train_features: input features fed to each tower.
        optimizer: a tf.train.Optimizer used for compute_gradients.
        graph: model object exposing ``train_fn(features, params, initializer)``.
        params: hyper-parameter object (loss_scale, gpus, initializer, ...).

    Returns:
        (loss, gradients): averaged scalar loss and averaged (grad, var) pairs.
    """

    # define multi-gpu training graph
    def _tower_train_graph(features):
        train_output = graph.train_fn(
            features, params,
            initializer=initializer.get_initializer(
                params.initializer, params.initializer_gain))

        # Hoist the cast: the same scale is used to scale the loss up and
        # the gradients back down.
        loss_scale = tf.cast(params.loss_scale, tf.float32)

        tower_gradients = optimizer.compute_gradients(
            train_output["loss"] * loss_scale,
            colocate_gradients_with_ops=True)
        # compute_gradients yields (None, v) for variables not connected to
        # the loss; dividing None would fail at graph-build time, so pass
        # those entries through unchanged.
        tower_gradients = [
            (g / loss_scale if g is not None else None, v)
            for g, v in tower_gradients]

        return {"loss": train_output["loss"], "gradient": tower_gradients}

    # feed model to multiple gpus
    tower_outputs = parallel.parallel_model(
        _tower_train_graph, train_features, params.gpus,
        use_cpu=(len(params.gpus) == 0))
    loss = tf.add_n(tower_outputs['loss']) / len(tower_outputs['loss'])
    gradients = parallel.average_gradients(tower_outputs['gradient'])

    return loss, gradients
def tower_infer_graph(eval_features, graph, params):
    """Run the model's inference function on every configured device."""

    def _tower_infer_graph(features):
        # Each tower simply delegates to the model-provided inference fn.
        return graph.infer_fn(params, features)

    # Shard the features over the available GPUs; fall back to CPU when
    # no GPU is configured.
    return parallel.parallel_model(
        _tower_infer_graph, eval_features, params.gpus,
        use_cpu=(len(params.gpus) == 0))
def tower_score_graph(eval_features, graph, params):
    """Score evaluation features on multiple GPUs and return the scores."""

    def _tower_score_graph(features):
        # Delegate scoring to the model object.
        return graph.score_fn(features, params)

    # Distribute scoring across devices (CPU when no GPU is configured)
    # and keep only the 'score' output.
    outputs = parallel.parallel_model(
        _tower_score_graph, eval_features, params.gpus,
        use_cpu=(len(params.gpus) == 0))
    return outputs['score']
def tower_infer_graph(eval_features, graph, params):
    """Multi-GPU beam-search inference; returns (sequences, scores)."""

    def _tower_infer_graph(features):
        # The model supplies an (encoder, decoder) pair that beam search
        # drives step by step.
        enc_fn, dec_fn = graph.infer_fn(params)
        return beam_search(features, enc_fn, dec_fn, params)

    # Distribute inference across devices; CPU fallback when no GPU is set.
    outputs = parallel.parallel_model(
        _tower_infer_graph, eval_features, params.gpus,
        use_cpu=(len(params.gpus) == 0))
    return outputs['seq'], outputs['score']
def tower_train_graph(train_features, optimizer, params):
    """Build a multi-GPU training graph (masked-fusion variant).

    Returns ``(loss, gradients)`` where per-tower losses/gradients are fused
    with a validity mask from ``parallel.parallel_model``.

    NOTE(review): ``graph`` is a free variable here, not a parameter —
    presumably bound at module level elsewhere in this file; confirm before
    reuse.
    """

    # define multi-gpu training graph
    def _tower_train_graph(features):
        train_output = graph.train_fn(
            features, params,
            # Fixed uniform init in [-0.08, 0.08] (classic seq2seq setting).
            initializer=tf.random_uniform_initializer(-0.08, 0.08))

        tower_gradients = optimizer.compute_gradients(
            train_output["loss"], colocate_gradients_with_ops=True)

        return {"loss": train_output["loss"], "gradient": tower_gradients}

    # feed model to multiple gpus; unlike the other tower builders this
    # variant also receives a per-tower mask from parallel_model.
    tower_outputs, tower_mask = parallel.parallel_model(
        _tower_train_graph, train_features, params.gpus,
        use_cpu=(len(params.gpus) == 0))
    # Combine tower losses/gradients, ignoring masked-out towers.
    loss = parallel.fusion_with_mask(tower_outputs['loss'], tower_mask)
    gradients = parallel.average_gradients(tower_outputs['gradient'],
                                           mask=tower_mask)

    return loss, gradients
def main(args):
    """Train a Transformer model with TF1 MonitoredTrainingSession.

    Merges default/saved/command-line parameters, builds the multi-GPU
    training graph, then runs the training loop with logging, checkpoint,
    and (optionally) BLEU-style evaluation hooks.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = transformer.Transformer
    args.model = model_cls.get_name()
    params = default_parameters()

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    # Export all parameters and model specific parameters
    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    #tf.set_random_seed(params.seed)

    # Build Graph
    with tf.Graph().as_default():
        # Build input queue
        features = dataset.get_training_input(params.input, params)
        # features, init_op = cache.cache_features(features, params.update_cycle)

        # Add pre_trained_embedding:
        if params.use_pretrained_embedding:
            _, src_embs = dataset.get_pre_embeddings(params.embeddings[0])
            _, trg_embs = dataset.get_pre_embeddings(params.embeddings[1])
            features['src_embs'] = src_embs
            features['trg_embs'] = trg_embs
            print('Loaded Embeddings!', src_embs.shape, trg_embs.shape)

        # Build model
        initializer = get_initializer(params)
        model = model_cls(params, args.model)

        # Multi-GPU setting: one loss shard per device, averaged below.
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer), features,
            params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step; the assign-to-zero op is only run later when
        # params.renew_lr is set (restart LR schedule after a restore).
        global_step = tf.train.get_or_create_global_step()
        initial_global_step = global_step.assign(0)

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0
        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        # Optional floor on the decayed learning rate.
        if params.learning_rate_minimum:
            lr_min = float(params.learning_rate_minimum)
            learning_rate = tf.maximum(learning_rate, tf.to_float(lr_min))
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        # Create optimizer
        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(loss, opt, global_step, params)
        restore_op = restore_variables(args.output)

        # Validation: requires both a validation source and at least one
        # reference file.
        if params.validation and params.references[0]:
            files = [params.validation] + list(params.references)
            eval_inputs = dataset.sort_and_zip_files(files)
            eval_input_fn = dataset.get_evaluation_input
        else:
            eval_input_fn = None

        # Add hooks
        save_vars = tf.trainable_variables() + [global_step]
        saver = tf.train.Saver(
            var_list=save_vars if params.only_save_trainable else None,
            max_to_keep=params.keep_checkpoint_max,
            sharded=False)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            #tf.train.StopAtStepHook(num_steps=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook({
                "step": global_step,
                "loss": loss,
            }, every_n_iter=params.print_steps),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=saver)
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: beamsearch.create_inference_graph(
                        [model.get_inference_func()], f, params),
                    lambda: eval_input_fn(eval_inputs, params),
                    lambda x: decode_target_ids(x, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_steps_begin=params.eval_steps_begin,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        def restore_fn(step_context):
            step_context.session.run(restore_op)

        def step_fn(step_context):
            # Bypass hook calls
            return step_context.run_with_hooks(ops)

        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=params.output,
                hooks=train_hooks,
                save_checkpoint_secs=None,
                config=config) as sess:
            #sess.run(features['source'].eval())
            #sess.run(features['target'].eval())

            # Restore pre-trained variables
            sess.run_step_fn(restore_fn)
            # Reset the global step AFTER restoring, so a restored step
            # counter can be discarded to restart the LR schedule.
            if params.renew_lr == True:
                sess.run(initial_global_step)
            while not sess.should_stop():
                sess.run_step_fn(step_fn)
def main(args):
    """Train a model with gradient accumulation over ``update_cycle`` batches.

    Each training step runs: ``init_op`` + ``zero_op`` (cache a new batch,
    zero accumulators), ``collect_op`` ``update_cycle - 1`` times, then one
    ``train_op`` that applies the accumulated gradients.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    model_cls = models.get_model(args.model)
    params = default_parameters()

    # Parameter priority (low -> high): default -> saved -> command line.
    params = merge_parameters(params, model_cls.get_parameters())
    params = import_params(args.output, args.model, params)
    override_parameters(params, args)

    export_params(params.output, "params.json", params)
    export_params(params.output, "%s.json" % args.model,
                  collect_params(params, model_cls.get_parameters()))

    with tf.Graph().as_default():
        features = dataset.get_training_input(params.input, params)
        update_cycle = params.update_cycle
        # Cache features so the same batch is re-fed across the accumulation
        # sub-steps; init_op refreshes the cache.
        features, init_op = cache.cache_features(features, update_cycle)
        initializer = get_initializer(params)
        regularizer = tf.contrib.layers.l1_l2_regularizer(
            scale_l1=params.scale_l1, scale_l2=params.scale_l2)
        model = model_cls(params)
        global_step = tf.train.get_or_create_global_step()

        # One loss shard per device; add the L1/L2 regularization term on top
        # of the averaged loss.
        sharded_losses = parallel.parallel_model(
            model.get_training_func(initializer, regularizer), features,
            params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)
        loss = loss + tf.losses.get_regularization_loss()

        # Log all trainable variables and the total parameter count.
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0
        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)
        tf.summary.scalar("learning_rate", learning_rate)

        if params.optimizer == "Adam":
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == "LazyAdam":
            opt = tf.contrib.opt.LazyAdamOptimizer(learning_rate,
                                                   beta1=params.adam_beta1,
                                                   beta2=params.adam_beta2,
                                                   epsilon=params.adam_epsilon)
        elif params.optimizer == "SGD":
            opt = tf.train.GradientDescentOptimizer(learning_rate)
        else:
            raise RuntimeError("Optimizer %s not supported" % params.optimizer)

        loss, ops = optimize.create_train_op(loss, opt, global_step, params)
        restore_op = restore_variables(args.checkpoint)

        if params.validation:
            eval_sorted_keys, eval_inputs = dataset.read_eval_input_file(
                params.validation)
            eval_input_fn = dataset.get_predict_input
        else:
            eval_input_fn = None

        save_vars = tf.trainable_variables() + [global_step]
        saver = tf.train.Saver(
            var_list=save_vars if params.only_save_trainable else None,
            max_to_keep=params.keep_checkpoint_max,
            sharded=False)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)

        # Logged shapes are multiplied so they reflect the effective batch
        # across the whole accumulation cycle, not a single sub-step.
        multiplier = tf.convert_to_tensor([update_cycle, 1])

        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook(
                {
                    "step": global_step,
                    "loss": loss,
                    "text": tf.shape(features["text"]) * multiplier,
                    "aspect": tf.shape(features["aspect"]) * multiplier,
                    "polarity": tf.shape(features["polarity"]) * multiplier
                },
                every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=saver)
        ]

        config = session_config(params)

        if eval_input_fn is not None:
            train_hooks.append(
                hooks.EvaluationHook(
                    lambda f: inference.create_predict_graph([model], f, params
                                                             ),
                    lambda: eval_input_fn(eval_inputs, params),
                    params.output,
                    config,
                    params.keep_top_checkpoint_max,
                    eval_secs=params.eval_secs,
                    eval_steps=params.eval_steps))

        def restore_fn(step_context):
            step_context.session.run(restore_op)

        def step_fn(step_context):
            # One optimizer step = cache batch + zero accumulators, then
            # (update_cycle - 1) gradient-collect passes, then apply.
            step_context.session.run([init_op, ops["zero_op"]])
            for i in range(update_cycle - 1):
                step_context.session.run(ops["collect_op"])
            return step_context.run_with_hooks(ops["train_op"])

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=params.output,
                hooks=train_hooks,
                save_checkpoint_secs=None,
                config=config) as sess:
            # Restore pre-trained variables before the first step.
            sess.run_step_fn(restore_fn)
            while not sess.should_stop():
                sess.run_step_fn(step_fn)
def main(args):
    """Train a PixelLink text-detection network.

    Builds the multi-GPU training graph with L2 weight decay, optimizes via
    tf.contrib.layers.optimize_loss, and runs the loop with logging,
    checkpoint, image/loss summary, and evaluation hooks.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    params = default_parameters()
    override_parameters(params, args)
    export_params(params.output, "params.json", params)

    # Build Graph
    with tf.Graph().as_default():
        dataset.start_queue(params)
        features = dataset.get_train_input(params)
        print(features)

        # Build model
        initializer = get_initializer(params)
        # model = LineBased.Model(params)
        model = pixellink.PixelLinkNetwork(params)

        # Multi-GPU setting: besides the per-device losses, the first tower
        # also yields (image summary, loss summary) ops used by the
        # SummarySaverHooks below.
        sharded_losses, ((sum_img, sum_loss), *_) = parallel.parallel_model(
            model.get_training_func(initializer), features,
            params.device_list)
        loss = tf.add_n(sharded_losses) / len(sharded_losses)

        # Create global step
        global_step = tf.train.get_or_create_global_step()

        # Print parameters
        all_weights = {v.name: v for v in tf.trainable_variables()}
        total_size = 0
        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            tf.logging.info("%s\tshape %s", v.name[:-2].ljust(80),
                            str(v.shape).ljust(20))
            v_size = np.prod(np.array(v.shape.as_list())).tolist()
            total_size += v_size
        tf.logging.info("Total trainable variables size: %d", total_size)

        learning_rate = get_learning_rate_decay(params.learning_rate,
                                                global_step, params)
        learning_rate = tf.convert_to_tensor(learning_rate, dtype=tf.float32)

        # weight decay: L2 penalty over all trainable variables.
        weights = tf.trainable_variables()
        with tf.variable_scope('weights_norm') as scope:
            weights_norm = tf.reduce_sum(
                input_tensor=params.weight_decay * tf.stack(
                    [tf.nn.l2_loss(v) for v in weights]),
                name='weights_norm')
        loss = loss + weights_norm
        tf.summary.scalar('total_loss', loss)

        print('create opt')
        if params.optimizer == 'adam':
            # Create optimizer
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=params.adam_beta1,
                                         beta2=params.adam_beta2,
                                         epsilon=params.adam_epsilon)
        elif params.optimizer == 'sgd_momentum':
            opt = tf.train.MomentumOptimizer(learning_rate,
                                             momentum=params.momentum)
        else:
            raise NotImplementedError()

        train_op = tf.contrib.layers.optimize_loss(
            name="training",
            loss=loss,
            global_step=global_step,
            learning_rate=learning_rate,
            clip_gradients=params.clip_grad_norm or None,
            optimizer=opt,
            colocate_gradients_with_ops=True)

        print('create hooks')
        # Add hooks
        train_hooks = [
            tf.train.StopAtStepHook(last_step=params.train_steps),
            tf.train.NanTensorHook(loss),
            tf.train.LoggingTensorHook({
                "step": global_step,
                "loss": loss,
            }, every_n_iter=1),
            tf.train.CheckpointSaverHook(
                checkpoint_dir=params.output,
                save_secs=params.save_checkpoint_secs or None,
                save_steps=params.save_checkpoint_steps or None,
                saver=tf.train.Saver(
                    max_to_keep=params.keep_checkpoint_max,
                    sharded=False)),
            # Image summaries are heavier, so they are saved less often than
            # the per-step loss summaries.
            tf.train.SummarySaverHook(save_steps=20,
                                      save_secs=None,
                                      output_dir=os.path.join(
                                          params.output, "sumimg"),
                                      summary_op=sum_img),
            tf.train.SummarySaverHook(save_steps=1,
                                      save_secs=None,
                                      output_dir=os.path.join(
                                          params.output, "sumloss"),
                                      summary_op=sum_loss)
        ]

        config = session_config(params)
        train_hooks.append(
            hooks.EvaluationHook(model.get_evaluation_func(),
                                 dataset.get_eval_input,
                                 params.output,
                                 config,
                                 params.keep_top_checkpoint_max,
                                 eval_secs=params.eval_secs,
                                 eval_steps=params.eval_steps))

        print('create session')
        # Create session, do not use default CheckpointSaverHook
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=params.output,
                hooks=train_hooks,
                save_checkpoint_secs=None,
                config=config) as sess:
            # coord = tf.train.Coordinator()
            # threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            while not sess.should_stop():
                sess.run(train_op)