def generate_data_for_registered_problem(problem_name):
  """Generate data for a registered problem."""
  tf.logging.info("Generating data for %s.", problem_name)
  if FLAGS.num_shards:
    raise ValueError("--num_shards should not be set for registered Problem.")
  problem = registry.problem(problem_name)
  task_id = None if FLAGS.task_id < 0 else FLAGS.task_id
  data_dir = os.path.expanduser(FLAGS.data_dir)
  tmp_dir = os.path.expanduser(FLAGS.tmp_dir)
  if task_id is None and problem.multiprocess_generate:
    if FLAGS.task_id_start != -1:
      assert FLAGS.task_id_end != -1
      task_id_start = FLAGS.task_id_start
      task_id_end = FLAGS.task_id_end
    else:
      task_id_start = 0
      task_id_end = problem.num_generate_tasks
    # BEGIN GOOGLE-INTERNAL
    # For some reason, the process hangs if the data is on cns and pool is
    # created after prepare_to_generate. Dunno why, but this order seems to
    # work.
    # END GOOGLE-INTERNAL
    pool = multiprocessing.Pool(processes=FLAGS.num_concurrent_processes)
    problem.prepare_to_generate(data_dir, tmp_dir)
    args = [(problem_name, data_dir, tmp_dir, task_id)
            for task_id in range(task_id_start, task_id_end)]
    pool.map(generate_data_in_process, args)
  else:
    problem.generate_data(data_dir, tmp_dir, task_id)
def add_problem_hparams(hparams, problem_name_or_instance):
  """Add problem hparams for the problems."""
  if isinstance(problem_name_or_instance, Problem):
    problem = problem_name_or_instance
  else:
    problem = registry.problem(problem_name_or_instance)
  p_hparams = problem.get_hparams(hparams)
  hparams.problem = problem
  hparams.problem_hparams = p_hparams
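# Illustrative usage sketch (not part of the original module): the hparams-set
# name "transformer_base" and the problem name "translate_ende_wmt32k" below are
# examples only; any registered names behave the same way. add_problem_hparams
# accepts either a registered problem name or a Problem instance.
#
#   hparams = registry.hparams("transformer_base")
#   add_problem_hparams(hparams, "translate_ende_wmt32k")
#   # hparams.problem and hparams.problem_hparams are now populated and can be
#   # used to build estimator input functions.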
def generate_data_in_process(arg):
  problem_name, data_dir, tmp_dir, task_id = arg
  problem = registry.problem(problem_name)
  problem.generate_data(data_dir, tmp_dir, task_id)
def create_experiment(run_config,
                      hparams,
                      model_name,
                      problem_name,
                      data_dir,
                      train_steps,
                      eval_steps,
                      min_eval_frequency=2000,
                      schedule="train_and_evaluate",
                      decode_hparams=None,
                      eval_timeout_mins=240,
                      use_tpu=False,
                      train_with_low_level_api=False,
                      decode_with_low_level_api=False,
                      train_and_decode_with_low_level_api=False,
                      tpu_num_hosts=1,
                      iterations_per_loop=1000,
                      decode_from_file=None,
                      decode_to_file=None,
                      decode_reference=None):
  """Create Experiment."""
  # HParams
  hparams.add_hparam("model_dir", run_config.model_dir)
  hparams.add_hparam("data_dir", data_dir)
  hparams.add_hparam("train_steps", train_steps)
  hparams.add_hparam("eval_steps", eval_steps)
  hparams.add_hparam("schedule", schedule)
  hparams.add_hparam("eval_freq_in_steps", min_eval_frequency)
  hparams.add_hparam("eval_timeout_mins", eval_timeout_mins)
  hparams.add_hparam("train_with_low_level_api", train_with_low_level_api)
  hparams.add_hparam("decode_with_low_level_api", decode_with_low_level_api)
  hparams.add_hparam("train_and_decode_with_low_level_api",
                     train_and_decode_with_low_level_api)
  if decode_hparams is not None:
    decode_hparams.add_hparam("decode_from_file", decode_from_file)
    decode_hparams.add_hparam("decode_to_file", decode_to_file)
    decode_hparams.add_hparam("decode_reference", decode_reference)
  add_problem_hparams(hparams, problem_name)

  # Input fns from Problem
  problem = hparams.problem
  train_input_fn = problem.make_estimator_input_fn(
      tf.estimator.ModeKeys.TRAIN, hparams)
  eval_input_fn = problem.make_estimator_input_fn(
      tf.estimator.ModeKeys.EVAL, hparams)

  if train_with_low_level_api:
    params = {}
    params["batch_size"] = problem.tpu_batch_size_per_shard(hparams)
    params["tpu_num_hosts"] = tpu_num_hosts
    mlp_log.mlperf_print(
        key="global_batch_size",
        value=params["batch_size"] * run_config.tpu_config.num_shards)
    trunner = train_low_level_runner.TrainLowLevelRunner(
        iterations=iterations_per_loop)
    model_fn = t2t_model.T2TModel.make_estimator_model_fn(
        model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu)
    trunner.initialize(train_input_fn, model_fn, params, hparams, run_config)

  if decode_with_low_level_api:
    if decode_hparams.batch_size:
      hparams.batch_size = decode_hparams.batch_size
      hparams.use_fixed_batch_size = True

    dataset_kwargs = {
        "shard": decode_hparams.shard_id if decode_hparams.shards > 1 else None,
        "dataset_split": tf.estimator.ModeKeys.EVAL,
        "max_records": decode_hparams.num_samples
    }
    infer_input_fn = problem.make_estimator_input_fn(
        tf.estimator.ModeKeys.PREDICT, hparams, dataset_kwargs=dataset_kwargs)
    params = {}
    # Currently, the decoding part runs on a donut; will change this for
    # distributed eval.
params["batch_size"] = int(decode_hparams.batch_size * tpu_num_hosts / run_config.tpu_config.num_shards) erunner = eval_low_level_runner.EvalLowLevelRunner(eval_steps=int( math.ceil(decode_hparams.num_samples / decode_hparams.batch_size))) model_fn = t2t_model.T2TModel.make_estimator_model_fn( model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) erunner.initialize(infer_input_fn, params, run_config) erunner.build_model(model_fn, params, run_config) if train_and_decode_with_low_level_api: mlp_log.mlperf_print(key="max_sequence_length", value=hparams.max_length) fake_train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, fake_data=True) params = {} params["batch_size"] = problem.tpu_batch_size_per_shard(hparams) params["tpu_num_hosts"] = tpu_num_hosts mlp_log.mlperf_print(key="global_batch_size", value=params["batch_size"] * run_config.tpu_config.num_shards) runner = low_level_runner.LowLevelRunner( iterations=iterations_per_loop, eval_steps=int( math.ceil(decode_hparams.num_samples / decode_hparams.batch_size))) model_fn = t2t_model.T2TModel.make_estimator_model_fn( model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) # Changed the problem to unpacked one for decoding. if "_packed" in hparams.problem.name: problem = registry.problem( hparams.problem.name.replace("_packed", "")) p_hparams = problem.get_hparams(hparams) hparams.problem = problem hparams.problem_hparams = p_hparams # Hard-coded based on the current wmt14 en-de eval dataset. hparams.max_length = 97 if decode_hparams.batch_size: hparams.batch_size = decode_hparams.batch_size hparams.use_fixed_batch_size = True dataset_kwargs = { "shard": decode_hparams.shard_id if decode_hparams.shards > 1 else None, "dataset_split": tf.estimator.ModeKeys.EVAL, "max_records": decode_hparams.num_samples } fake_infer_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.PREDICT, hparams, fake_data=True, dataset_kwargs=dataset_kwargs) infer_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.PREDICT, hparams, dataset_kwargs=dataset_kwargs) infer_model_fn = t2t_model.T2TModel.make_estimator_model_fn( model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu) runner.initialize(fake_train_input_fn, fake_infer_input_fn, train_input_fn, infer_input_fn, model_fn, infer_model_fn, params, hparams, run_config) # Estimator estimator = create_estimator(model_name, hparams, run_config, schedule=schedule, decode_hparams=decode_hparams, use_tpu=use_tpu) # Eval on TPU Pods is not supported yet if use_tpu and run_config.tpu_config.num_shards > 8 and "eval" in schedule: raise ValueError("Eval is not currently supported on a TPU Pod") train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_steps) eval_spec = tf.estimator.EvalSpec( eval_input_fn, steps=eval_steps, start_delay_secs=0 if hparams.schedule == "evaluate" else 120, exporters=None) return T2TExperiment( estimator, hparams, train_spec, eval_spec, decode_hparams, trunner if train_with_low_level_api else None, erunner if decode_with_low_level_api else None, runner if train_and_decode_with_low_level_api else None)
def train_and_decode(self):
  """Does decode after training every eval_freq_in_steps."""
  eval_steps = self._hparams.eval_freq_in_steps
  if self._hparams.train_and_decode_with_low_level_api:
    self._runner.train_and_eval(self._train_spec.max_steps,
                                self._hparams.batch_size)
    for i in range(0, self._train_spec.max_steps, eval_steps):
      if self._hparams.mlperf_mode:
        self._decode_hparams.mlperf_decode_step = i + eval_steps
      output_dir = os.path.join(self._estimator.model_dir, "decode")
      tf.gfile.MakeDirs(output_dir)
      output_dirs = [output_dir]
      result = list(self._runner.dequeue(self._decode_hparams))

      mlp_log.mlperf_print(
          "eval_start", None, metadata={"epoch_num": (i // eval_steps + 1)})
      predictions = []
      inputs_vocab = self._hparams.problem_hparams.vocabulary["inputs"]
      targets_vocab = self._hparams.problem_hparams.vocabulary["targets"]
      for prediction in result:
        inputs = prediction.get("inputs")
        targets = prediction.get("targets")
        outputs = prediction.get("outputs")
        # Skip examples whose decoded inputs are padding only.
        if not re.match(
            "^({})+$".format(text_encoder.PAD),
            inputs_vocab.decode(decoding.save_until_eos(inputs))):
          predictions.append(
              (targets_vocab.decode(decoding.save_until_eos(outputs)),
               targets_vocab.decode(decoding.save_until_eos(targets))))
      decoding.run_postdecode_hooks(
          decoding.DecodeHookArgs(
              estimator=self._estimator,
              problem=self._hparams.problem,
              output_dirs=output_dirs,
              hparams=self._hparams,
              decode_hparams=self._decode_hparams,
              predictions=predictions), tf.estimator.ModeKeys.EVAL)
      mlp_log.mlperf_print(
          "block_stop",
          None,
          metadata={
              "first_epoch_num": (i // eval_steps + 1),
              "epoch_count": 1
          })
      if self._hparams.mlperf_mode and self._decode_hparams.mlperf_success:
        break
    self._runner.shutdown()
  else:
    mlp_log.mlperf_print(key="init_stop", value=None)
    mlp_log.mlperf_print(key="run_start", value=None)
    packed_dataset = "_packed" in self._hparams.problem.name
    for i in range(0, self._train_spec.max_steps, eval_steps):
      mlp_log.mlperf_print(
          "block_start",
          None,
          metadata={
              "first_epoch_num": (i // eval_steps + 1),
              "epoch_count": 1
          })
      if packed_dataset and i > 0:
        problem = registry.problem(self._hparams.problem.name + "_packed")
        p_hparams = problem.get_hparams(self._hparams)
        self._hparams.problem = problem
        self._hparams.problem_hparams = p_hparams
      self._estimator.train(
          self._train_spec.input_fn,
          steps=eval_steps,
          hooks=self._train_spec.hooks)
      if packed_dataset:
        problem = registry.problem(
            self._hparams.problem.name.replace("_packed", ""))
        p_hparams = problem.get_hparams(self._hparams)
        self._hparams.problem = problem
        self._hparams.problem_hparams = p_hparams
      if self._hparams.mlperf_mode:
        self._decode_hparams.mlperf_decode_step = i + eval_steps
      predictions = self.decode(dataset_split=tf.estimator.ModeKeys.EVAL)
      mlp_log.mlperf_print(
          "block_stop",
          None,
          metadata={
              "first_epoch_num": (i // eval_steps + 1),
              "epoch_count": 1
          })
      if self._hparams.mlperf_mode and self._decode_hparams.mlperf_success:
        break
    if self._hparams.mlperf_mode and not self._decode_hparams.mlperf_success:
      mlp_log.mlperf_print("run_stop", None, metadata={"status": "abort"})

  return predictions, self._train_spec.max_steps
def score_file(filename):
  """Score each line in a file and return the scores."""
  # Prepare model.
  hparams = create_hparams()
  encoders = registry.problem(FLAGS.problem).feature_encoders(FLAGS.data_dir)
  has_inputs = "inputs" in encoders

  # Prepare features for feeding into the model.
  if has_inputs:
    inputs_ph = tf.placeholder(dtype=tf.int32)  # Just length dimension.
    batch_inputs = tf.reshape(inputs_ph, [1, -1, 1, 1])  # Make it 4D.
  targets_ph = tf.placeholder(dtype=tf.int32)  # Just length dimension.
  batch_targets = tf.reshape(targets_ph, [1, -1, 1, 1])  # Make it 4D.
  features = {
      "inputs": batch_inputs,
      "targets": batch_targets,
  } if has_inputs else {
      "targets": batch_targets
  }

  # Prepare the model and the graph when model runs on features.
  model = registry.model(FLAGS.model)(hparams, tf.estimator.ModeKeys.EVAL)
  _, losses = model(features)
  saver = tf.train.Saver()

  with tf.Session() as sess:
    # Load weights from checkpoint.
    ckpts = tf.train.get_checkpoint_state(FLAGS.output_dir)
    ckpt = ckpts.model_checkpoint_path
    saver.restore(sess, ckpt)
    # Run on each line.
    with tf.gfile.Open(filename) as f:
      lines = f.readlines()
    results = []
    for line in lines:
      tab_split = line.split("\t")
      if len(tab_split) > 2:
        raise ValueError("Each line must have at most one tab separator.")
      if len(tab_split) == 1:
        targets = tab_split[0].strip()
      else:
        targets = tab_split[1].strip()
        inputs = tab_split[0].strip()
      # Run encoders and append EOS symbol.
      targets_numpy = encoders["targets"].encode(targets) + [
          text_encoder.EOS_ID
      ]
      if has_inputs:
        inputs_numpy = encoders["inputs"].encode(inputs) + [text_encoder.EOS_ID]
      # Prepare the feed.
      feed = {
          inputs_ph: inputs_numpy,
          targets_ph: targets_numpy
      } if has_inputs else {
          targets_ph: targets_numpy
      }
      # Get the score.
      np_loss = sess.run(losses["training"], feed)
      results.append(np_loss)

  return results
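# Illustrative usage sketch (not part of the original module): assumes the usual
# FLAGS (problem, model, data_dir, output_dir containing a checkpoint) have
# already been parsed, and that "to_score.txt" is a hypothetical file holding
# one "input<TAB>target" (or target-only) example per line.
#
#   scores = score_file("to_score.txt")
#   for line_number, loss in enumerate(scores):
#     tf.logging.info("line %d: loss %.4f", line_number, loss)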
def problem(name):
  return registry.problem(name)
def setUpClass(cls):
  tf.set_random_seed(1)
  cls.problem = registry.problem("test_problem")
  cls.data_dir = tempfile.gettempdir()
  cls.filepatterns = generate_test_data(cls.problem, cls.data_dir)