Example 1
  def __decode_from_file(self, filename):
    """Compute predictions on entries in filename and write them out."""

    if not self.decode_hp.batch_size:
      self.decode_hp.batch_size = 32
      tf.logging.info("decode_hp.batch_size not specified; default=%d" %
                      self.decode_hp.batch_size)

    problem_id = self.decode_hp.problem_idx
    # The grapheme-to-phoneme problem always has explicit inputs, so the
    # input and target vocabularies can be looked up directly.
    inputs_vocab = self.hparams.problems[problem_id].vocabulary["inputs"]
    targets_vocab = self.hparams.problems[problem_id].vocabulary["targets"]
    problem_name = "grapheme_to_phoneme_problem"
    tf.logging.info("Performing decoding from a file.")
    inputs = _get_inputs(filename)
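    # Ceiling division: the last batch may be only partially filled.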
    num_decode_batches = (len(inputs) - 1) // self.decode_hp.batch_size + 1

    def input_fn():
      """Function for inputs generator."""
      input_gen = _decode_batch_input_fn(
          num_decode_batches, inputs, inputs_vocab,
          self.decode_hp.batch_size, self.decode_hp.max_input_size)
      gen_fn = decoding.make_input_fn_from_generator(input_gen)
      example = gen_fn()
      return decoding._decode_input_tensor_to_features_dict(example,
                                                            self.hparams)

    decodes = []
    result_iter = self.estimator.predict(input_fn)
    for result in result_iter:
      if self.decode_hp.return_beams:
        beam_decodes = []
        output_beams = np.split(result["outputs"], self.decode_hp.beam_size,
                                axis=0)
        for k, beam in enumerate(output_beams):
          tf.logging.info("BEAM %d:" % k)
          _, decoded_outputs, _ = decoding.log_decode_results(
              result["inputs"],
              beam,
              problem_name,
              None,
              inputs_vocab,
              targets_vocab)
          beam_decodes.append(decoded_outputs)
        decodes.append(beam_decodes)
      else:
        _, decoded_outputs, _ = decoding.log_decode_results(
            result["inputs"],
            result["outputs"],
            problem_name,
            None,
            inputs_vocab,
            targets_vocab)
        decodes.append(decoded_outputs)

    return [inputs, decodes]
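
Note: the helper _get_inputs is not shown in this snippet. A minimal sketch of
what it plausibly does (a hypothetical reconstruction, not the project's actual
helper) would be to read delimiter-separated entries from the file:

def _get_inputs(filename, delimiter="\n"):
  # Hypothetical helper: read delimiter-separated entries from `filename`.
  with tf.gfile.Open(filename) as f:
    text = f.read()
  records = [r.strip() for r in text.split(delimiter)]
  # Drop the trailing empty record produced by a final newline, if present.
  if records and not records[-1]:
    records.pop()
  return records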
Example 2
def decode_interactively(estimator,
                         input_generator,
                         problem_name,
                         hparams,
                         decode_hp,
                         checkpoint_path=None):
    """Compute predictions on entries in filename and write them out."""
    decode_hp.batch_size = 1
    tf.logging.info("decode_hp.batch_size not specified; default=%d" %
                    decode_hp.batch_size)

    # Inputs vocabulary is set to targets if there are no inputs in the problem,
    # e.g., for language models where the inputs are just a prefix of targets.
    p_hp = hparams.problem_hparams
    has_input = "inputs" in p_hp.vocabulary
    inputs_vocab_key = "inputs" if has_input else "targets"
    inputs_vocab = p_hp.vocabulary[inputs_vocab_key]
    targets_vocab = p_hp.vocabulary["targets"]

    length = getattr(hparams, "length", 0) or hparams.max_length

    def input_fn_gen():
        for line in input_generator:
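            # Encode the line, appending EOS (id 1) for problems with inputs,
            # then pad with 0s or truncate to the fixed decode length.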
            if has_input:
                ids = inputs_vocab.encode(line.strip()) + [1]
            else:
                ids = targets_vocab.encode(line)
            if len(ids) < length:
                ids.extend([0] * (length - len(ids)))
            else:
                ids = ids[:length]
            np_ids = np.array(ids, dtype=np.int32)
            yield dict(inputs=np_ids.reshape((length, 1, 1)))

    def input_fn(params):
        return tf.data.Dataset.from_generator(
            input_fn_gen,
            output_types=dict(inputs=tf.int32),
            output_shapes=dict(inputs=(length, 1, 1))).batch(1)

    result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path)

    for result in result_iter:
        _, decoded_outputs, _ = decoding.log_decode_results(
            result["inputs"],
            result["outputs"],
            problem_name,
            None,
            inputs_vocab,
            targets_vocab,
            log_results=False,
            skip_eos_postprocess=decode_hp.skip_eos_postprocess)
        yield decoded_outputs
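
A possible way to drive this generator interactively, assuming estimator,
problem_name, hparams, and decode_hp are built elsewhere as in the other
examples (the stdin loop is illustrative, not part of the original code):

import sys

# sys.stdin iterates over lines, so it can serve directly as input_generator.
for decoded in decode_interactively(estimator, sys.stdin, problem_name,
                                    hparams, decode_hp):
    print(decoded)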
Example 3
def decode_from_text_file(estimator,
                          problem_name,
                          filename,
                          hparams,
                          decode_hp,
                          decode_to_file=None,
                          checkpoint_path=None):
    """Compute predictions on entries in filename and write them out."""
    if not decode_hp.batch_size:
        decode_hp.batch_size = 32
        tf.logging.info("decode_hp.batch_size not specified; default=%d" %
                        decode_hp.batch_size)

    # Inputs vocabulary is set to targets if there are no inputs in the problem,
    # e.g., for language models where the inputs are just a prefix of targets.
    p_hp = hparams.problem_hparams
    has_input = "inputs" in p_hp.vocabulary
    inputs_vocab_key = "inputs" if has_input else "targets"
    inputs_vocab = p_hp.vocabulary[inputs_vocab_key]
    targets_vocab = p_hp.vocabulary["targets"]
    filename = decoding._add_shard_to_filename(filename, decode_hp)
    tf.logging.info("Performing decoding from file (%s)." % filename)
    if has_input:
        sorted_inputs, sorted_keys = decoding._get_sorted_inputs(
            filename, decode_hp.delimiter)
    else:
        sorted_inputs = decoding._get_language_modeling_inputs(
            filename, decode_hp.delimiter, repeat=decode_hp.num_decodes)
        sorted_keys = range(len(sorted_inputs))

    # If decode_to_file was provided use it as the output filename without change
    # (except for adding shard_id if using more shards for decoding).
    # Otherwise, use the input filename plus model, hp, problem, beam, alpha.
    decode_filename = decode_to_file if decode_to_file else filename
    if not decode_to_file:
        decode_filename = decoding._decode_filename(decode_filename,
                                                    problem_name, decode_hp)
    else:
        decode_filename = decoding._add_shard_to_filename(
            decode_filename, decode_hp)
    tf.logging.info("Writing decodes into %s" % decode_filename)

    # Check for decoding checkpoint.
    decodes = []
    shuffle_file_path = decode_filename + '.shuffle.txt'
    if tf.gfile.Exists(shuffle_file_path):
        with tf.gfile.Open(shuffle_file_path, 'r') as f:
            decodes = [line.strip() for line in f.readlines()]
        tf.logging.info('Read {} sentences from checkpoint.'.format(
            len(decodes)))

    all_sorted_inputs = sorted_inputs
    # We only need to decode these inputs:
    sorted_inputs = sorted_inputs[len(decodes):]

    # We don't need to waste computation on empty lines:
    num_empty_lines = 0
    while sorted_inputs and sorted_inputs[-1] == '':
        num_empty_lines += 1
        sorted_inputs.pop(-1)

    num_sentences = len(sorted_inputs)
    num_decode_batches = (num_sentences - 1) // decode_hp.batch_size + 1

    if estimator.config.use_tpu:
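        # TPU requires static shapes, so all inputs are pre-encoded here and
        # padded or truncated to a fixed length instead of fed via a generator.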
        length = getattr(hparams, "length", 0) or hparams.max_length
        batch_ids = []
        for line in sorted_inputs:
            if has_input:
                ids = inputs_vocab.encode(line.strip()) + [1]
            else:
                ids = targets_vocab.encode(line)
            if len(ids) < length:
                ids.extend([0] * (length - len(ids)))
            else:
                ids = ids[:length]
            batch_ids.append(ids)
        np_ids = np.array(batch_ids, dtype=np.int32)

        def input_fn(params):
            batch_size = params["batch_size"]
            dataset = tf.data.Dataset.from_tensor_slices({"inputs": np_ids})
            dataset = dataset.map(
                lambda ex:
                {"inputs": tf.reshape(ex["inputs"], (length, 1, 1))})
            dataset = dataset.batch(batch_size)
            return dataset
    else:

        def input_fn():
            input_gen = decoding._decode_batch_input_fn(
                num_decode_batches,
                sorted_inputs,
                inputs_vocab,
                decode_hp.batch_size,
                decode_hp.max_input_size,
                task_id=-1,
                has_input=has_input)
            gen_fn = decoding.make_input_fn_from_generator(input_gen)
            example = gen_fn()
            return decoding._decode_input_tensor_to_features_dict(
                example, hparams)

    result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path)

    start_time = time.time()
    total_time_per_step = 0
    total_cnt = 0

    def timer(gen):
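        # Yields (elapsed_seconds, item) for each prediction pulled from gen,
        # so the loop below can accumulate per-result inference time.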
        while True:
            try:
                start_time = time.time()
                item = next(gen)
                elapsed_time = time.time() - start_time
                yield elapsed_time, item
            except StopIteration:
                break

    writing_mode = 'a' if tf.gfile.Exists(shuffle_file_path) else 'w'
    shuffle_file = tf.gfile.Open(shuffle_file_path, writing_mode)
    count = 0
    for elapsed_time, result in timer(result_iter):
        if decode_hp.return_beams:
            beam_decodes = []
            beam_scores = []
            output_beams = np.split(result["outputs"],
                                    decode_hp.beam_size,
                                    axis=0)
            scores = None
            if "scores" in result:
                if np.isscalar(result["scores"]):
                    result["scores"] = result["scores"].reshape(1)
                scores = np.split(result["scores"],
                                  decode_hp.beam_size,
                                  axis=0)
            for k, beam in enumerate(output_beams):
                tf.logging.info("BEAM %d:" % k)
                score = scores[k] if scores else None
                _, decoded_outputs, _ = decoding.log_decode_results(
                    result["inputs"],
                    beam,
                    problem_name,
                    None,
                    inputs_vocab,
                    targets_vocab,
                    log_results=decode_hp.log_results,
                    skip_eos_postprocess=decode_hp.skip_eos_postprocess)
                beam_decodes.append(decoded_outputs)
                if decode_hp.write_beam_scores:
                    beam_scores.append(score)
            if decode_hp.write_beam_scores:
                decodes.append("\t".join([
                    "\t".join([d, "%.2f" % s])
                    for d, s in zip(beam_decodes, beam_scores)
                ]))
            else:
                decodes.append("\t".join(beam_decodes))
        else:
            _, decoded_outputs, _ = decoding.log_decode_results(
                result["inputs"],
                result["outputs"],
                problem_name,
                None,
                inputs_vocab,
                targets_vocab,
                log_results=decode_hp.log_results,
                skip_eos_postprocess=decode_hp.skip_eos_postprocess)
            decodes.append(decoded_outputs)

        # Write decoded text to checkpoint
        new_decode = decodes[-1]
        shuffle_file.write(new_decode + '\n')

        # Flush checkpoint to storage.
        count += 1
        if count % decode_hp.batch_size == 0:
            tf.logging.info('Done {}/{}. Flushing.'.format(
                count, len(sorted_inputs)))
            shuffle_file.flush()
            shuffle_file.close()
            shuffle_file = tf.gfile.Open(shuffle_file_path, 'a')

        total_time_per_step += elapsed_time
        total_cnt += result["outputs"].shape[-1]

    for _ in range(num_empty_lines):
        decodes.append('')
        shuffle_file.write('\n')

    # Write the final output to file.
    outfile = tf.gfile.Open(decode_filename, "w")
    for index in range(len(all_sorted_inputs)):
        outfile.write("%s%s" %
                      (decodes[sorted_keys[index]], decode_hp.delimiter))
    outfile.flush()
    outfile.close()

    # Close and remove checkpoint.
    shuffle_file.flush()
    shuffle_file.close()
    tf.gfile.Remove(shuffle_file_path)

    # Print some decoding stats.
    duration = time.time() - start_time
    if total_cnt:
        tf.logging.info("Elapsed Time: %5.5f" % duration)
        tf.logging.info(
            "Averaged Single Token Generation Time: %5.7f "
            "(time %5.7f count %d)" %
            (total_time_per_step / total_cnt, total_time_per_step, total_cnt))
    if decode_hp.batch_size == 1:
        tf.logging.info("Inference time %.4f seconds "
                        "(Latency = %.4f ms/setences)" %
                        (duration, 1000.0 * duration / num_sentences))
    else:
        tf.logging.info("Inference time %.4f seconds "
                        "(Throughput = %.4f sentences/second)" %
                        (duration, num_sentences / duration))
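
The final write loop relies on sorted_keys to undo the length-based sort done
by decoding._get_sorted_inputs: sorted_keys maps an original line index to its
position in the sorted list. A tiny self-contained illustration of that
bookkeeping (toy values, not the real helper):

inputs = ["a longer sentence", "hi", "a medium one"]
# Sort indices by length, as the real helper does for efficient batching.
order = sorted(range(len(inputs)), key=lambda i: len(inputs[i]))
sorted_keys = {orig: pos for pos, orig in enumerate(order)}
decodes = ["DEC(%s)" % inputs[i] for i in order]  # decodes in sorted order
# Recover the original order, exactly like the write loop above:
restored = [decodes[sorted_keys[i]] for i in range(len(inputs))]
assert restored == ["DEC(%s)" % s for s in inputs]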
Example 4
def decode_from_file_search_based(estimator,
                                  filename,
                                  hparams,
                                  decode_hp,
                                  decode_to_file=None,
                                  checkpoint_path=None):
    """Compute predictions on entries in filename and write them out."""
    if not decode_hp.batch_size:
        decode_hp.batch_size = 32
        tf.logging.info(
                "decode_hp.batch_size not specified; default=%d" % decode_hp.batch_size)

    problem_id = decode_hp.problem_idx
    # Inputs vocabulary is set to targets if there are no inputs in the problem,
    # e.g., for language models where the inputs are just a prefix of targets.
    has_input = "inputs" in hparams.problems[problem_id].vocabulary
    inputs_vocab_key = "inputs" if has_input else "targets"
    inputs_vocab = hparams.problems[problem_id].vocabulary[inputs_vocab_key]
    targets_vocab = hparams.problems[problem_id].vocabulary["targets"]
    problem_name = FLAGS.problems.split("-")[problem_id]
    tf.logging.info("Performing decoding from a file.")
    sorted_inputs, sorted_keys = _get_sorted_inputs(filename, decode_hp.shards,
                                                    decode_hp.delimiter)
    num_decode_batches = (len(sorted_inputs) - 1) // decode_hp.batch_size + 1

    data_dir = '/'.join(filename.split('/')[:-1])
    table_path = os.path.join(data_dir, '../../search_engine/big_table.txt')
    he_search_path = os.path.join(data_dir, 'he.search.txt')
    en_search_path = os.path.join(data_dir, 'en.search.txt')
    searcher = Searcher(table_path, he_search_path)
    translator = Translator(data_dir, he_search_path)

    def input_fn():
        input_gen = _decode_batch_input_fn_search_based(
                problem_id, num_decode_batches, sorted_inputs, inputs_vocab, targets_vocab,
                decode_hp.batch_size, decode_hp.max_input_size, searcher, translator,
                hparams.problems[problem_id])
        gen_fn = make_input_fn_from_generator(input_gen)
        example = gen_fn()
        return _decode_input_tensor_to_features_dict(example, hparams)

    decodes = []
    result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path)
    for result in result_iter:
        if decode_hp.return_beams:
            beam_decodes = []
            beam_scores = []
            output_beams = np.split(result["outputs"], decode_hp.beam_size, axis=0)
            scores = None
            if "scores" in result:
                scores = np.split(result["scores"], decode_hp.beam_size, axis=0)
            for k, beam in enumerate(output_beams):
                tf.logging.info("BEAM %d:" % k)
                score = scores[k] if scores else None
                _, decoded_outputs, _ = log_decode_results(result["inputs"], beam,
                                                           problem_name, None,
                                                           inputs_vocab, targets_vocab)
                beam_decodes.append(decoded_outputs)
                if decode_hp.write_beam_scores:
                    beam_scores.append(score)
            if decode_hp.write_beam_scores:
                decodes.append("\t".join(
                        ["\t".join([d, "%.2f" % s]) for d, s
                         in zip(beam_decodes, beam_scores)]))
            else:
                decodes.append("\t".join(beam_decodes))
        else:
            _, decoded_outputs, _ = log_decode_results(
                    result["inputs"], result["outputs"], problem_name,
                    None, inputs_vocab, targets_vocab)
            decodes.append(decoded_outputs)

    # Reversing the decoded inputs and outputs because they were reversed in
    # _decode_batch_input_fn
    sorted_inputs.reverse()
    decodes.reverse()
    # If decode_to_file was provided use it as the output filename without change
    # (except for adding shard_id if using more shards for decoding).
    # Otherwise, use the input filename plus model, hp, problem, beam, alpha.
    decode_filename = decode_to_file if decode_to_file else filename
    if decode_hp.shards > 1:
        decode_filename += "%.2d" % decode_hp.shard_id
    if not decode_to_file:
        decode_filename = _decode_filename(decode_filename, problem_name, decode_hp)
    tf.logging.info("Writing decodes into %s" % decode_filename)
    outfile = tf.gfile.Open(decode_filename, "w")
    for index in range(len(sorted_inputs)):
        outfile.write("%s%s" % (decodes[sorted_keys[index]], decode_hp.delimiter))
    outfile.flush()
    outfile.close()
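
The double reversal above exists because, in this older decoding API,
_decode_batch_input_fn reverses sorted_inputs in place before batching, so the
decodes come back in reverse of the sorted order. A toy illustration of the
bookkeeping (values hypothetical):

sorted_inputs = ["s0", "s1", "s2"]                # length-sorted order
sorted_inputs.reverse()                           # done by the batch generator
decodes = ["DEC(%s)" % s for s in sorted_inputs]  # emitted in reversed order
sorted_inputs.reverse()                           # undo, as in the code above
decodes.reverse()
assert decodes == ["DEC(s0)", "DEC(s1)", "DEC(s2)"]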
Example 5
def decode_from_file_fn(estimator,
                        filename,
                        hparams,
                        decode_hp,
                        decode_to_file=None,
                        checkpoint_path=None):
  """Compute predictions on entries in filename and write them out."""
  if not decode_hp.batch_size:
    decode_hp.batch_size = 32
    tf.logging.info(
        "decode_hp.batch_size not specified; default=%d" % decode_hp.batch_size)

  # Inputs vocabulary is set to targets if there are no inputs in the problem,
  # e.g., for language models where the inputs are just a prefix of targets.
  p_hp = hparams.problem_hparams
  has_input = "inputs" in p_hp.vocabulary
  inputs_vocab_key = "inputs" if has_input else "targets"
  inputs_vocab = p_hp.vocabulary[inputs_vocab_key]
  targets_vocab = p_hp.vocabulary["targets"]
  problem_name = FLAGS.problem
  filename = decoding._add_shard_to_filename(filename, decode_hp)
  tf.logging.info("Performing decoding from file (%s)." % filename)
  if has_input:
    sorted_inputs, sorted_keys = decoding._get_sorted_inputs(
        filename, decode_hp.delimiter)
  else:
    sorted_inputs = decoding._get_language_modeling_inputs(
        filename, decode_hp.delimiter, repeat=decode_hp.num_decodes)
    sorted_keys = range(len(sorted_inputs))
  num_sentences = len(sorted_inputs)
  num_decode_batches = (num_sentences - 1) // decode_hp.batch_size + 1

  if estimator.config.use_tpu:
    length = getattr(hparams, "length", 0) or hparams.max_length
    batch_ids = []
    for line in sorted_inputs:
      if has_input:
        ids = inputs_vocab.encode(line.strip()) + [1]
      else:
        ids = targets_vocab.encode(line)
      if len(ids) < length:
        ids.extend([0] * (length - len(ids)))
      else:
        ids = ids[:length]
      batch_ids.append(ids)
    np_ids = np.array(batch_ids, dtype=np.int32)
    def input_fn(params):
      batch_size = params["batch_size"]
      dataset = tf.data.Dataset.from_tensor_slices({"inputs": np_ids})
      dataset = dataset.map(
          lambda ex: {"inputs": tf.reshape(ex["inputs"], (length, 1, 1))})
      dataset = dataset.batch(batch_size)
      return dataset
  else:
    def input_fn():
      input_gen = decoding._decode_batch_input_fn(
          num_decode_batches, sorted_inputs,
          inputs_vocab, decode_hp.batch_size,
          decode_hp.max_input_size,
          task_id=decode_hp.multiproblem_task_id, has_input=has_input)
      gen_fn = decoding.make_input_fn_from_generator(input_gen)
      example = gen_fn()
      return decoding._decode_input_tensor_to_features_dict(example, hparams, decode_hp)
  decodes = []
  result_iter = estimator.predict(input_fn, checkpoint_path=checkpoint_path)

  start_time = time.time()
  total_time_per_step = 0
  total_cnt = 0

  def timer(gen):
    while True:
      try:
        start_time = time.time()
        item = next(gen)
        elapsed_time = time.time() - start_time
        yield elapsed_time, item
      except StopIteration:
        break

  for elapsed_time, result in timer(result_iter):
    if decode_hp.return_beams:
      beam_decodes = []
      beam_scores = []
      output_beams = np.split(result["outputs"], decode_hp.beam_size, axis=0)
      scores = None
      if "scores" in result:
        if np.isscalar(result["scores"]):
          result["scores"] = result["scores"].reshape(1)
        scores = np.split(result["scores"], decode_hp.beam_size, axis=0)
      for k, beam in enumerate(output_beams):
        tf.logging.info("BEAM %d:" % k)
        score = scores[k] if scores else None
        _, decoded_outputs, _ = decoding.log_decode_results(
            result["inputs"],
            beam,
            problem_name,
            None,
            inputs_vocab,
            targets_vocab,
            log_results=decode_hp.log_results,
            skip_eos_postprocess=decode_hp.skip_eos_postprocess)
        beam_decodes.append(decoded_outputs)
        if decode_hp.write_beam_scores:
          beam_scores.append(score)
      if decode_hp.write_beam_scores:
        decodes.append("\t".join([
            "\t".join([d, "%.2f" % s])
            for d, s in zip(beam_decodes, beam_scores)
        ]))
      else:
        decodes.append("\t".join(beam_decodes))
    else:
      _, decoded_outputs, _ = decoding.log_decode_results(
          result["inputs"],
          result["outputs"],
          problem_name,
          None,
          inputs_vocab,
          targets_vocab,
          log_results=decode_hp.log_results,
          skip_eos_postprocess=decode_hp.skip_eos_postprocess)
      decodes.append(decoded_outputs)
    total_time_per_step += elapsed_time
    total_cnt += result["outputs"].shape[-1]
  duration = time.time() - start_time
  tf.logging.info("Elapsed Time: %5.5f" % duration)
  if total_cnt:
    tf.logging.info("Averaged Single Token Generation Time: %5.7f "
                    "(time %5.7f count %d)" %
                    (total_time_per_step / total_cnt,
                     total_time_per_step, total_cnt))
  if decode_hp.batch_size == 1:
    tf.logging.info("Inference time %.4f seconds "
                    "(Latency = %.4f ms/setences)" %
                    (duration, 1000.0*duration/num_sentences))
  else:
    tf.logging.info("Inference time %.4f seconds "
                    "(Throughput = %.4f sentences/second)" %
                    (duration, num_sentences/duration))

  # If decode_to_file was provided use it as the output filename without change
  # (except for adding shard_id if using more shards for decoding).
  # Otherwise, use the input filename plus model, hp, problem, beam, alpha.
  decode_filename = decode_to_file if decode_to_file else filename
  if not decode_to_file:
    decode_filename = decoding._decode_filename(decode_filename, problem_name, decode_hp)
  else:
    decode_filename = decoding._add_shard_to_filename(decode_filename, decode_hp)
  tf.logging.info("Writing decodes into %s" % decode_filename)
  outfile = tf.gfile.Open(decode_filename, "w")
  # Replace control characters that would corrupt the line-delimited output.
  special_chars = ["\a", "\n", "\f", "\r", "\b"]
  for index in range(len(sorted_inputs)):
    output = decodes[sorted_keys[index]]
    for c in special_chars:
      output = output.replace(c, ' ')
    try:
      outfile.write("%s%s" % (output, decode_hp.delimiter))
    except Exception:  # e.g., an encoding error; write an empty line instead.
      outfile.write("%s" % decode_hp.delimiter)
  outfile.flush()
  outfile.close()
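
A hedged usage sketch: assuming estimator, hparams, and decode_hp have been
built by the surrounding trainer code (none of the values below come from the
original source):

decode_from_file_fn(
    estimator,
    filename="/tmp/inputs.txt",         # illustrative path
    hparams=hparams,
    decode_hp=decode_hp,
    decode_to_file="/tmp/outputs.txt",  # illustrative path
    checkpoint_path=None)               # None: use the latest checkpoint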