Example #1
def _split_sentences(s1, s2):
  s1 = text_encoder.native_to_unicode(s1)
  s2 = text_encoder.native_to_unicode(s2)
  s1 = re.sub(r'(\w[A-Z]|[0-9a-z])([.!?]) ([A-Z])', r'\1\2__|__\3', s1)
  s2 = re.sub(r'([^0-9][.!?]) ([A-Z])', r'\1__|__\2', s2)
  s1_subsentences = s1.split('__|__')
  s2_subsentences = s2.split('__|__')
  return s1_subsentences, s2_subsentences
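A minimal standalone check of the first pattern above, using plain re and a made-up sentence (no tensor2tensor required). The alternation (\w[A-Z]|[0-9a-z]) only matches when the character before the terminal punctuation is a lowercase letter, a digit, or the second letter of a capitalized pair, so single-capital initials such as "J." are left unsplit:

import re

s1 = "We met J. Smith at NASA. The visit went well! It was short."
marked = re.sub(r'(\w[A-Z]|[0-9a-z])([.!?]) ([A-Z])', r'\1\2__|__\3', s1)
print(marked.split('__|__'))
# ['We met J. Smith at NASA.', 'The visit went well!', 'It was short.']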
Example #2
def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
  """Compute BLEU for two files (reference and hypothesis translation)."""
  ref_lines = text_encoder.native_to_unicode(
      tf.gfile.Open(ref_filename, "r").read()).splitlines()
  hyp_lines = text_encoder.native_to_unicode(
      tf.gfile.Open(hyp_filename, "r").read()).splitlines()
  assert len(ref_lines) == len(hyp_lines)
  if not case_sensitive:
    ref_lines = [x.lower() for x in ref_lines]
    hyp_lines = [x.lower() for x in hyp_lines]
  ref_tokens = [bleu_tokenize(x) for x in ref_lines]
  hyp_tokens = [bleu_tokenize(x) for x in hyp_lines]
  return compute_bleu(ref_tokens, hyp_tokens)
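For context, a hedged usage sketch of the helpers this wrapper builds on, assuming they live in tensor2tensor.utils.bleu_hook as in upstream Tensor2Tensor; the sentences are made up:

from tensor2tensor.utils import bleu_hook

ref_lines = ["the cat sat on the mat", "hello world"]
hyp_lines = ["the cat is on the mat", "hello world"]
# Same pipeline as bleu_wrapper, but on in-memory strings instead of files.
ref_tokens = [bleu_hook.bleu_tokenize(x.lower()) for x in ref_lines]
hyp_tokens = [bleu_hook.bleu_tokenize(x.lower()) for x in hyp_lines]
print(bleu_hook.compute_bleu(ref_tokens, hyp_tokens))  # a float between 0 and 1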
Example #3
def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
    """Compute BLEU for two files (reference and hypothesis translation)."""
    ref_lines = text_encoder.native_to_unicode(
        tf.gfile.Open(ref_filename, "r").read()).splitlines()
    hyp_lines = text_encoder.native_to_unicode(
        tf.gfile.Open(hyp_filename, "r").read()).splitlines()
    assert len(ref_lines) == len(hyp_lines)
    if not case_sensitive:
        ref_lines = [x.lower() for x in ref_lines]
        hyp_lines = [x.lower() for x in hyp_lines]
    ref_tokens = [bleu_tokenize(x) for x in ref_lines]
    hyp_tokens = [bleu_tokenize(x) for x in hyp_lines]
    return compute_bleu(ref_tokens, hyp_tokens)
Example #4
    def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False):
        if self.vocab_type != VocabType.SUBWORD:
            raise ValueError('Unsupported VocabType: %s' % self.vocab_type)

        vocab_filepath = os.path.join(data_dir, self.vocab_filename)

        if force_get or tf.gfile.Exists(vocab_filepath):
            tf.logging.info('Found vocab file: %s', vocab_filepath)
            return ModernMTSubwordTextEncoder(vocab_filepath)

        # Vocabulary file does not exist: generate vocabulary
        # --------------------------------------------------------------------------------------------------------------

        # Load token counts file if present (or generate if missing)
        tokens_filepath = os.path.join(tmp_dir, 'token_counts.dict')

        if tf.gfile.Exists(tokens_filepath):
            tf.logging.info('Found token counts file: %s', tokens_filepath)
            token_counts = self._load_token_counts(tokens_filepath)
        else:
            tf.logging.info('Generating token counts file: %s',
                            tokens_filepath)
            token_counts = defaultdict(int)

            for item in self.generate_text_for_vocab(data_dir, tmp_dir):
                for tok in text_encoder.native_to_unicode(item).split(u' '):
                    token_counts[tok] += 1

            self._save_token_counts(token_counts, tokens_filepath)

        # Build subword
        builder = SubwordTextEncoderBuilder(
            self.approx_vocab_size, custom_tokens=self._make_reserved_tokens())
        return builder.build(token_counts, vocab_filepath)
Example #5
def configure_job():
  """Construct jobSpec for ML Engine job."""
  # See documentation:
  # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput
  training_input = {
      "pythonModule": "tensor2tensor.bin.t2t_trainer",
      "args": flags_as_args(),
      "region": text_encoder.native_to_unicode(cloud.default_region()),
      "runtimeVersion": RUNTIME_VERSION,
      "pythonVersion": "3.5" if sys.version_info.major == 3 else "2.7",
      "jobDir": FLAGS.output_dir,
      "scaleTier": "CUSTOM",
      "masterType": FLAGS.cloud_mlengine_master_type or get_default_master_type(
          num_gpus=FLAGS.worker_gpu)
  }
  if FLAGS.use_tpu:
    training_input["masterType"] = (FLAGS.cloud_mlengine_master_type or
                                    "standard")
    training_input["workerType"] = "cloud_tpu"
    training_input["workerCount"] = 1
  if FLAGS.hparams_range:
    tf.logging.info("Configuring hyperparameter tuning.")
    training_input["hyperparameters"] = configure_autotune(
        FLAGS.hparams_range,
        FLAGS.autotune_objective,
        FLAGS.autotune_maximize,
        FLAGS.autotune_max_trials,
        FLAGS.autotune_parallel_trials,
    )

  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  job_name = "%s_%s_t2t_%s" % (FLAGS.model, FLAGS.problem, timestamp)
  job_spec = {"jobId": job_name, "trainingInput": training_input}
  return job_spec
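For reference, the resulting jobSpec has roughly the shape below; every value is an illustrative placeholder, not something the code above computes:

job_spec = {
    "jobId": "transformer_translate_ende_wmt32k_t2t_20240101_120000",
    "trainingInput": {
        "pythonModule": "tensor2tensor.bin.t2t_trainer",
        "args": ["--model=transformer", "--problem=translate_ende_wmt32k"],
        "region": "us-central1",
        "runtimeVersion": "1.15",   # whatever RUNTIME_VERSION is set to
        "pythonVersion": "3.5",
        "jobDir": "gs://my-bucket/output",
        "scaleTier": "CUSTOM",
        "masterType": "standard_p100",
    },
}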
Example #6
def _get_or_build_subword_text_encoder(tmp_dir):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
  Returns:
    a SubwordTextEncoder.
  """
    filepath = os.path.join(tmp_dir, "lm1b_32k.subword_text_encoder")
    if tf.gfile.Exists(filepath):
        return text_encoder.SubwordTextEncoder(filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
    ret.store_to_file(filepath)
    return ret
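The same count-then-build flow in miniature, without the corpus download; a sketch that assumes tensor2tensor is installed and uses a made-up two-line corpus:

from collections import defaultdict

from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import tokenizer

token_counts = defaultdict(int)
for line in ["the quick brown fox", "the lazy dog"]:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(line)):
        token_counts[tok] += 1

enc = text_encoder.SubwordTextEncoder()
enc.build_from_token_counts(token_counts, min_count=1)
print(enc.decode(enc.encode("the quick dog")))  # round-trips to "the quick dog"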
Example #7
def generator(tmp_dir, train, characters=False):
    """Generator for lm1b sentences.

  Args:
    tmp_dir: a string.
    train: a boolean.
    characters: a boolean

  Yields:
    A dictionary {"inputs": [0], "targets": [<subword ids>]}
  """
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    files = (_train_data_filenames(tmp_dir)
             if train else [_dev_data_filename(tmp_dir)])
    if characters:
        encoder = text_encoder.ByteTextEncoder()
    else:
        encoder = _get_or_build_subword_text_encoder(tmp_dir)
    for filepath in files:
        tf.logging.info("filepath = %s", filepath)
        for line in tf.gfile.Open(filepath):
            tokens = encoder.encode(
                _replace_oov(original_vocab,
                             text_encoder.native_to_unicode(line)))
            tokens.append(EOS)
            yield {"inputs": [0], "targets": tokens}
Example #8
def rank_reference_paragraphs(wiki_title, references_content, normalize=True):
  """Rank and return reference paragraphs by tf-idf score on title tokens."""
  normalized_title = _normalize_text(wiki_title)
  title_tokens = _tokens_to_score(
      set(tokenizer.encode(text_encoder.native_to_unicode(normalized_title))))
  ref_paragraph_info = []
  doc_counts = collections.defaultdict(int)
  for ref in references_content:
    for paragraph in ref.split("\n"):
      normalized_paragraph = _normalize_text(paragraph)
      if cc_utils.filter_paragraph(normalized_paragraph):
        # Skip paragraph
        continue
      counts = _token_counts(normalized_paragraph, title_tokens)
      for token in title_tokens:
        if counts[token]:
          doc_counts[token] += 1
      content = normalized_paragraph if normalize else paragraph
      info = {"content": content, "counts": counts}
      ref_paragraph_info.append(info)

  for info in ref_paragraph_info:
    score = 0.
    for token in title_tokens:
      term_frequency = info["counts"][token]
      inv_doc_frequency = (
          float(len(ref_paragraph_info)) / max(doc_counts[token], 1))
      score += term_frequency * math.log(inv_doc_frequency)
    info["score"] = score

  ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
  return [info["content"] for info in ref_paragraph_info]
Example #9
def _rank_reference_paragraphs(wiki_title, references_content):
    """Rank and return reference paragraphs by tf-idf score on title tokens."""
    title_tokens = _tokens_to_score(
        set(tokenizer.encode(text_encoder.native_to_unicode(wiki_title))))
    ref_paragraph_info = []
    doc_counts = collections.defaultdict(int)
    for ref in references_content:
        for paragraph in ref.split("\n"):
            paragraph = _normalize_text(paragraph)
            if cc_utils.filter_paragraph(paragraph):
                # Skip paragraph
                continue
            counts = _token_counts(paragraph, title_tokens)
            for token in title_tokens:
                if counts[token]:
                    doc_counts[token] += 1
            info = {"content": paragraph, "counts": counts}
            ref_paragraph_info.append(info)

    for info in ref_paragraph_info:
        score = 0.
        for token in title_tokens:
            term_frequency = info["counts"][token]
            inv_doc_frequency = (float(len(ref_paragraph_info)) /
                                 max(doc_counts[token], 1))
            score += term_frequency * math.log(inv_doc_frequency)
        info["score"] = score

    ref_paragraph_info.sort(key=lambda el: el["score"], reverse=True)
    return [info["content"] for info in ref_paragraph_info]
Example #10
def configure_job():
  """Construct jobSpec for ML Engine job."""
  # See documentation:
  # https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput
  training_input = {
      "pythonModule": "tensor2tensor.bin.t2t_trainer",
      "args": flags_as_args(),
      "region": text_encoder.native_to_unicode(default_region()),
      "runtimeVersion": RUNTIME_VERSION,
      "pythonVersion": "3.5" if sys.version_info.major == 3 else "2.7",
      "jobDir": FLAGS.output_dir,
      "scaleTier": "CUSTOM",
      "masterType": FLAGS.cloud_mlengine_master_type or get_default_master_type(
          num_gpus=FLAGS.worker_gpu)
  }
  if FLAGS.use_tpu:
    training_input["masterType"] = (FLAGS.cloud_mlengine_master_type or
                                    "standard")
    training_input["workerType"] = "cloud_tpu"
    training_input["workerCount"] = 1
  if FLAGS.hparams_range:
    tf.logging.info("Configuring hyperparameter tuning.")
    training_input["hyperparameters"] = configure_autotune(
        FLAGS.hparams_range,
        FLAGS.autotune_objective,
        FLAGS.autotune_maximize,
        FLAGS.autotune_max_trials,
        FLAGS.autotune_parallel_trials,
    )

  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
  job_name = "%s_%s_t2t_%s" % (FLAGS.model, FLAGS.problem, timestamp)
  job_spec = {"jobId": job_name, "trainingInput": training_input}
  return job_spec
Example #11
def get_or_generate_vocab_es(tmp_dir, vocab_filename, vocab_size, datasets):
  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  print(vocab_filepath)
  if tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  sources = datasets
  tf.logging.info("Generating vocab from: %s", str(sources))
  token_counts = defaultdict(int)
  for source in sources:
    for lang_file in source[0]:
      tf.logging.info("Reading file: %s" % lang_file)
      filepath = os.path.join(tmp_dir, lang_file)
      print(filepath)

      # Use Tokenizer to count the word occurrences.
      with tf.gfile.GFile(filepath, mode="r") as source_file:
        file_byte_budget = 3.5e5 if "en" in filepath else 7e5
        for line in source_file:
          if file_byte_budget <= 0:
            break
          line = line.strip()
          file_byte_budget -= len(line)
          for tok in tokenizer.encode(text_encoder.native_to_unicode(line)):
            token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)
  vocab.store_to_file(vocab_filepath)
  return vocab
Example #12
  def generator(self, data_dir, tmp_dir, is_training):
    """Generator for lm1b sentences.

    Args:
      data_dir: data dir.
      tmp_dir: tmp dir.
      is_training: a boolean.

    Yields:
      A dictionary {"inputs": [0], "targets": [<subword ids>]}
    """
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    files = (_train_data_filenames(tmp_dir)
             if is_training else [_dev_data_filename(tmp_dir)])
    if self.is_character_level:
      encoder = text_encoder.ByteTextEncoder()
    else:
      vocab_filepath = os.path.join(data_dir, self.vocab_file)
      encoder = _get_or_build_subword_text_encoder(
          tmp_dir, vocab_filepath, self.targeted_vocab_size)
    for filepath in files:
      tf.logging.info("filepath = %s", filepath)
      for line in tf.gfile.Open(filepath):
        tokens = encoder.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        tokens.append(EOS)
        yield {"inputs": [0], "targets": tokens}
Example #13
    def generator(self, data_dir, tmp_dir, is_training):
        """Generator for lm1b sentences.

    Args:
      data_dir: data dir.
      tmp_dir: tmp dir.
      is_training: a boolean.

    Yields:
      A dictionary {"inputs": [0], "targets": [<subword ids>]}
    """
        _maybe_download_corpus(tmp_dir)
        original_vocab = _original_vocab(tmp_dir)
        files = (_train_data_filenames(tmp_dir)
                 if is_training else [_dev_data_filename(tmp_dir)])
        if self.is_character_level:
            encoder = text_encoder.ByteTextEncoder()
        else:
            vocab_filepath = os.path.join(data_dir, self.vocab_file)
            encoder = _get_or_build_subword_text_encoder(
                tmp_dir, vocab_filepath, self.targeted_vocab_size)
        for filepath in files:
            tf.logging.info("filepath = %s", filepath)
            for line in tf.gfile.Open(filepath):
                tokens = encoder.encode(
                    _replace_oov(original_vocab,
                                 text_encoder.native_to_unicode(line)))
                tokens.append(EOS)
                yield {"inputs": [0], "targets": tokens}
Example #14
def _token_counts(text, token_set=None):
    counts = collections.defaultdict(int)
    for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
        if token_set and token not in token_set:
            continue
        counts[token] += 1
    return counts
Example #15
def _token_counts(text, token_set=None):
  counts = collections.defaultdict(int)
  for token in tokenizer.encode(text_encoder.native_to_unicode(text)):
    if token_set and token not in token_set:
      continue
    counts[token] += 1
  return counts
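A quick usage sketch of _token_counts as defined above, assuming the same tensor2tensor imports used throughout this page; the text and token set are made up:

title_tokens = {"telescope", "space"}
counts = _token_counts("The space telescope saw deep space", title_tokens)
print(counts["space"], counts["telescope"], counts["deep"])  # 2 1 0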
Example #16
def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath, target_size):
    """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
    if tf.gfile.Exists(vocab_filepath):
        return text_encoder.SubwordTextEncoder(vocab_filepath)
    _maybe_download_corpus(tmp_dir)
    original_vocab = _original_vocab(tmp_dir)
    token_counts = defaultdict(int)
    line_count = 0
    max_lines = 63000
    for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
        tokens = tokenizer.encode(
            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
        for tok in tokens:
            token_counts[tok] += 1
        line_count += 1
        if line_count >= max_lines:
            break
    if target_size == 2**15:
        # legacy behavior
        ret = text_encoder.SubwordTextEncoder()
        ret.build_from_token_counts(token_counts, min_count=5)
    else:
        ret = text_encoder.SubwordTextEncoder.build_to_target_size(
            target_size, token_counts, 1, 1000)
    ret.store_to_file(vocab_filepath)
    return ret
Example #17
    def encode_with_indexes(self, raw_text):
        tokens = text_encoder.native_to_unicode(raw_text).split(u' ')
        subtokens = self._tokens_to_subtoken_strings(tokens)
        subtoken_ids = [
            self._subtoken_string_to_id[subtoken] for subtoken in subtokens
        ]

        return subtoken_ids, self._get_indexes(subtokens)
Example #18
def generate_bpe_vocab(file_list, targeted_vocab_size):
    token_counts = defaultdict(int)
    for item in generator_fn(file_list):
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        targeted_vocab_size, token_counts, 1, 1e3)
    return vocab
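generate_bpe_vocab above relies on a generator_fn helper that is not part of the snippet; a plausible stand-in, modeled on the fuller variant later on this page, would be:

def generator_fn(file_list):
    # Yield one stripped line of text per line of each input file.
    for filepath in file_list:
        with tf.gfile.GFile(filepath, mode="r") as source_file:
            for line in source_file:
                yield line.strip()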
Example #19
def get_or_generate_vocab(data_dir,
                          tmp_dir,
                          vocab_filename,
                          vocab_size,
                          sources=None):
    """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    sources = sources or _DATA_FILE_URLS
    tf.logging.info("Generating vocab from: %s", str(sources))
    token_counts = defaultdict(int)
    for source in sources:
        url = source[0]
        filename = os.path.basename(url)
        read_type = "r:gz" if "tgz" in filename else "r"

        compressed_file = maybe_download(tmp_dir, filename, url)

        with tarfile.open(compressed_file, read_type) as corpus_tar:
            corpus_tar.extractall(tmp_dir)

        for lang_file in source[1]:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            # For some datasets a second extraction is necessary.
            if ".gz" in lang_file:
                new_filepath = os.path.join(tmp_dir, lang_file[:-3])
                if tf.gfile.Exists(new_filepath):
                    tf.logging.info(
                        "Subdirectory %s already exists, skipping unpacking" %
                        filepath)
                else:
                    tf.logging.info("Unpacking subdirectory %s" % filepath)
                    gunzip_file(filepath, new_filepath)
                filepath = new_filepath

            # Use Tokenizer to count the word occurrences.
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                file_byte_budget = 3.5e5 if "en" in filepath else 7e5
                for line in source_file:
                    if file_byte_budget <= 0:
                        break
                    line = line.strip()
                    file_byte_budget -= len(line)
                    for tok in tokenizer.encode(
                            text_encoder.native_to_unicode(line)):
                        token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
Example #20
def launch_job(job_spec):
  """Launch job on ML Engine."""
  project_id = "projects/{}".format(
      text_encoder.native_to_unicode(default_project()))
  credentials = GoogleCredentials.get_application_default()
  cloudml = discovery.build("ml", "v1", credentials=credentials,
                            cache_discovery=False)
  request = cloudml.projects().jobs().create(body=job_spec, parent=project_id)
  request.execute()
Example #21
def launch_job(job_spec):
  """Launch job on ML Engine."""
  project_id = "projects/{}".format(
      text_encoder.native_to_unicode(cloud.default_project()))
  credentials = GoogleCredentials.get_application_default()
  cloudml = discovery.build("ml", "v1", credentials=credentials,
                            cache_discovery=False)
  request = cloudml.projects().jobs().create(body=job_spec, parent=project_id)
  request.execute()
Example #22
    def _load_token_counts(filepath):
        token_counts = {}

        with tf.gfile.GFile(filepath, mode='rb') as tokens_file:
            for line in tokens_file:
                line = text_encoder.native_to_unicode(line.rstrip('\n'))
                count, token = line.split(u' ', 1)

                token_counts[token] = int(count)

        return token_counts
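The matching writer is not shown on this page; a hypothetical _save_token_counts producing the "count token" lines read above could look like this (same TF 1.x tf.gfile API as the other snippets):

def _save_token_counts(token_counts, filepath):
    # One "count token" pair per line, mirroring _load_token_counts above.
    with tf.gfile.GFile(filepath, mode='w') as tokens_file:
        for token, count in token_counts.items():
            tokens_file.write('%d %s\n' % (count, token))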
Example #23
def _normalize_string(raw_str):
  """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
   A string which is ready to be tokenized using split()
  """
  return ' '.join(
      token.strip()
      for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
Example #24
def _normalize_string(raw_str):
    """Normalizes the string using tokenizer.encode.

  Args:
    raw_str: the input string

  Returns:
   A string which is ready to be tokenized using split()
  """
    return " ".join(
        token.strip()
        for token in tokenizer.encode(text_encoder.native_to_unicode(raw_str)))
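A quick check of what this normalization produces, assuming tensor2tensor is installed; the example sentence comes from the tokenizer's docstring, and the exact splits can vary by version:

from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import tokenizer

raw = "Dude - that's so cool."
print(" ".join(tok.strip()
               for tok in tokenizer.encode(text_encoder.native_to_unicode(raw))))
# Dude - that ' s so cool .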
Example #25
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   del data_dir
   split_files = {
       problem.DatasetSplit.TRAIN: _train_data_filenames(tmp_dir),
       problem.DatasetSplit.EVAL: _dev_data_filenames(tmp_dir),
   }
   _maybe_download_corpus(tmp_dir)
   original_vocab = _original_vocab(tmp_dir)
   files = split_files[dataset_split]
   for filepath in files:
     tf.logging.info("filepath = %s", filepath)
     for line in tf.gfile.Open(filepath):
       txt = _replace_oov(original_vocab, text_encoder.native_to_unicode(line))
       yield {"targets": txt}
Example #26
 def generate_samples(self, data_dir, tmp_dir, dataset_split):
   del data_dir
   split_files = {
       problem.DatasetSplit.TRAIN: _train_data_filenames(tmp_dir),
       problem.DatasetSplit.EVAL: _dev_data_filenames(tmp_dir),
   }
   _maybe_download_corpus(tmp_dir)
   original_vocab = _original_vocab(tmp_dir)
   files = split_files[dataset_split]
   for filepath in files:
     tf.logging.info("filepath = %s", filepath)
     for line in tf.gfile.Open(filepath):
       txt = _replace_oov(original_vocab, text_encoder.native_to_unicode(line))
       yield {"targets": txt}
Example #27
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator_fn):
    """Inner implementation for vocab generators."""
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    token_counts = defaultdict(int)
    for item in generator_fn():
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
Example #28
def generate_bpe_vocab(file_list, targeted_vocab_size):
    def generator_fn():
        for filepath in file_list:
            with tf.gfile.GFile(filepath, mode="r") as source_file:
                #file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
                for line in source_file:
                    #if file_byte_budget <= 0:
                    #    break
                    line = line.strip()
                    #file_byte_budget -= len(line)
                    yield line

    token_counts = defaultdict(int)
    for item in generator_fn():
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1
    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        targeted_vocab_size, token_counts, 1, 1e3)

    return vocab
Example #29
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url)
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set(
      [text_encoder.native_to_unicode(l.strip()) for l in
       tf.gfile.Open(vocab_filepath)])
Example #30
def get_or_generate_tabbed_vocab(data_dir, tmp_dir, source_filename, index,
                                 vocab_filename, vocab_size):
    r"""Generate a vocabulary from a tabbed source file.

  The source is a file of source, target pairs, where each line contains
  a source string and a target string, separated by a tab ('\t') character.
  The index parameter specifies 0 for the source or 1 for the target.

  Args:
    data_dir: path to the data directory.
    tmp_dir: path to the temporary directory.
    source_filename: the name of the tab-separated source file.
    index: index.
    vocab_filename: the name of the vocabulary file.
    vocab_size: vocabulary size.

  Returns:
    The vocabulary.
  """
    vocab_filepath = os.path.join(data_dir, vocab_filename)
    if os.path.exists(vocab_filepath):
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    # Use Tokenizer to count the word occurrences.
    token_counts = defaultdict(int)
    filepath = os.path.join(tmp_dir, source_filename)
    with tf.gfile.GFile(filepath, mode="r") as source_file:
        for line in source_file:
            line = line.strip()
            if line and "\t" in line:
                parts = line.split("\t", maxsplit=1)
                part = parts[index].strip()
                for tok in tokenizer.encode(
                        text_encoder.native_to_unicode(part)):
                    token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)
    vocab.store_to_file(vocab_filepath)
    return vocab
Example #31
def _original_vocab(tmp_dir):
  """Returns a set containing the original vocabulary.

  This is important for comparing with published results.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    a set of strings
  """
  vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
               "vocab-2016-09-10.txt")
  vocab_filename = os.path.basename(vocab_url + ".en")
  vocab_filepath = os.path.join(tmp_dir, vocab_filename)
  if not os.path.exists(vocab_filepath):
    generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
  return set([
      text_encoder.native_to_unicode(l.strip())
      for l in tf.gfile.Open(vocab_filepath)
  ])
Example #32
def tar_and_copy_t2t(train_dir):
  """Tar Tensor2Tensor and cp to train_dir."""
  tf.logging.info("Tarring and pushing local Tensor2Tensor package.")

  output = text_encoder.native_to_unicode(shell_output(
      "pip show tensor2tensor")).split("\n")
  assert output[1].startswith("Version")
  assert output[7].startswith("Location")
  t2t_version = output[1].split(":")[1].strip()
  t2t_dir = output[7].split(":")[1].strip()

  # A local installation cloned from GitHub will have a setup.py file and a docs
  # folder
  is_local_t2t = all([
      tf.gfile.Exists(os.path.join(t2t_dir, fname))
      for fname in ["setup.py", "docs/cloud_mlengine.md"]
  ])

  if is_local_t2t:
    tf.logging.info("Found local T2T installation. Tarring directory %s",
                    t2t_dir)
  else:
    # PyPI installation
    # Create a folder with just a setup.py file pointing to the right version
    tf.logging.info("Found PyPI T2T installation. Launching tensor2tensor==%s",
                    t2t_version)
    t2t_dir = os.path.join(tempfile.gettempdir(), "tensor2tensor_tmp")
    shutil.rmtree(t2t_dir, ignore_errors=True)
    os.mkdir(t2t_dir)
    setup_fname = os.path.join(t2t_dir, "setup.py")
    setup_file_str = get_setup_file(
        name="DummyT2TPackage",
        packages=["tensor2tensor==%s" % t2t_version]
    )
    with tf.gfile.Open(setup_fname, "w") as f:
      f.write(setup_file_str)
  t2t_tar = _tar_and_copy(t2t_dir, train_dir)
  return t2t_tar
Example #33
def tar_and_copy_t2t(train_dir):
    """Tar Tensor2Tensor and cp to train_dir."""
    tf.logging.info("Tarring and pushing local Tensor2Tensor package.")

    output = text_encoder.native_to_unicode(
        cloud.shell_output("pip show tensor2tensor")).split("\n")
    assert output[1].startswith("Version")
    assert output[7].startswith("Location")
    t2t_version = output[1].split(":")[1].strip()
    t2t_dir = output[7].split(":")[1].strip()

    # A local installation cloned from GitHub will have a setup.py file and a docs
    # folder
    is_local_t2t = all([
        tf.gfile.Exists(os.path.join(t2t_dir, fname))
        for fname in ["setup.py", "docs/cloud_mlengine.md"]
    ])

    if is_local_t2t:
        tf.logging.info("Found local T2T installation. Tarring directory %s",
                        t2t_dir)
    else:
        # PyPI installation
        # Create a folder with just a setup.py file pointing to the right version
        tf.logging.info(
            "Found PyPI T2T installation. Launching tensor2tensor==%s",
            t2t_version)
        t2t_dir = os.path.join(tempfile.gettempdir(), "tensor2tensor_tmp")
        shutil.rmtree(t2t_dir, ignore_errors=True)
        os.mkdir(t2t_dir)
        setup_fname = os.path.join(t2t_dir, "setup.py")
        setup_file_str = get_setup_file(
            name="DummyT2TPackage",
            packages=["tensor2tensor==%s" % t2t_version])
        with tf.gfile.Open(setup_fname, "w") as f:
            f.write(setup_file_str)
    t2t_tar = _tar_and_copy(t2t_dir, train_dir)
    return t2t_tar
Example #34
def _get_or_build_subword_text_encoder(tmp_dir,
                                       vocab_filepath,
                                       target_size):
  """Builds a SubwordTextEncoder based on the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_filepath: path to store (or load) vocab.
    target_size: an optional integer.

  Returns:
    a SubwordTextEncoder.
  """
  if tf.gfile.Exists(vocab_filepath):
    return text_encoder.SubwordTextEncoder(vocab_filepath)
  _maybe_download_corpus(tmp_dir)
  original_vocab = _original_vocab(tmp_dir)
  token_counts = defaultdict(int)
  line_count = 0
  max_lines = 63000
  for line in tf.gfile.Open(_train_data_filenames(tmp_dir)[0]):
    tokens = tokenizer.encode(
        _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
    for tok in tokens:
      token_counts[tok] += 1
    line_count += 1
    if line_count >= max_lines:
      break
  if target_size == 2 ** 15:
    # legacy behavior
    ret = text_encoder.SubwordTextEncoder()
    ret.build_from_token_counts(token_counts, min_count=5)
  else:
    ret = text_encoder.SubwordTextEncoder.build_to_target_size(
        target_size, token_counts, 1, 1000)
  ret.store_to_file(vocab_filepath)
  return ret
Example #35
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
    """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If None,
        then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = defaultdict(int)
    for item in generator:
        for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
        vocab_size, token_counts, 1, 1e3)

    if vocab_filepath is not None:
        vocab.store_to_file(vocab_filepath)
    return vocab
Example #36
def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                generator):
  """Inner implementation for vocab generators.

  Args:
    data_dir: The base directory where data and vocab files are stored. If None,
        then do not save the vocab even if it doesn't exist.
    vocab_filename: relative filename where vocab file is stored
    vocab_size: target size of the vocabulary constructed by SubwordTextEncoder
    generator: a generator that produces tokens from the vocabulary

  Returns:
    A SubwordTextEncoder vocabulary object.
  """
  if data_dir is None:
    vocab_filepath = None
  else:
    vocab_filepath = os.path.join(data_dir, vocab_filename)

  if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
    tf.logging.info("Found vocab file: %s", vocab_filepath)
    vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
    return vocab

  tf.logging.info("Generating vocab file: %s", vocab_filepath)
  token_counts = defaultdict(int)
  for item in generator:
    for tok in tokenizer.encode(text_encoder.native_to_unicode(item)):
      token_counts[tok] += 1

  vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
      vocab_size, token_counts, 1, 1e3)

  if vocab_filepath is not None:
    vocab.store_to_file(vocab_filepath)
  return vocab
Example #37
 def test_native_to_unicode(self):
     s = r"foo bar"
     self.assertIsInstance(text_encoder.native_to_unicode(s), unicode)
     self.assertEqual(text_encoder.native_to_unicode(s), u"foo bar")
Example #38
 def encode(self, raw_text):
     return self._tokens_to_subtoken_ids(
         text_encoder.native_to_unicode(raw_text).split(u' '))
Example #39
def get_or_generate_vocab(data_dir,
                          tmp_dir,
                          vocab_filename,
                          vocab_size,
                          text_files,
                          mode='subword'):
    """Implementation for vocab generators.
  Args:
    data_dir: The base directory where data and vocab files are stored. If None,
        then do not save the vocab even if it doesn't exist.
    ...
    vocab_filename: relative filename where vocab file is stored
    vocab_size: None is accepted. target size of the vocabulary constructed by TextEncoder
    ...
  Returns:
    A TokenTextEncoder vocabulary object.
  """
    def generate():
        tf.logging.info("Generating vocab from: %s", str(text_files))
        for lang_file in text_files:
            tf.logging.info("Reading file: %s" % lang_file)
            filepath = os.path.join(tmp_dir, lang_file)

            with tf.gfile.GFile(filepath, mode="r") as source_file:
                for line in source_file:
                    line = line.strip()
                    yield line

    def encode(text):
        if mode == 'character':
            return list(text)
        else:
            return tokenizer.encode(text)

    def encoder(vocab_filepath):
        if mode == 'character':
            return CharacterTextEncoder(vocab_filepath, replace_oov="UNK")
        else:
            return text_encoder.SubwordTextEncoder(vocab_filepath)

    def build_and_save_vocab(vocab_filepath, vocab_size, token_counts):
        if mode == 'character':
            with tf.gfile.GFile(vocab_filepath, mode="w") as f:
                word_list = list(
                    map(lambda x: x[0], token_counts.most_common()))
                word_list = ['UNK'] + word_list
                if vocab_size is not None:
                    word_list = word_list[:vocab_size]
                for word in word_list:
                    f.write(word + '\n')

        else:
            text_encoder.SubwordTextEncoder.build_to_target_size(
                vocab_size, token_counts, 1, 1e3).store_to_file(vocab_filepath)

    if data_dir is None:
        vocab_filepath = None
    else:
        vocab_filepath = os.path.join(data_dir, vocab_filename)

    if vocab_filepath is not None and tf.gfile.Exists(vocab_filepath):
        tf.logging.info("Found vocab file: %s", vocab_filepath)
        vocab = encoder(vocab_filepath)
        return vocab

    tf.logging.info("Generating vocab file: %s", vocab_filepath)
    token_counts = Counter()
    for item in generate():
        for tok in encode(text_encoder.native_to_unicode(item)):
            token_counts[tok] += 1

    build_and_save_vocab(vocab_filepath, vocab_size, token_counts)
    vocab = encoder(vocab_filepath)

    return vocab
Example #40
 def test_native_to_unicode(self):
     s = r"foo bar"
     s_unicode = text_encoder.native_to_unicode(s)
     if six.PY2:
         self.assertIsInstance(s_unicode, unicode)
     self.assertEqual(s_unicode, u"foo bar")
Example #41
 def test_native_to_unicode(self):
   s = r"foo bar"
   s_unicode = text_encoder.native_to_unicode(s)
   if six.PY2:
     self.assertIsInstance(s_unicode, unicode)
   self.assertEqual(s_unicode, u"foo bar")
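On Python 3 there is no unicode builtin and str is already unicode, so native_to_unicode is effectively an identity for str inputs; a minimal check, assuming tensor2tensor is installed:

from tensor2tensor.data_generators import text_encoder

s = text_encoder.native_to_unicode("foo bar")
assert isinstance(s, str) and s == "foo bar"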