def testGetOrGenerateTxtVocab(self):
    """get_or_generate_txt_vocab writes the vocab file once and reuses it."""
    data_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    test_file = os.path.join(self.get_temp_dir(), "test.txt")
    vocab_path = os.path.join(data_dir, "test.voc")

    # Write a small corpus to build the vocab from.
    with tf.gfile.Open(test_file, "w") as outfile:
        outfile.write("a b c\n")
        outfile.write("d e f\n")

    # First call generates the vocab file on disk.
    first_vocab = generator_utils.get_or_generate_txt_vocab(
        data_dir, "test.voc", 20, test_file)
    self.assertTrue(tf.gfile.Exists(vocab_path))
    self.assertIsNotNone(first_vocab)

    # Grow the corpus; if the vocab were rebuilt it would now differ.
    with tf.gfile.Open(test_file, "a") as outfile:
        outfile.write("g h i\n")
    second_vocab = generator_utils.get_or_generate_txt_vocab(
        data_dir, "test.voc", 20, test_file)
    self.assertTrue(tf.gfile.Exists(vocab_path))
    self.assertIsNotNone(second_vocab)
    # Second call must have read the existing file, not regenerated it.
    self.assertEqual(first_vocab.dump(), second_vocab.dump())
# Ejemplo n.º 2
# 0
    def testGetOrGenerateTxtVocab(self):
        """Vocab generation is idempotent: a later call reads the saved file."""
        data_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
        test_file = os.path.join(self.get_temp_dir(), "test.txt")

        # Seed corpus for the initial vocab build.
        with tf.gfile.Open(test_file, "w") as corpus:
            corpus.write("a b c\n")
            corpus.write("d e f\n")

        vocab_before = generator_utils.get_or_generate_txt_vocab(
            data_dir, "test.voc", 20, test_file)
        self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc")))
        self.assertIsNotNone(vocab_before)

        # Appending new tokens would change a freshly built vocab, so equality
        # below proves the second call loaded the cached file instead.
        with tf.gfile.Open(test_file, "a") as corpus:
            corpus.write("g h i\n")

        vocab_after = generator_utils.get_or_generate_txt_vocab(
            data_dir, "test.voc", 20, test_file)
        self.assertTrue(tf.gfile.Exists(os.path.join(data_dir, "test.voc")))
        self.assertIsNotNone(vocab_after)
        self.assertEqual(vocab_before.dump(), vocab_after.dump())
# Ejemplo n.º 3
# 0
 def generator(self, data_dir, tmp_dir, train):
     """Yield WMT En-De token examples, sharing one vocab across splits.

     Args:
         data_dir: directory where the vocab file is stored or created.
         tmp_dir: unused here; kept for the standard generator signature.
         train: if True, read the tokenized training files, else the dev files.

     Returns:
         The generator produced by token_generator over the chosen split.
     """
     # Vocab is always built from the training files so train/dev share it.
     symbolizer_vocab = generator_utils.get_or_generate_txt_vocab(
         data_dir,
         self.vocab_file,
         self.targeted_vocab_size,
         filepatterns=[ENDE_TRAIN_TOK_SRC, ENDE_TRAIN_TOK_TRG])
     if train:
         # BUG FIX: was `ENDE_TRAIN_TOK_SR`, an undefined name (NameError on
         # the train path); the source-file constant is ENDE_TRAIN_TOK_SRC,
         # matching its use in the vocab filepatterns above.
         data_src = ENDE_TRAIN_TOK_SRC
         data_trg = ENDE_TRAIN_TOK_TRG
     else:
         data_src = ENDE_DEV_TOK_SRC
         data_trg = ENDE_DEV_TOK_TRG
     return token_generator(data_src, data_trg, symbolizer_vocab, EOS)
    def generator(self, data_dir, tmp_dir, train):
        """Yield bi-vocab token examples for the En-Kr subtitle datasets."""
        if train:
            dataset_specs = _ENKR_SUBTITLE_TRAIN_DATASETS
        else:
            dataset_specs = _ENKR_SUBTITLE_TEST_DATASETS

        # Each spec's second element holds (source_filename, target_filename)
        # — presumably relative to data_dir; TODO confirm against the specs.
        src_files = []
        trg_files = []
        for spec in dataset_specs:
            src_files.append(os.path.join(data_dir, spec[1][0]))
            trg_files.append(os.path.join(data_dir, spec[1][1]))

        # Separate vocabularies for the source and target languages.
        source_vocab = generator_utils.get_or_generate_txt_vocab(
            data_dir, self.source_vocab_name, self.targeted_vocab_size,
            src_files)
        target_vocab = generator_utils.get_or_generate_txt_vocab(
            data_dir, self.target_vocab_name, self.targeted_vocab_size,
            trg_files)

        if train:
            tag = "train"
        else:
            tag = "dev"
        data_path = compile_data_from_txt(tmp_dir, dataset_specs,
                                          "zero_shot_enkrch_tok_%s" % tag)

        return translate.bi_vocabs_token_generator(data_path + ".lang1",
                                                   data_path + ".lang2",
                                                   source_vocab, target_vocab,
                                                   EOS)