def _makeToyTaggerData(self):
    data_config = {}
    features_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src.txt"),
        ["M . Smith went to Washington .", "I live in New Zealand ."],
    )
    labels_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "labels.txt"),
        ["B-PER I-PER E-PER O O S-LOC O", "O O O B-LOC E-LOC O"],
    )
    data_config["source_vocabulary"] = test_util.make_vocab_from_file(
        os.path.join(self.get_temp_dir(), "src_vocab.txt"), features_file)
    data_config["target_vocabulary"] = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "labels_vocab.txt"),
        [
            "O",
            "B-LOC",
            "I-LOC",
            "E-LOC",
            "S-LOC",
            "B-PER",
            "I-PER",
            "E-PER",
            "S-PER",
        ],
    )
    return features_file, labels_file, data_config
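# All fixtures here rely on the test_util helpers used above. Their real
# implementation lives elsewhere in the repository; the following is a
# minimal sketch of the behavior assumed in these tests (write one entry
# per line and return the path; derive a vocabulary from a tokenized
# file), not the library's actual code.
def _sketch_make_data_file(path, lines):
    with open(path, "w", encoding="utf-8") as f:
        for line in lines:
            f.write(line + "\n")
    return path

def _sketch_make_vocab_from_file(path, data_file):
    # Collect the unique whitespace-separated tokens of data_file into a
    # vocabulary file, one token per line. The real helper may also insert
    # special tokens such as <blank>, <s>, and </s>.
    tokens = set()
    with open(data_file, encoding="utf-8") as f:
        for line in f:
            tokens.update(line.split())
    with open(path, "w", encoding="utf-8") as f:
        for token in sorted(tokens):
            f.write(token + "\n")
    return path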
def _makeToyEnDeData(self, with_alignments=False):
    data_config = {}
    features_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src.txt"),
        ["Parliament Does Not Support Amendment Freeing Tymoshenko",
         "Today , the Ukraine parliament dismissed , within the Code of Criminal Procedure "
         "amendment , the motion to revoke an article based on which the opposition leader , "
         "Yulia Tymoshenko , was sentenced .",
         "The amendment that would lead to freeing the imprisoned former Prime Minister was "
         "revoked during second reading of the proposal for mitigation of sentences for "
         "economic offences ."])
    labels_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "tgt.txt"),
        ["Keine befreiende Novelle für Tymoshenko durch das Parlament",
         "Das ukrainische Parlament verweigerte heute den Antrag , im Rahmen einer Novelle "
         "des Strafgesetzbuches denjenigen Paragrafen abzuschaffen , auf dessen Grundlage die "
         "Oppositionsführerin Yulia Timoshenko verurteilt worden war .",
         "Die Neuregelung , die den Weg zur Befreiung der inhaftierten Expremierministerin hätte "
         "ebnen können , lehnten die Abgeordneten bei der zweiten Lesung des Antrags auf Milderung "
         "der Strafen für wirtschaftliche Delikte ab ."])
    data_config["source_vocabulary"] = test_util.make_vocab_from_file(
        os.path.join(self.get_temp_dir(), "src_vocab.txt"), features_file)
    data_config["target_vocabulary"] = test_util.make_vocab_from_file(
        os.path.join(self.get_temp_dir(), "tgt_vocab.txt"), labels_file)
    if with_alignments:
        # Dummy and incomplete alignments.
        data_config["train_alignments"] = test_util.make_data_file(
            os.path.join(self.get_temp_dir(), "aligne.txt"),
            ["0-0 1-0 2-2 3-4 4-4 5-6",
             "0-1 1-1 1-3 2-3 4-4",
             "0-0 1-0 2-2 3-4 4-4 5-6"])
    return features_file, labels_file, data_config
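# The alignment lines above use the Pharaoh format: each "i-j" pair aligns
# source token i with target token j. A minimal parser, for illustration
# only (this helper is hypothetical, not part of the library):
def _parse_pharaoh_alignment(line):
    return [tuple(map(int, pair.split("-"))) for pair in line.split()]

# _parse_pharaoh_alignment("0-0 1-0 2-2") == [(0, 0), (1, 0), (2, 2)]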
def testSequenceToSequenceInputter(self):
    source_vocabulary = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src_vocab.txt"),
        ["<blank>", "<s>", "</s>", "a", "b", "c", "d"],
    )
    target_vocabulary = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "tgt_vocab.txt"),
        ["<blank>", "<s>", "</s>", "e", "f", "g", "h"],
    )
    source_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src.txt"), ["a c c", "b d", "a e"])
    target_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "tgt.txt"), ["f h g", "e h", "a e"])
    inputter = sequence_to_sequence.SequenceToSequenceInputter(
        text_inputter.WordEmbedder(embedding_size=20),
        text_inputter.WordEmbedder(embedding_size=20),
    )
    inputter.initialize(
        dict(source_vocabulary=source_vocabulary, target_vocabulary=target_vocabulary))
    dataset = inputter.make_dataset([source_file, target_file])
    element = next(iter(dataset))  # Python 3 iterator protocol, not .next().
    features, labels = inputter.make_features(element)
    self.assertIn("ids_out", labels)
    self.assertAllEqual(labels["ids"], [1, 4, 6, 5])
    self.assertAllEqual(labels["ids_out"], [4, 6, 5, 2])
    self.assertEqual(labels["length"], 4)
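# The assertions above follow from how the target side is shifted for
# teacher forcing. With the target vocabulary <blank>=0, <s>=1, </s>=2,
# e=3, f=4, g=5, h=6, the first target line "f h g" maps to [4, 6, 5];
# the decoder input prepends <s> and the decoder output appends </s>.
# A minimal sketch of that shift (tf is TensorFlow, as elsewhere here):
target = tf.constant([4, 6, 5])             # "f h g"
ids = tf.concat([[1], target], axis=0)      # [1, 4, 6, 5]  (<s> + target)
ids_out = tf.concat([target, [2]], axis=0)  # [4, 6, 5, 2]  (target + </s>)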
def _makeToyClassifierData(self):
    data_config = {}
    features_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src.txt"),
        ["This product was not good at all , it broke on the first use !",
         "Perfect , it does everything I need .",
         "How do I change the battery ?"])
    labels_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "labels.txt"),
        ["negative", "positive", "neutral"])
    data_config["source_vocabulary"] = test_util.make_vocab_from_file(
        os.path.join(self.get_temp_dir(), "src_vocab.txt"), features_file)
    data_config["target_vocabulary"] = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "labels_vocab.txt"),
        ["negative", "positive", "neutral"])
    return features_file, labels_file, data_config
def testDatasetSize(self):
    path = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "file.txt"),
        list(map(str, range(15))))
    dataset = tf.data.TextLineDataset(path)
    size = dataset_util.get_dataset_size(dataset)
    self.assertEqual(self.evaluate(size), 15)
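# For a finite dataset, one way such a size utility can be realized is a
# reduce that adds one per element. A sketch under that assumption (not
# necessarily the library's implementation):
def _sketch_dataset_size(dataset):
    return dataset.reduce(tf.constant(0, dtype=tf.int64), lambda n, _: n + 1)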
def testTrainLanguageModel(self):
    src = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "src.txt"),
        ["1 2 3 4", "5 6 7 8 9", "3 2"])
    vocab = test_util.make_vocab(
        os.path.join(self.get_temp_dir(), "vocab.txt"),
        list(map(str, range(10))))
    config = {
        "data": {
            "train_features_file": src,
            "vocabulary": vocab,
        },
        "params": {
            "learning_rate": 0.0005,
            "optimizer": "Adam",
        },
        "train": {
            "batch_size": 10,
            "max_step": 2,
        },
    }
    model = models.LanguageModel(
        decoders.SelfAttentionDecoder(2, num_units=32, ffn_inner_dim=32),
        embedding_size=16,
        reuse_embedding=False)
    runner = Runner(model, config)
    runner.train()
def _makeTransliterationData(self):
    ar = [
        "آ ت ز م و ن",
        "آ ت ش ي س و ن",
        "آ ر ب ا ك ه",
        "آ ر ث ر",
        "آ ز ا",
    ]
    en = [
        "a t z m o n",
        "a c h e s o n",
        "a a r b a k k e",
        "a r t h u r",
        "a s a",
    ]
    ar_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "ar.txt"), ar)
    en_file = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "en.txt"), en)
    return ar_file, en_file
def _make_model(self, name, src_vocab, tgt_vocab, random_slots=False):
    model, _ = _seq2seq_model(training=True)
    optimizer = tf.keras.optimizers.Adam()
    data = {}
    data["source_vocabulary"] = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "%s-src-vocab.txt" % name), src_vocab)
    data["target_vocabulary"] = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "%s-tgt-vocab.txt" % name), tgt_vocab)
    model.initialize(data)
    model.create_variables(optimizer=optimizer)
    if random_slots:
        # Randomize the optimizer slot variables (e.g. the Adam moments) so
        # that checkpointed values are distinguishable from their defaults.
        for variable in model.trainable_variables:
            for slot_name in optimizer.get_slot_names():
                slot = optimizer.get_slot(variable, slot_name)
                slot.assign(tf.random.uniform(slot.shape))
    return model, optimizer
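# Models built this way, with randomized optimizer slots, typically serve
# as fixtures for checkpoint-averaging tests. A minimal element-wise
# weight average over several such models (illustrative only, not the
# library's averaging code):
def _sketch_average_weights(models_):
    # Assumes every model has the same architecture and variable order.
    return [
        tf.add_n([tf.convert_to_tensor(v) for v in group]) / len(models_)
        for group in zip(*(m.trainable_variables for m in models_))
    ]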
def testLoadSentencePieceVocab(self):
    # SentencePiece .vocab files are tab-separated: "<piece>\t<log_prob>".
    vocab_path = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "vocab_sp"),
        [
            "<unk>\t0",
            "<s>\t0",
            "</s>\t0",
            ",\t-3.0326",
            ".\t-3.41093",
            "▁the\t-3.85169",
            "s\t-4.05468",
            "▁die\t-4.15914",
            "▁in\t-4.2419",
            "▁der\t-4.36135",
        ])
    vocab = Vocab(from_file=vocab_path, from_format="sentencepiece")
    self.assertEqual(len(vocab), 7)
    self.assertNotIn("<unk>", vocab)
    self.assertNotIn("<s>", vocab)
    self.assertNotIn("</s>", vocab)
    self.assertIn("▁the", vocab)
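# A minimal sketch of the parsing the assertions rely on: keep the piece,
# drop the log-probability, and skip the special tokens the library
# manages itself, which is what brings the size from 10 down to 7.
# (Hypothetical helper, not the Vocab implementation.)
def _sketch_load_sentencepiece_tokens(path):
    tokens = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            piece = line.rstrip("\n").split("\t")[0]
            if piece not in ("<unk>", "<s>", "</s>"):
                tokens.append(piece)
    return tokens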
def _create_dataset(model, temp_dir):
    data_path = os.path.join(temp_dir, "data.txt")
    test_util.make_data_file(data_path, ["a a a b b d", "a b b b", "c c"])
    # The second argument is the batch size.
    dataset = model.examples_inputter.make_inference_dataset(data_path, 1)
    return dataset
def _run_scorer(self, scorer, refs, hyps):
    ref_path = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "ref.txt"), refs)
    hyp_path = test_util.make_data_file(
        os.path.join(self.get_temp_dir(), "hyp.txt"), hyps)
    return scorer(ref_path, hyp_path)
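# The scorer argument is any callable taking a reference path and a
# hypothesis path and returning a score. A hypothetical scorer matching
# that interface, for illustration:
def _sketch_sentence_accuracy(ref_path, hyp_path):
    with open(ref_path, encoding="utf-8") as refs, open(hyp_path, encoding="utf-8") as hyps:
        pairs = list(zip(refs, hyps))
    return sum(r.strip() == h.strip() for r, h in pairs) / len(pairs)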