Example #1
    def testBuildDatasetFromSameFile(self):
        files = [
            utils.get_data_file('iwslt15.tst2013.100.envi'),
            utils.get_data_file('iwslt15.tst2013.100.envi'),
        ]
        x_tokenizer = SpaceTokenizer()
        x_tokenizer.build_from_corpus(
            [utils.get_data_file('iwslt15.tst2013.100.en')])
        y_tokenizer = SpaceTokenizer()
        y_tokenizer.build_from_corpus(
            [utils.get_data_file('iwslt15.tst2013.100.vi')])
        config = {
            'train_batch_size': 2,
            'predict_batch_size': 2,
            'eval_batch_size': 2,
            'buffer_size': 100
        }
        dataset = Seq2SeqDataset(x_tokenizer, y_tokenizer, config)

        train_dataset = dataset.build_train_dataset(files)
        print(next(iter(train_dataset)))
        print('=' * 120)

        eval_dataset = dataset.build_eval_dataset(files)
        print(next(iter(eval_dataset)))
        print('=' * 120)

        predict_files = [utils.get_data_file('iwslt15.tst2013.100.envi')]
        predict_dataset = dataset.build_predict_dataset(predict_files)
        print(next(iter(predict_dataset)))
        print('=' * 120)
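
The config above maps onto standard tf.data batching and shuffling, assuming the builders return tf.data.Dataset objects (which the next(iter(...)) calls suggest). Below is a self-contained sketch, using hypothetical toy token-id pairs instead of the tokenized files, of roughly what a train pipeline with train_batch_size=2 and buffer_size=100 does; it is illustrative only, not the library's own pipeline.

import tensorflow as tf

# Hypothetical toy (source_ids, target_ids) pairs; the real pipeline reads and tokenizes the files above.
pairs = [([1, 2, 3], [2, 3]), ([4, 5], [5, 6, 7]), ([6, 7, 8, 9], [8]), ([10], [9, 10])]

def gen():
    for x, y in pairs:
        yield x, y

ds = tf.data.Dataset.from_generator(
    gen,
    output_signature=(tf.TensorSpec([None], tf.int64), tf.TensorSpec([None], tf.int64)))
ds = ds.shuffle(buffer_size=100)  # 'buffer_size' in the config
ds = ds.padded_batch(2)           # 'train_batch_size' in the config; pads to the longest sequence per batch

for batch_x, batch_y in ds.take(1):
    print(batch_x.numpy())
    print(batch_y.numpy())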

Example #2

    def testBuildDatasetFromSameFile(self):
        files = [utils.get_data_file('classify.seq.label.txt')]
        x_tokenizer = SpaceTokenizer()
        x_tokenizer.build_from_corpus(
            [utils.get_data_file('classify.seq.txt')])

        config = {
            'train_batch_size': 2,
            'eval_batch_size': 2,
            'predict_batch_size': 2,
            'buffer_size': 100
        }
        dataset = SeqClassifyDataset(x_tokenizer, config)

        train_dataset = dataset.build_train_dataset(files)
        print(next(iter(train_dataset)))
        print('=' * 120)

        eval_dataset = dataset.build_eval_dataset(files)
        print(next(iter(eval_dataset)))
        print('=' * 120)

        predict_files = [utils.get_data_file('classify.seq.txt')]
        predict_dataset = dataset.build_predict_dataset(predict_files)
        print(next(iter(predict_dataset)))
        print('=' * 120)
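
Assuming build_train_dataset here yields batches of (token_ids, label), the resulting tf.data dataset can be passed straight to Keras. A hedged sketch with a hypothetical vocabulary size and class count (both would come from the real data in practice):

import tensorflow as tf

VOCAB_SIZE = 10000   # hypothetical; derive it from x_tokenizer in practice
NUM_CLASSES = 2      # hypothetical; depends on the labels in classify.seq.label.txt

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, 128, mask_zero=True),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# With the datasets built above (elements assumed to be (ids, label) batches):
# model.fit(train_dataset, validation_data=eval_dataset, epochs=3)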

Example #3

    def testBuildDatasetFromSameFile(self):
        files = [
            utils.get_data_file('dssm.query.doc.label.txt'),
            utils.get_data_file('dssm.query.doc.label.txt'),
        ]
        x_tokenizer = SpaceTokenizer()
        x_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
        y_tokenizer = SpaceTokenizer()
        y_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))

        config = {
            'train_batch_size': 2,
            'eval_batch_size': 2,
            'predict_batch_size': 2,
            'buffer_size': 100,
        }
        dataset = SeqMatchDataset(x_tokenizer, y_tokenizer, config)

        train_dataset = dataset.build_train_dataset(files)
        print(next(iter(train_dataset)))
        print('=' * 120)

        eval_dataset = dataset.build_eval_dataset(files)
        print(next(iter(eval_dataset)))
        print('=' * 120)

        predict_files = [utils.get_data_file('dssm.query.doc.label.txt')]
        predict_dataset = dataset.build_predict_dataset(predict_files)
        print(next(iter(predict_dataset)))
        print('=' * 120)
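
SeqMatchDataset pairs two tokenized sequences with a label, so its batches presumably fit a two-input matching model. A hedged two-tower (DSSM-style) Keras sketch, with a hypothetical shared vocabulary size; the inputs are assumed to be padded id tensors like those printed above:

import tensorflow as tf

VOCAB_SIZE = 10000  # hypothetical; in practice tied to the shared dssm.vocab.txt

def tower(name):
    # One encoder branch: embed token ids and average-pool into a fixed-size vector.
    inputs = tf.keras.Input(shape=(None,), dtype=tf.int64, name=name)
    x = tf.keras.layers.Embedding(VOCAB_SIZE, 128, mask_zero=True)(inputs)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    return inputs, x

query_in, query_vec = tower('query_ids')
doc_in, doc_vec = tower('doc_ids')
logit = tf.keras.layers.Dot(axes=-1)([query_vec, doc_vec])  # raw similarity score
prob = tf.keras.layers.Activation('sigmoid')(logit)         # match probability
model = tf.keras.Model([query_in, doc_in], prob)
model.compile(optimizer='adam', loss='binary_crossentropy')

# model.fit would need the dataset elements arranged as ((query_ids, doc_ids), label).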

Example #4

    def testBuildFromVocab(self):
        print('============start build from vocab=============')
        tokenizer = SpaceTokenizer()
        tokenizer.build_from_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
        print('token2id dict: ', tokenizer.token2id_dict)
        print('id2token dict: ', tokenizer.id2token_dict)
        words = tf.constant(['I', 'am', 'a', 'developer'])
        v0 = tokenizer.encode(words)
        print(v0)
        ids = tf.constant([1, 0, 2, 3, 4], dtype=tf.dtypes.int64)
        v1 = tokenizer.decode(ids)
        print(v1)
        print('============end build from vocab=============')
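
For orientation, a minimal plain-Python sketch of the build-from-vocab idea, assuming a one-token-per-line vocab file; this is illustrative only, not the SpaceTokenizer implementation:

def load_vocab(path):
    # One token per line; ids are assigned in line order, duplicates and blank lines skipped.
    token2id = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            token = line.rstrip('\n')
            if token and token not in token2id:
                token2id[token] = len(token2id)
    id2token = {i: t for t, i in token2id.items()}
    return token2id, id2token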

    def testSaveVocabFile(self):
        tokenizer = self.buildTokenizer()
        tokenizer.save_to_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
        print(tokenizer.token2id_dict)
        print(tokenizer.id2token_dict)
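
And the mirror image for save_to_vocab, again only a sketch of the idea (one token per line, ordered by id, so it round-trips with a loader like load_vocab above):

def save_vocab(token2id, path):
    with open(path, 'w', encoding='utf-8') as f:
        for token, _ in sorted(token2id.items(), key=lambda kv: kv[1]):
            f.write(token + '\n')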

    def buildTokenizer(self):
        tokenizer = SpaceTokenizer()
        corpus = ['iwslt15.tst2013.100.en']
        corpus = [data_dir_utils.get_data_file(f) for f in corpus]
        tokenizer.build_from_corpus(corpus, token_filters=[EmptyTokenFilter()])
        return tokenizer
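
build_from_corpus with an EmptyTokenFilter presumably splits each line on spaces and drops empty tokens before assigning ids. A conceptual sketch of that, not the library's implementation:

def build_vocab_from_corpus(files):
    token2id = {}
    for path in files:
        with open(path, encoding='utf-8') as f:
            for line in f:
                for token in line.strip().split(' '):
                    if not token:  # what an empty-token filter would drop
                        continue
                    token2id.setdefault(token, len(token2id))
    return token2id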