def testBuildDatasetFromSameFile(self):
        files = [utils.get_data_file('classify.seq.label.txt')]
        x_tokenizer = SpaceTokenizer()
        x_tokenizer.build_from_corpus(
            [utils.get_data_file('classify.seq.txt')])

        config = {
            'train_batch_size': 2,
            'eval_batch_size': 2,
            'predict_batch_size': 2,
            'buffer_size': 100
        }
        dataset = SeqClassifyDataset(x_tokenizer, config)

        train_dataset = dataset.build_train_dataset(files)
        print(next(iter(train_dataset)))
        print('=' * 120)

        eval_dataset = dataset.build_eval_dataset(files)
        print(next(iter(eval_dataset)))
        print('=' * 120)

        predict_files = [utils.get_data_file('classify.seq.txt')]
        predict_dataset = dataset.build_predict_dataset(predict_files)
        print(next(iter(predict_dataset)))
        print('=' * 120)
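The three builders above are consumed with next(iter(...)); assuming they return ordinary tf.data.Dataset objects (an assumption, not something this snippet states), the batch structure can also be inspected without pulling real data, as in this minimal sketch:

import tensorflow as tf

# Toy stand-in for the pipelines above, just to show the inspection calls.
toy = tf.data.Dataset.from_tensor_slices(([[1, 2], [3, 4]], [0, 1])).batch(2)
print(toy.element_spec)    # nested TensorSpec describing one batch
for batch in toy.take(1):  # pull exactly one batch, like next(iter(...))
    print(batch)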
Example #2
    def testBuildDatasetFromSameFile(self):
        files = [
            utils.get_data_file('iwslt15.tst2013.100.envi'),
            utils.get_data_file('iwslt15.tst2013.100.envi'),
        ]
        x_tokenizer = SpaceTokenizer()
        x_tokenizer.build_from_corpus(
            [utils.get_data_file('iwslt15.tst2013.100.en')])
        y_tokenizer = SpaceTokenizer()
        y_tokenizer.build_from_corpus(
            [utils.get_data_file('iwslt15.tst2013.100.vi')])
        config = {
            'train_batch_size': 2,
            'predict_batch_size': 2,
            'eval_batch_size': 2,
            'buffer_size': 100
        }
        dataset = Seq2SeqDataset(x_tokenizer, y_tokenizer, config)

        train_dataset = dataset.build_train_dataset(files)
        print(next(iter(train_dataset)))
        print('=' * 120)

        eval_dataset = dataset.build_eval_dataset(files)
        print(next(iter(eval_dataset)))
        print('=' * 120)

        predict_files = [utils.get_data_file('iwslt15.tst2013.100.envi')]
        predict_dataset = dataset.build_predict_dataset(predict_files)
        print(next(iter(predict_dataset)))
        print('=' * 120)
    def testBuildDatasetFromSameFile(self):
        files = [
            utils.get_data_file('dssm.query.doc.label.txt'),
            utils.get_data_file('dssm.query.doc.label.txt'),
        ]
        x_tokenizer = SpaceTokenizer()
        x_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))
        y_tokenizer = SpaceTokenizer()
        y_tokenizer.build_from_vocab(utils.get_data_file('dssm.vocab.txt'))

        config = {
            'train_batch_size': 2,
            'eval_batch_size': 2,
            'predict_batch_size': 2,
            'buffer_size': 100,
        }
        dataset = SeqMatchDataset(x_tokenizer, y_tokenizer, config)

        train_dataset = dataset.build_train_dataset(files)
        print(next(iter(train_dataset)))
        print('=' * 120)

        eval_dataset = dataset.build_eval_dataset(files)
        print(next(iter(eval_dataset)))
        print('=' * 120)

        predict_files = [utils.get_data_file('dssm.query.doc.label.txt')]
        predict_dataset = dataset.build_predict_dataset(predict_files)
        print(next(iter(predict_dataset)))
        print('=' * 120)

    def testBuildFromVocab(self):
        print('============start build from vocab=============')
        tokenizer = SpaceTokenizer()
        tokenizer.build_from_vocab(data_dir_utils.get_data_file('vocab.test.txt'))
        print('token2id dict: ', tokenizer.token2id_dict)
        print('id2token dict: ', tokenizer.id2token_dict)
        words = tf.constant(['I', 'am', 'a', 'developer'])
        v0 = tokenizer.encode(words)
        print(v0)
        ids = tf.constant([1, 0, 2, 3, 4], dtype=tf.dtypes.int64)
        v1 = tokenizer.decode(ids)
        print(v1)
        print('============end build from vocab=============')

    def buildTokenizer(self):
        tokenizer = SpaceTokenizer()
        corpus = ['iwslt15.tst2013.100.en']
        corpus = [data_dir_utils.get_data_file(f) for f in corpus]
        tokenizer.build_from_corpus(corpus, token_filters=[EmptyTokenFilter()])
        return tokenizer
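For context, a hypothetical caller of the buildTokenizer helper could look like the sketch below; the method name testBuildFromCorpus is illustrative, while vocab_size and token2id_dict are attributes already used elsewhere in these examples.

    def testBuildFromCorpus(self):
        # Hypothetical usage of the helper above; not part of the original snippet.
        tokenizer = self.buildTokenizer()
        print('vocab size: ', tokenizer.vocab_size)
        print('token2id dict: ', tokenizer.token2id_dict)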
Example #6
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os

import tensorflow as tf
from easylib.dl import KerasModelDatasetRunner
from nlp_datasets.abstract_dataset import AbstractXYDataset
from nlp_datasets.tokenizers import SpaceTokenizer
from nlp_datasets.xyz_dataset import XYZSameFileDataset

from mp import models, utils

tokenizer = SpaceTokenizer()
tokenizer.build_from_vocab(os.path.join(utils.testdat_dir(), 'vocab.txt'))
config = {
    'x_max_len': 1000,
    'y_max_len': 1000,
    'train_batch_size': 1,
    'predict_batch_size': 32,
    'shuffle_size': -1,
    'num_parallel_calls': tf.data.experimental.AUTOTUNE
}
dataset = XYZSameFileDataset(x_tokenizer=tokenizer,
                             y_tokenizer=tokenizer,
                             config=config)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
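Example #6 ends right after the parser is created, but Example #7 below reads args.model and expects 'mlp' or 'lstm'; a minimal, hypothetical continuation consistent with that usage would be:

    # Hypothetical continuation (not in the original snippet), matching the
    # args.model values consumed in Example #7 below.
    parser.add_argument('--model', type=str, default='mlp',
                        help="Which model to build: 'mlp' or 'lstm'")
    args, _ = parser.parse_known_args()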
Example #7
        'vocab_size': 10,
        'embedding_size': 256,
        'vec_dim': 256,
    }
    runner_config = {
        'ckpt_period': 1,
        'model_dir': '/tmp/dssm'
    }
    config.update(dataset_config)
    config.update(model_config)
    config.update(runner_config)

    if not os.path.exists(config['model_dir']):
        os.makedirs(config['model_dir'])

    tokenizer = SpaceTokenizer()
    tokenizer.build_from_vocab(config['vocab_file'])
    logging.info('Build tokenizer from vocab file: %s' % config['vocab_file'])
    logging.info('vocab size of tokenizer: %d' % tokenizer.vocab_size)
    config['vocab_size'] = tokenizer.vocab_size

    args, _ = parser.parse_known_args()
    if 'mlp' == args.model:
        model = models.build_mlp_model(config)
    elif 'lstm' == args.model:
        model = models.build_lstm_model(config)
    else:
        raise ValueError('Invalid model: %s' % args.model)

    # Pass the merged config (dataset/model/runner settings) built above.
    dataset = XYZSameFileDataset(x_tokenizer=tokenizer, y_tokenizer=tokenizer, config=config)
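Example #7 stops after constructing the dataset. A heavily hedged sketch of how it might be wired into a Keras training loop follows; the training file path, the (features, labels) element structure, and the checkpoint callback are assumptions, not taken from the original code.

    # Hypothetical training wiring, assuming `model` is a compiled tf.keras.Model,
    # build_train_dataset returns a tf.data.Dataset of (features, labels), and
    # `tf`, `os` and `utils` are imported as in Example #6.
    train_files = [os.path.join(utils.testdat_dir(), 'train.txt')]  # assumed file name
    train_ds = dataset.build_train_dataset(train_files)
    ckpt = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(config['model_dir'], 'ckpt-{epoch}'),
        save_freq='epoch')  # roughly mirrors 'ckpt_period': 1
    model.fit(train_ds, epochs=1, callbacks=[ckpt])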