Example #1
    def _get_data(self, bidirectional, use_chars, test=False):
        vocab_file = os.path.join(FIXTURES, 'vocab.txt')
        if use_chars:
            vocab = load_vocab(vocab_file, 10)
        else:
            vocab = load_vocab(vocab_file, None)

        prefix = os.path.join(FIXTURES, 'data.txt')

        if bidirectional:
            data = BidirectionalLMDataset(prefix, vocab, test=test)
        else:
            data = LMDataset(prefix, vocab, test=test, reverse=False)

        return data, vocab
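For reference, the same calls work outside the test class. A minimal standalone sketch, assuming the bilm-tf import paths, the iter_batches API, and a fixtures directory containing vocab.txt and data.txt (all assumptions, not shown in the example):

# Standalone sketch equivalent to _get_data(bidirectional=True, use_chars=True).
# FIXTURES and the import paths are assumptions based on the bilm-tf layout.
import os

from bilm.training import load_vocab
from bilm.data import BidirectionalLMDataset

FIXTURES = 'tests/fixtures/data'   # must contain vocab.txt and data.txt

# max word length of 10 switches load_vocab to the character-level vocabulary
vocab = load_vocab(os.path.join(FIXTURES, 'vocab.txt'), 10)
data = BidirectionalLMDataset(os.path.join(FIXTURES, 'data.txt'), vocab, test=False)

# iter_batches yields one dict of padded id arrays per (batch_size, num_steps) step
batch = next(data.iter_batches(2, 5))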
Example #2
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
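These main functions are normally driven by argparse. A minimal sketch of a matching entry point for the restart script above, assuming the bilm-tf import paths and flag names that simply mirror the attributes main() reads (both are assumptions, not shown in the example):

# Hypothetical entry point; flag names mirror the attributes used by main().
import argparse

from bilm.training import train, load_options_latest_checkpoint, load_vocab
from bilm.data import LMDataset, BidirectionalLMDataset

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Restart training from the latest checkpoint')
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix for train files')
    parser.add_argument('--n_gpus', type=int, default=1)
    # Values <= 0 leave the corresponding option from the checkpoint unchanged.
    parser.add_argument('--n_train_tokens', type=int, default=0)
    parser.add_argument('--n_epochs', type=int, default=0)
    parser.add_argument('--batch_size', type=int, default=0)

    main(parser.parse_args())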
Example #3
def main(args):
    # load the vocab
    # TODO 1.0: load the vocabulary dataset
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in the training data (this is for the 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    # TODO 2.0: build the dataset
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #4
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
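A matching entry point for this evaluation script might look like the following; the flag names mirror the attributes read from args, and the import paths are assumed from the bilm-tf project rather than shown in the example:

# Hypothetical entry point for the evaluation script above.
import argparse

from bilm.training import test, load_options_latest_checkpoint, load_vocab
from bilm.data import LMDataset, BidirectionalLMDataset

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Compute test perplexity')
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--test_prefix', help='Prefix for test files')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='Batch size used during evaluation')

    main(parser.parse_args())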
Example #5
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, args.maxchar)

    # define the options
    batch_size = 192  # batch size for each GPU
    n_gpus = 2

    # number of tokens in training data
    n_train_tokens = args.size

    options = {
        "bidirectional": True,
        "char_cnn": {
            "activation":
            "relu",
            "embedding": {
                "dim": 16
            },
            "filters": [
                [1, 32],
                [2, 32],
                [3, 64],
                [4, 128],
                [5, 256],
                [6, 512],
                [7, 1024],
            ],
            "max_characters_per_token":
            args.maxchar,
            "n_characters":
            261,
            "n_highway":
            2,
        },
        "dropout": 0.1,
        "lstm": {
            "cell_clip": 3,
            "dim": 2048,
            "n_layers": 2,
            "proj_clip": 3,
            "projection_dim": 512,
            "use_skip_connections": True,
        },
        "all_clip_norm_val": 10.0,
        "n_epochs": args.epochs,
        "n_train_tokens": n_train_tokens,
        "batch_size": batch_size,
        "n_tokens_vocab": vocab.size,
        "unroll_steps": 20,
        "n_negative_samples_batch": 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
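Again, a hypothetical command-line wrapper; the flags mirror the attributes main() uses above (args.maxchar, args.size, args.epochs, and so on), and the import paths are assumed from the bilm-tf project:

# Hypothetical command-line wrapper for the training script above.
import argparse

from bilm.training import train, load_vocab
from bilm.data import BidirectionalLMDataset

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a bidirectional LM')
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix for train files')
    parser.add_argument('--maxchar', type=int, default=50,
                        help='Maximum characters per token')
    parser.add_argument('--size', type=int, required=True,
                        help='Number of tokens in the training data')
    parser.add_argument('--epochs', type=int, default=10)

    main(parser.parse_args())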