Example #1
    def _get_data(self, bidirectional, use_chars, test=False):
        vocab_file = os.path.join(FIXTURES, 'vocab.txt')
        if use_chars:
            vocab = load_vocab(vocab_file, 10)
        else:
            vocab = load_vocab(vocab_file, None)

        prefix = os.path.join(FIXTURES, 'data.txt')

        if bidirectional:
            data = BidirectionalLMDataset(prefix, vocab, test=test)
        else:
            data = LMDataset(prefix, vocab, test=test, reverse=False)

        return data, vocab
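
A minimal sketch of how a helper like this might be exercised in a test, assuming the bilm-tf data API in which the returned dataset exposes iter_batches(batch_size, num_steps) yielding batch dictionaries (the method name and the 'token_ids' key are assumptions, not shown above):

# Hedged smoke-test sketch: pull a single batch from a dataset built by _get_data.
# iter_batches(batch_size, num_steps) and the 'token_ids' key are assumptions about
# the bilm-tf data API, not something shown in the example above.
def check_one_batch(data, batch_size=2, num_steps=10):
    batch = next(data.iter_batches(batch_size, num_steps))
    assert batch['token_ids'].shape == (batch_size, num_steps)
    return batch

# e.g. inside a test method:
#   data, vocab = self._get_data(bidirectional=False, use_chars=False, test=True)
#   check_one_batch(data)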
Example #2
    def _get_data(self, bidirectional, use_chars, test=False):
        vocab_file = os.path.join(FIXTURES, 'vocab.txt')
        if use_chars:
            vocab = load_vocab(vocab_file, 10)
        else:
            vocab = load_vocab(vocab_file, None)

        prefix = os.path.join(FIXTURES, 'data.txt')

        if bidirectional:
            data = BidirectionalLMDataset(prefix, vocab, test=test)
        else:
            data = LMDataset(prefix, vocab, test=test, reverse=False)

        return data, vocab
Example #3
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
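
A main() like this is normally driven by an argparse front end. The sketch below shows one plausible wiring; the flag names mirror the attributes main() reads, but the defaults and help strings are assumptions, not the original script's CLI.

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper for the restart-training main() above.
    parser = argparse.ArgumentParser(description='Resume ELMo training from the latest checkpoint')
    parser.add_argument('--save_dir', help='Location of checkpoint files and options.json')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix (glob pattern) for the training shards')
    parser.add_argument('--n_gpus', type=int, default=1)
    # values <= 0 mean "keep whatever is stored in options.json"
    parser.add_argument('--n_train_tokens', type=int, default=0)
    parser.add_argument('--n_epochs', type=int, default=0)
    parser.add_argument('--batch_size', type=int, default=0)

    main(parser.parse_args())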
Example #4
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size, permute_number=permute_number)
Example #5
def main(train_prefix, vocab_file, save_dir):
    # load the vocab
    vocab = load_vocab(vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    #n_train_tokens = 768_648_884
    n_train_tokens = 1_246_091

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {
                'dim': 16
            },
            'filters': [
                [1, 32],
                [2, 32],
                [3, 64],
                # [4, 128],
                # [5, 256],
                # [6, 512],
                # [7, 1024],
            ],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            #'projection_dim': 512,
            'projection_dim': 64,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 2,  #10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = save_dir
    tf_log_dir = save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
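
The comments above note that n_train_tokens should reflect the actual corpus rather than the 1B Word Benchmark figure. A minimal sketch of counting whitespace-separated tokens over the training shards (plain Python; the helper name is ours, and it assumes one pre-tokenized sentence per line):

import glob

def count_train_tokens(train_prefix):
    # Count whitespace-separated tokens across every file matched by the prefix.
    total = 0
    for path in glob.glob(train_prefix):
        with open(path, encoding='utf-8') as f:
            for line in f:
                total += len(line.split())
    return total

# e.g. n_train_tokens = count_train_tokens('/path/to/corpus/*.txt')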
Example #6
def main(args):
    print(args)
    print('-' * 100)
    print('Loading models and options...')
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    print('Loading vocabulary...')
    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    shards = glob(args.test_prefix)
    shards.sort()
    # print(shards)
    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }
    print(f'Building dataset...')
    datasets = []
    for shard in shards:
        if options.get('bidirectional'):
            datasets.append(BidirectionalLMDataset(shard, vocab, **kwargs))
        else:
            datasets.append(LMDataset(shard, vocab, **kwargs))

    print('Predicting...')
    tag(options, ckpt_file, shards, datasets, batch_size=args.batch_size)
    print('-' * 100)
    print('done.')
Example #7
def bilm_predict():
    options, ckpt_file = load_options_latest_checkpoint('dump/bilm_pretrain')
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    vocab = load_vocab('dump/bilm_pretrain/vocab-2016-09-10.txt',
                       max_word_length)
    test_prefix = '../../deps/bilm-tf/tests/fixtures/train/data.txt'

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    perplexity = test(options, ckpt_file, data, batch_size=1)
    from IPython import embed
    embed()
    import os
    os._exit(1)
Example #8
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = args.batch_size  # batch size for each GPU
    n_gpus = args.n_gpus

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.ntokens

    n_negative_samples_batch = 8192
    if n_negative_samples_batch > vocab.size:
        n_negative_samples_batch = int(vocab.size / 2)

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.n_epochs,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': n_negative_samples_batch,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    print("NGPUS in train_elmo: %i" % (n_gpus, ))
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #9
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # vocab = load_vocab(args.vocab_file, max_word_length)
    vocab = load_vocab(args.vocab_file, args.stroke_vocab_file,
                       50)  # Winfred stroke_vocab

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #10
def main(args):

    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_files,
                       max_word_length=max_word_length,
                       polyglot=True)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #11
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 2

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768

    options = {
        'bidirectional': True,

        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},  # embedding dimension per character
        #              'filters': [
        #                  [1, 32],
        #                  [2, 32],
        #                  [3, 64],
        #                  [4, 128],
        #                  [5, 256]
        #                  # [6, 512],
        #                  # [7, 1024]
        #              ],
        #              'max_characters_per_token': 50,  # max number of characters per word
        #              'n_characters': 300000,  # total number of characters in the character vocabulary (only ~60?)
        #              'n_highway': 2},  # use a highway network
        'dropout': 0.1,
        'lstm': {
            # if provided, the cell state is clipped by this value prior to the cell output activation
            'cell_clip': 3,
            'dim': 4096,  # number of hidden units
            'n_layers': 2,
            'proj_clip': 3,
            # If num_proj > 0 and proj_clip is provided, the projected values are clipped
            # elementwise to within [-proj_clip, proj_clip].
            # num_proj is the output dimension of the projection matrix; if None, no projection is performed
            'projection_dim': 512,  # final dimension / projection layer dimension
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 1,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,  # input sentence length (max time steps, n_token)
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    # print("",)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #12
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 262,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 600,  # 4096
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 300,  # 512
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 1,  # 10
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 50,
        'bow_size': 50,
        'n_negative_samples_batch': 1024,  # 8192
    }

    prefix = args.train_prefix
    data = WikiLinkDataset(vocab=vocab,
                           filepattern=prefix,
                           path2ent2def=args.desc_path,
                           num_steps=options['unroll_steps'],
                           bow_size=options['bow_size'])

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          args.restart_checkpoint)
Example #13
def main(args):
    ckpt_file = None
    if os.path.exists(os.path.join(args.save_dir, 'options.json')):
        options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.n_gpus

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
Example #14
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, None)

    # define the options
    batch_size = 512  # batch size for each GPU
    n_gpus = 3
    os.environ['CUDA_VISIBLE_DEVICES'] = '1, 2, 6'

    # number of tokens in training data (this for 1B Word Benchmark)
    # word 8799
    # char 2355
    n_train_tokens = 768648884
    # n_train_tokens = 8799

    options = {
        'bidirectional': True,

        #  'char_cnn': {'activation': 'relu',
        #  'embedding': {'dim': 16},
        #  'filters': [[1, 32],
        #  [2, 32],
        #  [3, 64],
        #  [4, 128],
        #  [5, 256],
        #  [6, 512],
        #  [7, 1024]],
        #  'max_characters_per_token': 50,
        #  'n_characters': 261,
        #  'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 1024,
    }

    print('vocab_size:', vocab.size)
    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #15
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)
    train_tokens = 768648884  #(this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        train_tokens = 2051910  #Enwiki2
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658  #wikitext-103
    if args.is_line:
        train_tokens *= 3
    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': train_tokens,
        'batch_size': args.train_batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    train_data = BidirectionalLMDataset(prefix,
                                        vocab,
                                        test=False,
                                        shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options,
          train_data,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          converge=args.converge)
Example #16
def main(args):
    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file, None)  # the second argument of load_vocab should be changed to None

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 2
    # adjust n_gpus and CUDA_VISIBLE_DEVICES to your own setup
    os.environ['CUDA_VISIBLE_DEVICES'] = '4,5'

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens can be changed or left as is; it only affects the reported progress.
    # To check the number of lines in your corpus, run: wc -l corpus.txt
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,

        #     'char_cnn': {'activation': 'relu',  # options change: comment out the whole char_cnn block; adjust the rest as needed
        #      'embedding': {'dim': 16},
        #      'filters': [[1, 32],
        #       [2, 32],
        #       [3, 64],
        #       [4, 128],
        #       [5, 256],
        #       [6, 512],
        #       [7, 1024]],
        #      'max_characters_per_token': 50,
        #      'n_characters': 261,
        #      'n_highway': 2},
        #
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 300,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 300,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 3,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 256,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #17
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None

    vocab = load_vocab(os.path.join(args.save_dir, "vocabs.txt"),
                       max_word_length)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs to override the options.json
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size
    if args.n_gpus > 0:
        n_gpus = args.n_gpus
    else:
        n_gpus = options['n_gpus']

    # load train_prefixes
    #if args.train_prefix_paths != None:
    if False:
        with open(args.train_prefix_paths, "r") as fd:
            train_prefixes = fd.read().split('\n')
        train_prefixes = [f for f in train_prefixes if f != ""]
        options['train_prefix_paths'] = train_prefixes
        start = 0
    else:
        train_prefixes = options['train_prefix_paths']
        start = options['milestone']

    if start >= len(train_prefixes):
        print(
            "WARNING: Finish all train_prefix_paths. Reset milestone in options."
        )
        sys.exit(0)

    # loop all train_prefix_paths
    milestone = start
    for train_prefix in train_prefixes[start:]:
        prefix = train_prefix + '/*'

        if args.n_train_tokens > 0:
            options['n_train_tokens'] = args.n_train_tokens
        else:
            options['n_train_tokens'] = get_tokens_count(prefix)

        resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir,
               ckpt_file)
        milestone += 1
        options['milestone'] = milestone
        save_options(options, os.path.join(args.save_dir, "options.json"))
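
Neither get_tokens_count nor save_options is defined in this excerpt. get_tokens_count could be implemented much like the token-counting sketch after Example #5, and one plausible save_options is sketched below (an assumption about the helper, not the project's actual code):

import json

def save_options(options, path):
    # Persist the updated options dict (including the new 'milestone') next to
    # the checkpoints so a later run can resume from it.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(options, f, indent=2, sort_keys=True)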
Example #18
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for indonesia wikidump)
    n_train_tokens = 25766422

    options = {
     'bidirectional': True,
     'char_cnn': {
        'activation': 'relu',
        'embedding': {'dim': 16},
        'filters': [[1, 32],
        [2, 32],
        [3, 64],
        [4, 128],
        [5, 256],
        [6, 512],
        [7, 1024]],
        'max_characters_per_token': 50,
        'n_characters': 261,
        'n_highway': 1
      },
    
     'dropout': 0.1,
    
     'lstm': {
      'use_skip_connections': True,
      'projection_dim': 128,
      'cell_clip': 3,
      'proj_clip': 3,
      'dim': 1024,
      'n_layers': 2
      },
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': 4,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192
     #'n_negative_samples_batch': 1024,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #19
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    # number of GPUs to use
    n_gpus = 2
    # which GPUs to run on; training runs on them in parallel
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
    # number of (non-deduplicated) tokens in the training corpus; it affects training time and should be set to match your own corpus
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        # remove char_cnn for Chinese
        # 'char_cnn': {'activation': 'relu',
        #  'embedding': {'dim': 16},
        #  'filters': [[1, 32],
        #   [2, 32],
        #   [3, 64],
        #   [4, 128],
        #   [5, 256],
        #   [6, 512],
        #   [7, 1024]],
        #  'max_characters_per_token': 50,
        #  'n_characters': 261,
        #  'n_highway': 2},
        'dropout': 0.1,
        # LSTM parameters; adjust as needed
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        # number of negative samples per batch; use a smaller value for a small corpus
        'n_negative_samples_batch': 20,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #20
def main(args):
    # load the vocab
    vocab_file = os.path.join(args.folder, 'vocabulary.txt')
    vocab = load_vocab(vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.gpu

    # number of tokens in training data (this for 1B Word Benchmark)
    #n_train_tokens = 768648884
    n_train_tokens = args.tokens

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': args.size},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': args.epoch,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = os.path.join(args.folder, 'corpus', '*')
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = os.path.join(args.folder, args.checkpoint)
    tf_log_dir = tf_save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #21
def main(args):
    # load the vocab
    max_token_length = 7  # max length of a word (in characters)
    vocab = load_vocab(args.vocab_file, max_token_length)

    # define the options
    batch_size = 16  # batch size for each GPU
    n_gpus = 6

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = 94268535  # total number of tokens in the training set

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 7,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 20,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #22
def main(args):

    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files,
                       max_word_length=max_word_length,
                       polyglot=polyglot)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if 'polyglot' in options or args.polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options,
          data,
          None,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
Example #23
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)  #50 is max word length

    # define the options
    batch_size = 200  #TODO: batch size for each GPU.
    n_gpus = 1  #TODO: how many GPUs do you have?

    # number of tokens in training data
    n_train_tokens = 198782  #TODO: update this number to be the total number of tokens in your training data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,  #TODO: update this to how many epochs you want to run
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
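
Two things worth checking before launching a configuration like this: batch_size is per GPU, and n_negative_samples_batch should not exceed the vocabulary size (Example #8 above guards against exactly that). A small sketch of those checks, reusing names from the function above:

# Rough pre-flight checks for an options dict like the one built above
# (a sketch, reusing the local names from the function).
assert options['n_negative_samples_batch'] <= vocab.size, \
    'sampled softmax cannot draw more negative samples than there are vocabulary entries'

# batch_size is per GPU, so the global step size is:
sequences_per_step = options['batch_size'] * max(n_gpus, 1)
print('sequences per optimizer step:', sequences_per_step)
print('tokens per optimizer step:', sequences_per_step * options['unroll_steps'])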
Example #24
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)
    restart_ckpt_file = args.restart_ckpt_file

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},

     'dropout': 0.1,

     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},

     'all_clip_norm_val': 10.0,

     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, restart_ckpt_file=restart_ckpt_file)
Example #25
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens

    options = {
     'bidirectional': True,

     # 'char_cnn': {'activation': 'tanh',
     #  'embedding': {'dim': 4},
     #  'filters': [
     #      [1, 8],
     #      [2, 8],
     #      [3, 16],
     #      [4, 32],
     #      [5, 64],
     #  ],
     #  'max_characters_per_token': 50,
     #  'n_characters': 261,
     #  'n_highway': 1},

     'dropout': 0.1,

     'lstm': {
      'cell_clip': 3,
      'dim': 256,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 64,
      'use_skip_connections': True},

     'all_clip_norm_val': 10.0,

     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 2048,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #26
def main(args):
    print('h0')
    vocab = load_vocab(args.vocab_file, 10)
    print('h1')
    batch_size = 64
    n_gpus = 3
    n_train_tokens = 4775300
    options = {
        'bidirectional': True,

        'char_cnn': {'activation': 'relu',
                     'embedding': {'dim': 16},
                     'filters': [[1, 32],
                                 [2, 32],
                                 [3, 64],
                                 [4, 128],
                                 [5, 256],
                                 [6, 512],
                                 [7, 1024]],
                     'max_characters_per_token': 10,
                     'n_characters': 105047,
                     'n_highway': 2},

        'dropout': 0.1,

        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},

        'all_clip_norm_val': 10.0,

        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }
    inpattern = args.train_prefix
    data = BidirectionalLMDataset(inpattern, vocab, test=False,
                                  shuffle_on_load=True)
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train_with_single_core(options, data, tf_save_dir, tf_log_dir)
Example #27
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': args.n_epochs,
     'n_train_tokens': args.n_train_tokens,
     'batch_size': args.batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    train(options, data, args.n_gpus, args.save_dir, args.log_dir)
Example #28
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)
    # assuming args.options_file is a path, open it before handing it to json.load
    with open(args.options_file) as f:
        options = json.load(f)

    # number of tokens in training data (this for 1B Word Benchmark)

    options['n_tokens_vocab'] = vocab.size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    # n_gpus was never defined in this example; assume it is stored in the options file (default to 1)
    n_gpus = options.get('n_gpus', 1)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #29
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)
    train_tokens = 768648884  # (this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        # Enwiki2 is 3x longer if split into sentences, and a further 1.5x with a sentence split size of 20
        train_tokens = int(2051910 * 3 * 1.5)
    elif args.train_tokens == 'wikitext103':
        train_tokens = int(101425658 * 3 * 1.5)  # wikitext-103
    options = {
     'bidirectional': True,
     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},
     'dropout': 0.1,
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
     'all_clip_norm_val': 10.0,
     'n_epochs': 10,
     'n_train_tokens': train_tokens,
     'batch_size': args.train_batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    # we don't shuffle on load since our curriculum generator shuffles
    train_data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=False,
                                        curriculum=True, num_steps=20)
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train_curriculum(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir,
                     args.initial_competence, args.competence_increment, args.target_batches,
                     args.test_prefix, args.test_interval, vocab)
Example #30
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #31
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)
    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
Example #32
def main(args):
    max_token_length = args.max_token_length and int(args.max_token_length)
    print("args.vocab_file: ", args.vocab_file)
    print("max_token_length: ", max_token_length)
    print("args.stroke_vocab_file: ", args.stroke_vocab_file)

    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(
        args.vocab_file,
        args.stroke_vocab_file,  # Winfred stroke_vocab
        max_token_length)  # Winfred stroke_vocab

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 10731134  # 768648884

    # options = {
    #  'bidirectional': True,

    #  'char_cnn': {'activation': 'relu',
    #   'embedding': {'dim': 16},
    #   'filters': [[1, 32],
    #    [2, 32],
    #    [3, 64],
    #    [4, 128],
    #    [5, 256],
    #    [6, 512],
    #    [7, 1024]],
    #   'max_characters_per_token': max_token_length,
    #   'n_characters': 266, # original 261 + 5 stroke characters
    #   'n_highway': 2}, # 2

    #  'dropout': 0.1,

    #  'lstm': {
    #   'cell_clip': 3,
    #   'dim': 4096,
    #   'n_layers': 2,
    #   'proj_clip': 3,
    #   'projection_dim': 512,
    #   'use_skip_connections': True},

    #  'all_clip_norm_val': 10.0,

    #  'n_epochs': 1,
    #  'n_train_tokens': n_train_tokens,
    #  'batch_size': batch_size,
    #  'n_tokens_vocab': vocab.size,
    #  'unroll_steps': 20,
    #  'n_negative_samples_batch': 8192,
    # }

    # Add by Winfred
    option_file = os.path.join(args.save_dir, "options.json")
    with open(option_file, "r") as f:
        options = json.load(f)

    if max_token_length:
        options["char_cnn"]["max_characters_per_token"] = max_token_length
        print("Corrected max_characters_per_token in options to", max_token_length)
    if "char_cnn" in options:
        options["char_cnn"]["n_characters"] = 266
        print("Corrected n_characters in options to 266")
    # End

    prefix = args.train_prefix
    data = BidirectionalLMDataset(
        prefix,
        vocab,
        test=False,
        shuffle_on_load=False,  # True
        do_record=args.do_record,  # Add by Winfred
        records_path=args.records_path,  # Add by Winfred
        vocab_file=args.vocab_file)  # Add by Winfred

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options,
          data,
          n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=args.restart_ckpt_file)
Example #33
def main(args):
    # load the vocab
    # the longest token in the vocab is 10 syllables --> 30 characters after jamo decomposition
    # bos char + 30 + eos char = 32
    vocab = load_vocab(args.vocab_file, 32)

    # define the options
    # batch size for each GPU
    batch_size = 64 * 2
    n_gpus = 1

    # number of unique tokens in the tokenized KakaoTalk data (identified_corpus_20180105) from 연애의 과학 (Science of Love)
    # (-> this should be the total token count, not the unique token count)
    # n_train_tokens = 609518
    # n_train_tokens = 626932956  # number of tokens in 8000pair_tokenized_corpus.txt (about 620 million)
    # token count used temporarily
    n_train_tokens = 200000000

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'tanh',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 32,
            'n_characters': 62,
            'n_highway': 2,
        },
        'dropout': 0.2,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 256,
            'use_skip_connections': True,
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,
        'n_negative_samples_batch': 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(filepattern=prefix,
                                  vocab=vocab,
                                  test=False,
                                  shuffle_on_load=True,
                                  with_tab=False)
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file='/media/scatter/scatterdisk/elmo_ckpt/elmo_ckpt_0919_2142/model.ckpt_batch-625000')
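
The restart checkpoint above is hard-coded to a specific batch. If the save directory contains a standard TensorFlow 'checkpoint' metadata file, the most recent checkpoint could be picked up automatically instead; a hedged sketch:

import tensorflow as tf

# Pick up the newest checkpoint in the directory instead of hard-coding one
# (assumes a TF1-style 'checkpoint' metadata file exists in that directory).
restart_ckpt_file = tf.train.latest_checkpoint(
    '/media/scatter/scatterdisk/elmo_ckpt/elmo_ckpt_0919_2142/')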