Example #1
    def test_train_shared_softmax_no_chars(self):
        bidirectional = True
        use_chars = True
        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars, share_embedding_softmax=True)
        # character inputs and sharing weights not supported
        with self.assertRaises(ValueError):
            train(options, data, 1, self.tmp_dir, self.tmp_dir)
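
The ValueError in Example #1 is expected: a character CNN builds token representations on the fly, so there is no token embedding matrix to tie to the softmax. A sketch of the kind of option check that raises it (hypothetical, not bilm-tf's actual validation code):

def validate_options(options):
    # char inputs and shared softmax/embedding weights cannot be
    # combined: the char CNN has no embedding matrix to share
    if 'char_cnn' in options and options.get('share_embedding_softmax'):
        raise ValueError(
            'shared softmax not supported with character inputs')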
Example #2
    def test_train_bilm_chars(self):
        vocab, data, options = self._get_vocab_data_options(True, True)
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(True, True, True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
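
The threshold in Example #2 relies on perplexity being the exponential of the mean per-token negative log-likelihood. As a point of reference (a sketch of the standard definition, not bilm's internal computation):

import numpy as np

def perplexity_from_nlls(nlls):
    # perplexity from per-token negative log-likelihoods in nats
    return float(np.exp(np.mean(nlls)))

# a mean NLL of about 3.0 nats corresponds to perplexity ~20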
Example #3
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, None)

    # define the options
    batch_size = 256  # batch size for each GPU
    n_gpus = 1

    # number of tokens in the training data; update this to match your
    # corpus (the original comment referenced the 1B Word Benchmark)
    n_train_tokens = 14273184

    options = {
        'bidirectional': True,

        # 'char_cnn': {'activation': 'relu',
        #  'embedding': {'dim': 200},
        #  'filters': [[1, 32],
        #   [2, 32],
        #   [3, 64],
        #   [4, 128],
        #   [5, 256],
        #   [6, 512],
        #   [7, 1024]],
        #  'max_characters_per_token': 50,
        #  'n_characters': 261,
        #  'n_highway': 2},

        'dropout': 0.1,

        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 200,  # 512
            'use_skip_connections': True},

        'all_clip_norm_val': 10.0,

        'n_epochs': 3,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,  # 5
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix  # e.g. '../corpus_me/wd_fact_cut.txt'
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)
    print('loaded BidirectionalLMDataset')
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    # load the model and train; restart_ckpt_file expects a checkpoint path
    # rather than a directory, so resolve the latest checkpoint in save_dir
    # (None, i.e. train from scratch, if the directory has no checkpoint)
    restart_ckpt_file = tf.train.latest_checkpoint(args.save_dir)
    train(options, data, n_gpus, tf_save_dir=tf_save_dir, tf_log_dir=tf_log_dir,
          restart_ckpt_file=restart_ckpt_file)
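
Example #3 reads vocab_file, train_prefix, and save_dir from an args namespace. A minimal command-line wrapper could look like the sketch below (hypothetical flag names, chosen only to match the attributes the function reads):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train a bidirectional LM.')
    parser.add_argument('--vocab_file', required=True,
                        help='path to the vocabulary file')
    parser.add_argument('--train_prefix', required=True,
                        help='prefix (glob) of the training data files')
    parser.add_argument('--save_dir', required=True,
                        help='directory for checkpoints and TF logs')
    main(parser.parse_args())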
Example #4
    def test_train_skip_connections(self):
        bidirectional = True
        use_chars = False
        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars)
        options['lstm']['use_skip_connections'] = True
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(bidirectional,
                                               use_chars,
                                               test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
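
use_skip_connections in Example #4 adds residual connections between the stacked LSTM layers, so each layer after the first learns a correction on top of its input. A toy sketch of the idea (plain NumPy stand-ins, not bilm-tf's graph construction):

import numpy as np

def stack_with_skips(x, layers):
    # hypothetical sketch: residual (skip) connections require each
    # layer to preserve the feature dimension so the add is valid
    h = layers[0](x)            # first layer: no skip
    for layer in layers[1:]:
        h = layer(h) + h        # skip connection around later layers
    return h

layers = [np.tanh, np.tanh]     # shape-preserving stand-ins for LSTMs
out = stack_with_skips(np.ones((2, 4)), layers)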
Example #5
    def test_train_shared_softmax_embedding(self):
        bidirectional = True
        use_chars = False

        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars, share_embedding_softmax=True)
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(bidirectional,
                                               use_chars,
                                               test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
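
share_embedding_softmax in Example #5 ties the softmax projection to the input embedding matrix, so one matrix serves both ends of the model. The standard weight-tying pattern looks roughly like this (a generic NumPy sketch, not bilm-tf's exact variable layout):

import numpy as np

vocab_size, dim = 1000, 16
embedding = 0.1 * np.random.randn(vocab_size, dim)  # shared matrix
softmax_b = np.zeros(vocab_size)

def logits(hidden):
    # the softmax weights are the embedding matrix transposed
    return hidden @ embedding.T + softmax_b

print(logits(np.random.randn(3, dim)).shape)  # (3, 1000)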
Example #6
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options,
          data,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
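
The greater-than-zero checks in Example #6 let the corresponding flags default to 0, meaning "keep the value stored in the loaded options". A matching argument parser might look like this (hypothetical flag names mirroring the attributes main reads):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Resume bilm training.')
    parser.add_argument('--save_dir', required=True)
    parser.add_argument('--vocab_file', required=True)
    parser.add_argument('--train_prefix', required=True)
    parser.add_argument('--n_gpus', type=int, default=1)
    # 0 keeps the value from the loaded options
    parser.add_argument('--n_train_tokens', type=int, default=0)
    parser.add_argument('--n_epochs', type=int, default=0)
    parser.add_argument('--batch_size', type=int, default=0)
    main(parser.parse_args())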
Example #7
    def test_shared_variables(self):
        vocab, data, options = self._get_vocab_data_options(True, True)
        options['n_epochs'] = 1
        train(options, data, 2, self.tmp_dir, self.tmp_dir)
        self.assertEqual(len(tf.global_variables()), 64)
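
The count of 64 in Example #7 holds only if the two GPU towers reuse a single set of weights instead of each creating its own. A quick way to inspect what the graph actually created (TF 1.x style, matching the tests above):

import tensorflow as tf  # TF 1.x, as the tests above assume

# with correct sharing across the two towers, each weight name
# appears exactly once in the default graph
for v in tf.global_variables():
    print(v.name, v.shape)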