Example #1
    def test_train_shared_softmax_no_chars(self):
        bidirectional = True
        use_chars = True
        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars, share_embedding_softmax=True)
        # character inputs and sharing weights not supported
        with self.assertRaises(ValueError):
            train(options, data, 1, self.tmp_dir, self.tmp_dir)
Example #2
def main(args):
    tf_save_dir = args.save_dir
    tf_log_dir = args.log_dir

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 4  # batch size for each GPU
    n_gpus = -1

    # number of tokens in training data
    n_train_tokens = args.size

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 16], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 1
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 1024,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 128,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 16,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
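All of these snippets define a main(args) that reads attributes such as save_dir, vocab_file and train_prefix from an args object. As a rough sketch only (the flag names below are assumptions inferred from the attributes read in Example #2, not the verbatim CLI of any of these scripts), such a main() is typically driven by an argparse harness like this:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train an ELMo-style bidirectional LM.')
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--log_dir', help='Location of log files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix (glob pattern) for the training files')
    parser.add_argument('--size', type=int, help='Number of tokens in the training data')

    main(parser.parse_args())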
Example #3
    def test_train_bilm_chars(self):
        vocab, data, options = self._get_vocab_data_options(True, True)
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(True, True, True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
Example #5
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # RL batch size for each GPU
    n_gpus = 1  #RL

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 614901945  #RL for files 1 to 90 of shuffled corpus, according to corpus2voc

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,  #RL 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #6
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)  #50 is max word length

    # define the options
    batch_size = 200  #TODO: batch size for each GPU.
    n_gpus = 1  # TODO: how many GPUs do you have?

    # number of tokens in training data
    n_train_tokens = 198782  #TODO: update this number to be the total number of tokens in your training data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,  #TODO: update this to how many epochs you want to run
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #7
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)
    restart_ckpt_file = args.restart_ckpt_file

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},

     'dropout': 0.1,

     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},

     'all_clip_norm_val': 10.0,

     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, restart_ckpt_file)
Example #8
def main(args):

    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files,
                       max_word_length=max_word_length,
                       polyglot=polyglot)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options,
          data,
          None,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
Example #9
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens

    options = {
     'bidirectional': True,

     # 'char_cnn': {'activation': 'tanh',
     #  'embedding': {'dim': 4},
     #  'filters': [
     #      [1, 8],
     #      [2, 8],
     #      [3, 16],
     #      [4, 32],
     #      [5, 64],
     #  ],
     #  'max_characters_per_token': 50,
     #  'n_characters': 261,
     #  'n_highway': 1},

     'dropout': 0.1,

     'lstm': {
      'cell_clip': 3,
      'dim': 256,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 64,
      'use_skip_connections': True},

     'all_clip_norm_val': 10.0,

     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 2048,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #10
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, None)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 1191089

    options = {
        'bidirectional': True,

        # 'char_cnn': {'activation': 'relu',
        #  'embedding': {'dim': 16},
        #  'filters': [[1, 32],
        #   [2, 32],
        #   [3, 64],
        #   [4, 128],
        #   [5, 256],
        #   [6, 512],
        #   [7, 1024]],
        #  'max_characters_per_token': 50,
        #  'n_characters': 261,
        #  'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 300,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #11
    def test_train_skip_connections(self):
        bidirectional = True
        use_chars = False
        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars)
        options['lstm']['use_skip_connections'] = True
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(bidirectional, use_chars, test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
Example #12
    def test_train_skip_connections(self):
        bidirectional = True
        use_chars = False
        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars)
        options['lstm']['use_skip_connections'] = True
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(
            bidirectional, use_chars, test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
Example #13
    def test_train_shared_softmax_embedding(self):
        bidirectional = True
        use_chars = False

        vocab, data, options = self._get_vocab_data_options(
            bidirectional, use_chars, share_embedding_softmax=True)
        train(options, data, 1, self.tmp_dir, self.tmp_dir)

        # now test
        tf.reset_default_graph()
        options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
        data_test, vocab_test = self._get_data(
            bidirectional, use_chars, test=True)
        perplexity = test(options, ckpt_file, data_test, batch_size=1)
        self.assertTrue(perplexity < 20.0)
Example #15
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': args.n_epochs,
     'n_train_tokens': args.n_train_tokens,
     'batch_size': args.batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    train(options, data, args.n_gpus, args.save_dir, args.log_dir)
Example #16
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)
    # assumes args.options_file is a path to a JSON options file
    with open(args.options_file, 'r') as f:
        options = json.load(f)

    # number of tokens in training data (this for 1B Word Benchmark)

    options['n_tokens_vocab'] = vocab.size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    n_gpus = 1  # not defined in the original snippet; set to the number of available GPUs
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
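Example #16 expects its configuration as a JSON file rather than an inline dictionary. A minimal sketch of producing a compatible options file (the values are copied from the dictionaries used in the other examples on this page; the file name and the exact set of required keys are assumptions):

import json

options = {
    'bidirectional': True,
    'dropout': 0.1,
    'lstm': {'cell_clip': 3, 'dim': 4096, 'n_layers': 2, 'proj_clip': 3,
             'projection_dim': 512, 'use_skip_connections': True},
    'all_clip_norm_val': 10.0,
    'n_epochs': 10,
    'n_train_tokens': 768648884,
    'batch_size': 128,
    'unroll_steps': 20,
    'n_negative_samples_batch': 8192,
}

# write the file that would later be passed as the options file
with open('options.json', 'w') as f:
    json.dump(options, f, indent=2)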
Example #17
def resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir, ckpt_file):
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }
    tf.reset_default_graph()
    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    train(options,
          data,
          n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
    clean_checkpoint(tf_save_dir)
Example #18
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)
    train_tokens = 768648884 #(this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        train_tokens = 2051910 *1 #Enwiki2
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658*1 #wikitext-103
    options = {
     'bidirectional': True,
     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},
     'dropout': 0.1,
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
     'all_clip_norm_val': 10.0,
     'n_epochs': 10,
     'n_train_tokens': train_tokens,
     'batch_size': args.train_batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    train_data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir, converge=args.converge)
Example #19
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    options['all_clip_norm_val'] = 10.0
    options['n_tokens_vocab'] = vocab.size
    options['dropout'] = 0.1
    options['unroll_steps'] = 20
    options['n_negative_samples_batch'] = 8192
    train(options,
          data,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
Example #20
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    # DRO
    options['dro'] = args.dro
    options['dro_alpha'] = args.dro_alpha

    train(options,
          data,
          args.n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=ckpt_file)
Example #21
def main(args):
    is_load, load_path, save_path, budget = cuhk_prototype_tuner_v2.preprocess(
        t_id, params, args.save_dir)

    vocab = load_vocab(args.vocab_file, 50)

    batch_size = int(params['batch_size'])

    gpus_index_list = list(
        map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(',')))
    n_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))

    n_train_tokens = 768648884

    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=int(
            params['inter_op_parallelism_threads']),
        intra_op_parallelism_threads=int(
            params['intra_op_parallelism_threads']),
        graph_options=tf.compat.v1.GraphOptions(
            infer_shapes=params['infer_shapes'],
            place_pruned_graph=params['place_pruned_graph'],
            enable_bfloat16_sendrecv=params['enable_bfloat16_sendrecv'],
            optimizer_options=tf.compat.v1.OptimizerOptions(
                do_common_subexpression_elimination=params[
                    'do_common_subexpression_elimination'],
                max_folded_constant_in_bytes=int(
                    params['max_folded_constant']),
                do_function_inlining=params['do_function_inlining'],
                global_jit_level=params['global_jit_level'])))

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': int(budget),  # NNI modification
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }
    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)
    tf_save_dir = save_path
    tf_log_dir = save_path
    if not os.path.exists(tf_save_dir):
        os.makedirs(tf_save_dir)

    if params['tf_gpu_thread_mode'] in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = params['tf_gpu_thread_mode']
    if is_load:
        load_file = os.path.join(load_path, 'model.ckpt')
        start = time.time()
        final_perplexity = train(options,
                                 data,
                                 n_gpus,
                                 gpus_index_list,
                                 tf_save_dir,
                                 tf_log_dir,
                                 sess_config,
                                 restart_ckpt_file=load_file)
        end = time.time()
        shutil.rmtree(load_path)
    else:
        start = time.time()
        final_perplexity = train(options, data, n_gpus, gpus_index_list,
                                 tf_save_dir, tf_log_dir, sess_config)
        end = time.time()
    spent_time = (end - start) / 3600.0
    if args.test_prefix != '':
        options, ckpt_file = load_options_latest_checkpoint(tf_save_dir)
        kwargs = {
            'test': True,
            'shuffle_on_load': False,
        }
        test_data = BidirectionalLMDataset(args.test_prefix, vocab, **kwargs)
        final_perplexity = test(options, ckpt_file, test_data, batch_size=128)
    report_dict = {'runtime': spent_time, 'default': final_perplexity}
    nni.report_final_result(report_dict)
Example #22
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, max_word_length=50, polyglot=True)
    vocab.save_vocab(args.save_dir)
                     
    # define the options
    batch_size = 128  # batch size for each GPU

    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    # number of tokens in training data
    #                768648884 (for 1B Word Benchmark)
    #                15442929 (for train-small)
    #                7769676 (for train-small English)
    #                7673253 (for train-small Spanish)
    #                138152583 (for eng+spa train/)
    #                57029976 (for arabic train/)
    #                70546273 (for english .tok train/)
    #                76386340 (for chineseS .tok train/)
    #                64928316 (for chineseT .tok train/)
    #               146932613 (for english+chineseS .tok train/)
    #               135474589 (for english+chineseT .tok train/)
    #               127576249 (for english + arabic .tok train/)
    #               ---------
    #               108177588 (for multitask english)
    #               109709945 (for multitask chineseT)
    #               101363023 (for multitask french)
    #               102915840 (for multitask german)
    #               106180836 (for multitask italian)
    #               106561814 (for multitask portuguese)
    #               107461695 (for multitask romanian)
    #               100138331 (for multitask spanish)
    #               109527440 (for multitask swedish)
    #               211093428 (for multitask english+german)
    n_train_tokens = 107587022 

    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': vocab.n_chars,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 2048,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 256,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    train_paths = args.train_paths
    data = BidirectionalPolyglotLMDataset(train_paths, vocab, test=False,
                                          shuffle_on_load=True)
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    if args.restore_file:
        restore_file = args.restore_file
    else:
        restore_file = None

    # pass a checkpoint filename as restart_ckpt_file to continue training from that checkpoint
    train(options, data, None, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=restore_file)
Example #23
    def test_shared_variables(self):
        vocab, data, options = self._get_vocab_data_options(True, True)
        options['n_epochs'] = 1
        train(options, data, 2, self.tmp_dir, self.tmp_dir)
        self.assertEqual(len(tf.global_variables()), 64)
Example #24
def main(args):
    ckpt_file = None
    if os.path.exists(os.path.join(args.save_dir, 'options.json')):
        options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.n_gpus
    permute_number = args.permute_number

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'multidirectional': True,
        'permute_number': permute_number,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 267,  # NOTE (lijun): add more character tokens
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': args.dim,  # NOTE(feiga): halved dimensions compared with ELMo (default=2048)
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': args.projection_dim,  # NOTE(feiga): halved dimensions compared with ELMo (default=256)
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = MultidirectionalLMDataset(prefix,
                                     vocab,
                                     permute_number,
                                     test=False,
                                     shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options,
          data,
          n_gpus,
          tf_save_dir,
          tf_log_dir,
          permute_number,
          restart_ckpt_file=ckpt_file)
Example #25
def main(args):
    max_token_length = args.max_token_length and int(args.max_token_length)
    print("args.vocab_file: ", args.vocab_file)
    print("max_token_length: ", max_token_length)
    print("args.stroke_vocab_file: ", args.stroke_vocab_file)

    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(
        args.vocab_file,
        args.stroke_vocab_file,  # Winfred stroke_vocab
        max_token_length)  # Winfred stroke_vocab

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 10731134  # 768648884

    # options = {
    #  'bidirectional': True,

    #  'char_cnn': {'activation': 'relu',
    #   'embedding': {'dim': 16},
    #   'filters': [[1, 32],
    #    [2, 32],
    #    [3, 64],
    #    [4, 128],
    #    [5, 256],
    #    [6, 512],
    #    [7, 1024]],
    #   'max_characters_per_token': max_token_length,
    #   'n_characters': 266, # originally 261 + 5 stroke characters
    #   'n_highway': 2}, # 2

    #  'dropout': 0.1,

    #  'lstm': {
    #   'cell_clip': 3,
    #   'dim': 4096,
    #   'n_layers': 2,
    #   'proj_clip': 3,
    #   'projection_dim': 512,
    #   'use_skip_connections': True},

    #  'all_clip_norm_val': 10.0,

    #  'n_epochs': 1,
    #  'n_train_tokens': n_train_tokens,
    #  'batch_size': batch_size,
    #  'n_tokens_vocab': vocab.size,
    #  'unroll_steps': 20,
    #  'n_negative_samples_batch': 8192,
    # }

    # Add by Winfred
    option_file = os.path.join(args.save_dir, "options.json")
    with open(option_file, "r") as f:
        options = json.load(f)

    if max_token_length:
        options["char_cnn"]["max_characters_per_token"] = max_token_length
        print("Wrong max_token_length, already corrected")
    if "char_cnn" in options:
        options["char_cnn"]["n_characters"] = 266
        print("Wrong n_characters, already corrected")
    # End

    prefix = args.train_prefix
    data = BidirectionalLMDataset(
        prefix,
        vocab,
        test=False,
        shuffle_on_load=False,  # True
        do_record=args.do_record,  # Add by Winfred
        records_path=args.records_path,  # Add by Winfred
        vocab_file=args.vocab_file)  # Add by Winfred

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options,
          data,
          n_gpus,
          tf_save_dir,
          tf_log_dir,
          restart_ckpt_file=args.restart_ckpt_file)
Example #27
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50, variable=args.variable)
    if args.variable:
        vocab.save_vocab(args.save_dir)

    # define the options
    if args.batch_size > 0:
        batch_size = args.batch_size
    else:
        batch_size = 128  # batch size for each GPU

    if args.n_epochs > 0:
        n_epochs = args.n_epochs
    else:
        n_epochs = 10

    n_gpus = 1

    if args.lang == 'ga':
        n_train_tokens = 3573002
    elif args.lang == 'mt':
        n_train_tokens = 1045392
    elif args.lang == 'sg':
        n_train_tokens = 1196930
    elif args.lang == 'vi':
        n_train_tokens = 5552361
    else:
        raise ValueError(f'Unrecognized language: {args.lang}')


    options = {
     'bidirectional': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': vocab.n_chars if args.variable else 261,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': n_epochs,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                      shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #28
def main(args):
    # load the vocab
    # the longest token in the vocab = 10 syllables --> 30 characters after jamo decomposition
    # bos char + 30 + eos char = 32
    vocab = load_vocab(args.vocab_file, 32)

    # define the options
    # batch size for each GPU
    batch_size = 64 * 2
    n_gpus = 1

    # number of unique tokens in the tokenized 'Science of Love' KakaoTalk data (identified_corpus_20180105)
    # (-> this should be the total number of tokens, not the number of unique tokens)
    # n_train_tokens = 609518
    # n_train_tokens = 626932956  # number of tokens in 8000pair_tokenized_corpus.txt (about 620 million)
    # placeholder token count used for now
    n_train_tokens = 200000000

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'tanh',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 32,
            'n_characters': 62,
            'n_highway': 2,
        },
        'dropout': 0.2,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 256,
            'use_skip_connections': True,
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,
        'n_negative_samples_batch': 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(filepattern=prefix,
                                  vocab=vocab,
                                  test=False,
                                  shuffle_on_load=True,
                                  with_tab=False)
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(
        options,
        data,
        n_gpus,
        tf_save_dir,
        tf_log_dir,
        restart_ckpt_file=
        '/media/scatter/scatterdisk/elmo_ckpt/elmo_ckpt_0919_2142/model.ckpt_batch-625000'
    )
Example #29
def main(args):

    is_load, load_path, save_path, budget = dfhb.preprocess(
        t_id, params, args.save_dir)

    vocab = load_vocab(args.vocab_file, 50)
    batch_size = int(params['batch_size'])  # NNI modification
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        n_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
    else:
        n_gpus = 1
    n_train_tokens = 768648884
    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': int(budget),  # NNI modification
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }
    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)
    tf_save_dir = save_path
    tf_log_dir = save_path
    if not os.path.exists(tf_save_dir):
        os.makedirs(tf_save_dir)
    ### NNI modification ###
    optimizer = params['optimizer']
    if 'inter_op_parallelism_threads' in params.keys():
        config = get_config()
        if params['tf_gpu_thread_mode'] in [
                "global", "gpu_private", "gpu_shared"
        ]:
            os.environ['TF_GPU_THREAD_MODE'] = params['tf_gpu_thread_mode']
    if is_load:
        load_file = os.path.join(load_path, 'model.ckpt')
        start = time.time()
        final_perplexity = train(options,
                                 data,
                                 n_gpus,
                                 tf_save_dir,
                                 tf_log_dir,
                                 optimizer,
                                 config,
                                 restart_ckpt_file=load_file)
        end = time.time()
        shutil.rmtree(load_path)
    else:
        start = time.time()
        final_perplexity = train(options, data, n_gpus, tf_save_dir,
                                 tf_log_dir, optimizer, config)
        end = time.time()
    spent_time = (end - start) / 3600.0
    report_dict = {'runtime': spent_time, 'default': final_perplexity}
    nni.report_final_result(report_dict)
Example #30
def top_level(args):
    if not os.path.isdir(args.save_dir):
        os.system("mkdir %s" % args.save_dir)

    # define the options
    if args.config_file is None:
        args.config_file = os.path.join(current_dir,
                                        "resources/default_config.json")
    options = load_options(args.config_file)

    # load train_prefixes
    with open(args.train_prefix_paths, "r") as fd:
        train_prefixes = fd.read().split('\n')
    train_prefixes = [f for f in train_prefixes if f != ""]
    options['train_prefix_paths'] = train_prefixes

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # number of tokens in training data (this for 1B Word Benchmark)
    # batch_no = n_epochs*n_train_tokens/(batch_size*unroll_steps*n_gpus)
    #25600  => 100 n_batch  #example filtered 1330337  #1B 768648884
    if args.n_train_tokens is None:
        options['n_train_tokens'] = get_tokens_count(args.train_prefix)
    else:
        options['n_train_tokens'] = args.n_train_tokens

    options['n_tokens_vocab'] = vocab.size
    options['milestone'] = 0
    os.system("cp %s %s/vocabs.txt" % (args.vocab_file, args.save_dir))

    n_gpus = options['n_gpus']
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    prefix = train_prefixes[0] + '/*'
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    print("options:", options)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
    options['milestone'] = 1
    save_options(options, os.path.join(args.save_dir, "options.json"))

    if len(train_prefixes) == 1:
        return

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # loop all train_prefix_paths
    milestone = 1
    for train_prefix in train_prefixes[1:]:
        prefix = train_prefix + '/*'

        if args.n_train_tokens is not None and args.n_train_tokens > 0:
            options['n_train_tokens'] = args.n_train_tokens
        else:
            options['n_train_tokens'] = get_tokens_count(prefix)

        restarter.resume(options, prefix, vocab, n_gpus, tf_save_dir,
                         tf_log_dir, ckpt_file)
        milestone += 1
        options['milestone'] = milestone
        save_options(options, os.path.join(args.save_dir, "options.json"))
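The comment in Example #30 gives the relation batch_no = n_epochs*n_train_tokens/(batch_size*unroll_steps*n_gpus). A purely illustrative check, using the 1B Word Benchmark numbers that appear in the examples above:

n_epochs = 10
n_train_tokens = 768648884  # 1B Word Benchmark token count used in several examples
batch_size = 128
unroll_steps = 20
n_gpus = 3

# total number of parameter updates implied by this configuration
n_batches = n_epochs * n_train_tokens // (batch_size * unroll_steps * n_gpus)
print(n_batches)  # roughly one million updates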
Example #31
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = args.batch_size or 128  # batch size for each GPU
    n_gpus = args.n_gpus or 0

    gpu_list = get_available_gpus()
    if n_gpus <= 0:
        n_gpus = len(gpu_list)
    else:
        n_gpus = min([n_gpus, len(gpu_list)])

    print('Work on %s GPUs' % n_gpus)

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens or 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                        [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True
        },
        'transformer': {
            'num_decoder_layers': 8,
            'layer_preprocess': 'layer_norm',
            'hidden_size': 512,
            'filter_size': 2048,
            'num_heads': 8,
            'attention_dropout': 0.1,
            'residual_dropout': 0.1,
            'relu_dropout': 0.1,
            'max_relative_dist': 16,
            'no_additional_dropout': True
        },
        'use_transformer': True,
        'num_context_steps': 64,
        'all_clip_norm_val': 10.0,
        'scale_embeddings': False,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 256,
        'n_negative_samples_batch': 8192,
    }

    if args.option_file is not None:
        with open(args.option_file, 'r', encoding='utf-8') as reader:
            options = json.load(reader)
        options['n_train_tokens'] = n_train_tokens
        options['batch_size'] = batch_size
        options['n_tokens_vocab'] = vocab.size

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    checkpoint = None
    if args.load_checkpoint:
        saved_options, checkpoint = try_load_options_latest_checkpoint(
            args.save_dir)
        if saved_options is not None:
            options = saved_options
            options['batch_size'] = batch_size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=options.get(
                                      'shuffle_training_data', True))

    validation = None
    if args.valid_prefix is not None:
        validation = BidirectionalLMDataset(args.valid_prefix,
                                            vocab,
                                            test=True,
                                            shuffle_on_load=False)

    train(options,
          data,
          n_gpus,
          tf_save_dir,
          tf_log_dir,
          checkpoint,
          validation=validation)
Example #32
                 'n_characters': 261,
                 'n_highway': 2},

    'dropout': 0.1,

    'lstm': {
        'cell_clip': 3,
        'dim': 4096,
        'n_layers': 2,
        'proj_clip': 3,
        'projection_dim': 512,
        'use_skip_connections': True},

    'all_clip_norm_val': 10.0,

    'n_epochs': 10,
    'n_train_tokens': n_train_tokens,
    'batch_size': batch_size,
    'n_tokens_vocab': vocab.size,
    'unroll_steps': 20,
    'n_negative_samples_batch': 4096,
}

prefix = "data/train/train.txt"
data = BidirectionalLMDataset(prefix, vocab, test=False,
                              shuffle_on_load=True)

tf_save_dir = "test/save/"
tf_log_dir =  "test/logs/"
train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #33
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_files, max_word_length=50, polyglot=True)

    # define the options
    batch_size = 128  # batch size for each GPU

    n_gpus = 1
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    # number of tokens in training data
    #                768648884 (for 1B Word Benchmark)
    #                15442929 (for train-small)
    #                7769676 (for train-small English)
    #                7673253 (for train-small Spanish)
    #                138152583 (for eng+spa train/)
    n_train_tokens = 138152583

    options = {
     'bidirectional': True,
     'polyglot': True,

     'char_cnn': {'activation': 'relu',
      'embedding': {'dim': 16},
      'filters': [[1, 32],
       [2, 32],
       [3, 64],
       [4, 128],
       [5, 256],
       [6, 512],
       [7, 1024]],
      'max_characters_per_token': 50,
      'n_characters': 261,
      'n_highway': 2},
    
     'dropout': 0.1,
    
     'lstm': {
      'cell_clip': 3,
      'dim': 4096,
      'n_layers': 2,
      'proj_clip': 3,
      'projection_dim': 512,
      'use_skip_connections': True},
    
     'all_clip_norm_val': 10.0,
    
     'n_epochs': 10,
     'n_train_tokens': n_train_tokens,
     'batch_size': batch_size,
     'n_tokens_vocab': vocab.size,
     'unroll_steps': 20,
     'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalPolyglotLMDataset(prefix, vocab, test=False,
                                          shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    if args.restore_file:
        restore_file = args.restore_file
    else:
        restore_file = None

    # change restart_ckpt_file to a checkpoint filename to continue training from that checkpoint
    train(options, data, None, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=restore_file)
Example #34
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, max_char_count_in_token=50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    #n_train_tokens = 768648884
    n_train_tokens = 50  # TODO: temporarily using the small test-set data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [
                [1, 32],  # 32 filters of kernel size 1
                [2, 32],  # 32 filters of kernel size 2
                [3, 64],
                #[4, 128],
                #[5, 256],
                #[6, 512],
                #[7, 1024]
            ],
            'max_characters_per_token': 50,
            'n_characters': 261,  # 256 + 5 (mask, unk, sos, eos, padding)
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            #'dim': 4096,
            'dim': 30,  # TODO
            'n_layers': 2,
            'proj_clip': 3,
            #'projection_dim': 512,
            'projection_dim': 10,  # TODO
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        #'n_negative_samples_batch': 8192,
        'n_negative_samples_batch': 20,  # TODO
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
Example #35
def main(args):
    # load the vocab
    #vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file, 4)
    print('vocab.size={}'.format(vocab.size))

    # define the options
    #batch_size = 128  # batch size for each GPU
    #batch_size = 256  # batch size for each GPU
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    #n_train_tokens = 768648884
    n_train_tokens = 32119740

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {
                'dim': 16
            },
            'filters': [
                [1, 32],
                [2, 32],
                [3, 64],
                [4, 128],
                #[5, 256],
                #[6, 512],
                #[7, 1024]
            ],
            #'max_characters_per_token': 50,
            'max_characters_per_token': 4,  # initial, medial, and final jamo
            'n_characters': 261,
            #'n_characters': 362,
            'n_highway': 2
        },
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            #'projection_dim': 512,
            'projection_dim': 256,
            'use_skip_connections': True
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        #'unroll_steps': 20,
        'unroll_steps': 40,
        'n_negative_samples_batch': int(4488 * 0.01),  # vocab.size=4488, 1% of vocab.size
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix,
                                  vocab,
                                  test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)