def test_train_shared_softmax_no_chars(self):
    bidirectional = True
    use_chars = True
    vocab, data, options = self._get_vocab_data_options(
        bidirectional, use_chars, share_embedding_softmax=True)
    # character inputs and sharing weights not supported
    with self.assertRaises(ValueError):
        train(options, data, 1, self.tmp_dir, self.tmp_dir)
def main(args):
    tf_save_dir = args.save_dir
    tf_log_dir = args.log_dir

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 4  # batch size for each GPU
    n_gpus = -1
    # number of tokens in training data
    n_train_tokens = args.size

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 16], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 1},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 1024,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 128,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 16,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
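These main(args) variants all read a small set of attributes off args; below is a minimal sketch of the argparse entry point the one above assumes. The flag names are illustrative assumptions matching the attributes it reads, not taken from any of the original scripts.

# Hypothetical entry point for the main(args) above; only --save_dir, --log_dir,
# --vocab_file, --train_prefix and --size are wired up, mirroring the attributes it uses.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train an ELMo-style bidirectional LM.')
    parser.add_argument('--save_dir', help='Location of checkpoint files')
    parser.add_argument('--log_dir', help='Location of log files')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix (glob pattern) for the training files')
    parser.add_argument('--size', type=int, help='Number of tokens in the training data')

    main(parser.parse_args())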
def test_train_bilm_chars(self):
    vocab, data, options = self._get_vocab_data_options(True, True)
    train(options, data, 1, self.tmp_dir, self.tmp_dir)

    # now test
    tf.reset_default_graph()
    options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
    data_test, vocab_test = self._get_data(True, True, True)
    perplexity = test(options, ckpt_file, data_test, batch_size=1)
    self.assertTrue(perplexity < 20.0)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # RL batch size for each GPU
    n_gpus = 1  # RL
    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 614901945  # RL for files 1 to 90 of shuffled corpus, according to corpus2voc

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,  # RL 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)  # 50 is max word length

    # define the options
    batch_size = 200  # TODO: batch size for each GPU.
    n_gpus = 1  # TODO: how many gpus do you have?

    # number of tokens in training data
    n_train_tokens = 198782  # TODO: update this number to be the total number of tokens in your training data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,  # TODO: update this to how many epochs you want to run
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
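The TODO above asks for the total number of tokens in the training data. Below is a minimal sketch of how that count could be computed, assuming whitespace-tokenized text shards matched by the same glob passed as the training prefix; the helper name count_training_tokens is ours, not part of bilm-tf.

# Minimal sketch for filling in n_train_tokens above. Assumes the training shards are
# plain whitespace-tokenized text files matched by the --train_prefix glob.
import glob

def count_training_tokens(train_prefix):
    n_tokens = 0
    for path in glob.glob(train_prefix):
        with open(path, encoding='utf-8') as f:
            for line in f:
                n_tokens += len(line.split())
    return n_tokens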
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)
    restart_ckpt_file = args.restart_ckpt_file

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, restart_ckpt_file)
def main(args):
    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None

    # the original left polyglot unset when neither condition held; compute it once up front
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files, max_word_length=max_word_length, polyglot=polyglot)

    prefix = args.train_prefix
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, None, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 64  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'tanh',
        #              'embedding': {'dim': 4},
        #              'filters': [[1, 8], [2, 8], [3, 16], [4, 32], [5, 64]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 1},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 64,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 2048,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, None)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 1191089

    options = {
        'bidirectional': True,
        # 'char_cnn': {'activation': 'relu',
        #              'embedding': {'dim': 16},
        #              'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
        #              'max_characters_per_token': 50,
        #              'n_characters': 261,
        #              'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 300,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def test_train_skip_connections(self):
    bidirectional = True
    use_chars = False
    vocab, data, options = self._get_vocab_data_options(bidirectional, use_chars)
    options['lstm']['use_skip_connections'] = True
    train(options, data, 1, self.tmp_dir, self.tmp_dir)

    # now test
    tf.reset_default_graph()
    options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
    data_test, vocab_test = self._get_data(bidirectional, use_chars, test=True)
    perplexity = test(options, ckpt_file, data_test, batch_size=1)
    self.assertTrue(perplexity < 20.0)
def test_train_shared_softmax_embedding(self):
    bidirectional = True
    use_chars = False
    vocab, data, options = self._get_vocab_data_options(
        bidirectional, use_chars, share_embedding_softmax=True)
    train(options, data, 1, self.tmp_dir, self.tmp_dir)

    # now test
    tf.reset_default_graph()
    options, ckpt_file = load_options_latest_checkpoint(self.tmp_dir)
    data_test, vocab_test = self._get_data(bidirectional, use_chars, test=True)
    perplexity = test(options, ckpt_file, data_test, batch_size=1)
    self.assertTrue(perplexity < 20.0)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': args.n_epochs,
        'n_train_tokens': args.n_train_tokens,
        'batch_size': args.batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)
    train(options, data, args.n_gpus, args.save_dir, args.log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # json.load needs a file object; treat args.options_file as a path and open it
    with open(args.options_file, 'r') as f:
        options = json.load(f)

    # number of tokens in training data (this for 1B Word Benchmark)
    options['n_tokens_vocab'] = vocab.size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    n_gpus = 1  # n_gpus was never defined in the original; assume a single GPU
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir, ckpt_file):
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }
    tf.reset_default_graph()
    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, restart_ckpt_file=ckpt_file)
    clean_checkpoint(tf_save_dir)
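For context, a hedged usage sketch of resume(), assuming a previous train() run already wrote options and a checkpoint to save_dir; save_dir, vocab_path, and train_prefix below are placeholder variables, not names from the original scripts.

# Hypothetical call site for resume(); the 50-character max word length mirrors
# the other scripts in this collection.
options, ckpt_file = load_options_latest_checkpoint(save_dir)
vocab = load_vocab(vocab_path, 50)
resume(options, train_prefix, vocab, n_gpus=1,
       tf_save_dir=save_dir, tf_log_dir=save_dir, ckpt_file=ckpt_file)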
def main(args):
    vocab = load_vocab(args.vocab_file, args.vocab_min_occur)

    train_tokens = 768648884  # (this for 1B Word Benchmark)
    if args.train_tokens == 'wikitext2':
        train_tokens = 2051910 * 1  # Enwiki2
    elif args.train_tokens == 'wikitext103':
        train_tokens = 101425658 * 1  # wikitext-103

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': train_tokens,
        'batch_size': args.train_batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    train_data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, train_data, args.n_gpus, tf_save_dir, tf_log_dir, converge=args.converge)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }
    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    options['all_clip_norm_val'] = 10.0
    options['n_tokens_vocab'] = vocab.size
    options['dropout'] = 0.1
    options['unroll_steps'] = 20
    options['n_negative_samples_batch'] = 8192

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }
    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    # DRO
    options['dro'] = args.dro
    options['dro_alpha'] = args.dro_alpha

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
def main(args):
    is_load, load_path, save_path, budget = cuhk_prototype_tuner_v2.preprocess(
        t_id, params, args.save_dir)

    vocab = load_vocab(args.vocab_file, 50)

    batch_size = int(params['batch_size'])
    gpus_index_list = list(map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(',')))
    n_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
    n_train_tokens = 768648884

    sess_config = tf.compat.v1.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=int(params['inter_op_parallelism_threads']),
        intra_op_parallelism_threads=int(params['intra_op_parallelism_threads']),
        graph_options=tf.compat.v1.GraphOptions(
            infer_shapes=params['infer_shapes'],
            place_pruned_graph=params['place_pruned_graph'],
            enable_bfloat16_sendrecv=params['enable_bfloat16_sendrecv'],
            optimizer_options=tf.compat.v1.OptimizerOptions(
                do_common_subexpression_elimination=params['do_common_subexpression_elimination'],
                max_folded_constant_in_bytes=int(params['max_folded_constant']),
                do_function_inlining=params['do_function_inlining'],
                global_jit_level=params['global_jit_level'])))

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': int(budget),  # NNI modification
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = save_path
    tf_log_dir = save_path
    if not os.path.exists(tf_save_dir):
        os.makedirs(tf_save_dir)

    if params['tf_gpu_thread_mode'] in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = params['tf_gpu_thread_mode']

    if is_load:
        load_file = os.path.join(load_path, 'model.ckpt')
        start = time.time()
        final_perplexity = train(options, data, n_gpus, gpus_index_list, tf_save_dir,
                                 tf_log_dir, sess_config, restart_ckpt_file=load_file)
        end = time.time()
        shutil.rmtree(load_path)
    else:
        start = time.time()
        final_perplexity = train(options, data, n_gpus, gpus_index_list, tf_save_dir,
                                 tf_log_dir, sess_config)
        end = time.time()

    spent_time = (end - start) / 3600.0

    if args.test_prefix != '':
        options, ckpt_file = load_options_latest_checkpoint(tf_save_dir)
        kwargs = {
            'test': True,
            'shuffle_on_load': False,
        }
        test_data = BidirectionalLMDataset(args.test_prefix, vocab, **kwargs)
        final_perplexity = test(options, ckpt_file, test_data, batch_size=128)

    report_dict = {'runtime': spent_time, 'default': final_perplexity}
    nni.report_final_result(report_dict)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, max_word_length=50, polyglot=True)
    vocab.save_vocab(args.save_dir)

    # define the options
    batch_size = 128  # batch size for each GPU
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    # number of tokens in training data
    # 768648884 (for 1B Word Benchmark)
    # 15442929 (for train-small)
    # 7769676 (for train-small English)
    # 7673253 (for train-small Spanish)
    # 138152583 (for eng+spa train/)
    # 57029976 (for arabic train/)
    # 70546273 (for english .tok train/)
    # 76386340 (for chineseS .tok train/)
    # 64928316 (for chineseT .tok train/)
    # 146932613 (for english+chineseS .tok train/)
    # 135474589 (for english+chineseT .tok train/)
    # 127576249 (for english + arabic .tok train/)
    # ---------
    # 108177588 (for multitask english)
    # 109709945 (for multitask chineseT)
    # 101363023 (for multitask french)
    # 102915840 (for multitask german)
    # 106180836 (for multitask italian)
    # 106561814 (for multitask portuguese)
    # 107461695 (for multitask romanian)
    # 100138331 (for multitask spanish)
    # 109527440 (for multitask swedish)
    # 211093428 (for multitask english+german)
    n_train_tokens = 107587022

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': vocab.n_chars,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 2048,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 256,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    train_paths = args.train_paths
    data = BidirectionalPolyglotLMDataset(train_paths, vocab, test=False,
                                          shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # pass a checkpoint filename via --restore_file to continue training from that checkpoint
    # (the original computed restore_file but then passed restart_ckpt_file=None)
    if args.restore_file:
        restore_file = args.restore_file
    else:
        restore_file = None
    train(options, data, None, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=restore_file)
def test_shared_variables(self):
    vocab, data, options = self._get_vocab_data_options(True, True)
    options['n_epochs'] = 1
    train(options, data, 2, self.tmp_dir, self.tmp_dir)
    self.assertEqual(len(tf.global_variables()), 64)
def main(args):
    ckpt_file = None
    if os.path.exists(args.save_dir + 'options.json'):
        options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = args.n_gpus
    permute_number = args.permute_number

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'multidirectional': True,
        'permute_number': permute_number,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 267,  # NOTE (lijun): add more character tokens
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': args.dim,  # NOTE(feiga): halved dimensions comparing with ELMo (default=2048)
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': args.projection_dim,  # NOTE(feiga): halved dimensions comparing with ELMo (default=256)
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = MultidirectionalLMDataset(prefix, vocab, permute_number, test=False,
                                     shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir, permute_number,
          restart_ckpt_file=ckpt_file)
def main(args):
    max_token_length = args.max_token_length and int(args.max_token_length)
    print("args.vocab_file: ", args.vocab_file)
    print("max_token_length: ", max_token_length)
    print("args.stroke_vocab_file: ", args.stroke_vocab_file)

    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file,
                       args.stroke_vocab_file,  # Winfred stroke_vocab
                       max_token_length)  # Winfred stroke_vocab

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 10731134  # 768648884

    # options = {
    #     'bidirectional': True,
    #     'char_cnn': {'activation': 'relu',
    #                  'embedding': {'dim': 16},
    #                  'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
    #                  'max_characters_per_token': max_token_length,
    #                  'n_characters': 266,  # originally 261 + 5 stroke characters
    #                  'n_highway': 2},  # 2
    #     'dropout': 0.1,
    #     'lstm': {
    #         'cell_clip': 3,
    #         'dim': 4096,
    #         'n_layers': 2,
    #         'proj_clip': 3,
    #         'projection_dim': 512,
    #         'use_skip_connections': True},
    #     'all_clip_norm_val': 10.0,
    #     'n_epochs': 1,
    #     'n_train_tokens': n_train_tokens,
    #     'batch_size': batch_size,
    #     'n_tokens_vocab': vocab.size,
    #     'unroll_steps': 20,
    #     'n_negative_samples_batch': 8192,
    # }

    # Add by Winfred
    option_file = os.path.join(args.save_dir, "options.json")
    with open(option_file, "r") as f:
        options = json.load(f)

    if max_token_length:
        options["char_cnn"]["max_characters_per_token"] = max_token_length
        print("Wrong max_token_length, already corrected")
    if "char_cnn" in options:
        options["char_cnn"]["n_characters"] = 266
        print("Wrong n_characters, already corrected")
    # End

    prefix = args.train_prefix
    data = BidirectionalLMDataset(
        prefix, vocab,
        test=False,
        shuffle_on_load=False,  # True
        do_record=args.do_record,  # Add by Winfred
        records_path=args.records_path,  # Add by Winfred
        vocab_file=args.vocab_file)  # Add by Winfred

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=args.restart_ckpt_file)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50, variable=args.variable)
    if args.variable:
        vocab.save_vocab(args.save_dir)

    # define the options
    if args.batch_size > 0:
        batch_size = args.batch_size
    else:
        batch_size = 128  # batch size for each GPU
    if args.n_epochs > 0:
        n_epochs = args.n_epochs
    else:
        n_epochs = 10
    n_gpus = 1

    if args.lang == 'ga':
        n_train_tokens = 3573002
    elif args.lang == 'mt':
        n_train_tokens = 1045392
    elif args.lang == 'sg':
        n_train_tokens = 1196930
    elif args.lang == 'vi':
        n_train_tokens = 5552361
    else:
        raise ValueError(f'Unrecognized language: {args.lang}')

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': vocab.n_chars if args.variable else 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': n_epochs,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    # The longest token in the vocab is 10 syllables --> 30 characters after jamo decomposition
    # bos char + 30 + eos char = 32
    vocab = load_vocab(args.vocab_file, 32)

    # define the options
    # batch size for each GPU
    batch_size = 64 * 2
    n_gpus = 1

    # Number of unique tokens in the tokenized "Science of Love" KakaoTalk data (identified_corpus_20180105)
    # (-> this should be the total token count, not the unique token count)
    # n_train_tokens = 609518
    # n_train_tokens = 626932956  # number of tokens appearing in 8000pair_tokenized_corpus.txt (about 620 million)
    # token count used temporarily
    n_train_tokens = 200000000

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'tanh',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 32,
            'n_characters': 62,
            'n_highway': 2,
        },
        'dropout': 0.2,
        'lstm': {
            'cell_clip': 3,
            'dim': 256,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 256,
            'use_skip_connections': True,
        },
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 10,
        'n_negative_samples_batch': 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(filepattern=prefix, vocab=vocab, test=False,
                                  shuffle_on_load=True, with_tab=False)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file='/media/scatter/scatterdisk/elmo_ckpt/elmo_ckpt_0919_2142/model.ckpt_batch-625000')
def main(args):
    is_load, load_path, save_path, budget = dfhb.preprocess(t_id, params, args.save_dir)

    vocab = load_vocab(args.vocab_file, 50)

    batch_size = int(params['batch_size'])  # NNI modification
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        n_gpus = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))
    else:
        n_gpus = 1
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': int(budget),  # NNI modification
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = save_path
    tf_log_dir = save_path
    if not os.path.exists(tf_save_dir):
        os.makedirs(tf_save_dir)

    ### NNI modification ###
    optimizer = params['optimizer']
    if 'inter_op_parallelism_threads' in params.keys():
        config = get_config()
    if params['tf_gpu_thread_mode'] in ["global", "gpu_private", "gpu_shared"]:
        os.environ['TF_GPU_THREAD_MODE'] = params['tf_gpu_thread_mode']

    if is_load:
        load_file = os.path.join(load_path, 'model.ckpt')
        start = time.time()
        final_perplexity = train(options, data, n_gpus, tf_save_dir, tf_log_dir,
                                 optimizer, config, restart_ckpt_file=load_file)
        end = time.time()
        shutil.rmtree(load_path)
    else:
        start = time.time()
        final_perplexity = train(options, data, n_gpus, tf_save_dir, tf_log_dir,
                                 optimizer, config)
        end = time.time()

    spent_time = (end - start) / 3600.0
    report_dict = {'runtime': spent_time, 'default': final_perplexity}
    nni.report_final_result(report_dict)
def top_level(args):
    if not os.path.isdir(args.save_dir):
        os.system("mkdir %s" % args.save_dir)

    # define the options
    if args.config_file is None:
        args.config_file = os.path.join(current_dir, "resources/default_config.json")
    options = load_options(args.config_file)

    # load train_prefixes
    with open(args.train_prefix_paths, "r") as fd:
        train_prefixes = fd.read().split('\n')
    train_prefixes = [f for f in train_prefixes if f != ""]
    options['train_prefix_paths'] = train_prefixes

    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # number of tokens in training data (this for 1B Word Benchmark)
    # batch_no = n_epochs*n_train_tokens/(batch_size*unroll_steps*n_gpus)
    # 25600 => 100 n_batch
    # example filtered 1330337
    # 1B 768648884
    if args.n_train_tokens is None:
        options['n_train_tokens'] = get_tokens_count(args.train_prefix)
    else:
        options['n_train_tokens'] = args.n_train_tokens

    options['n_tokens_vocab'] = vocab.size
    options['milestone'] = 0

    os.system("cp %s %s/vocabs.txt" % (args.vocab_file, args.save_dir))

    n_gpus = options['n_gpus']
    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    prefix = train_prefixes[0] + '/*'
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)
    print("options:", options)
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)

    options['milestone'] = 1
    save_options(options, os.path.join(args.save_dir, "options.json"))

    if len(train_prefixes) == 1:
        return

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # loop all train_prefix_paths
    milestone = 1
    for train_prefix in train_prefixes[1:]:
        prefix = train_prefix + '/*'
        # guard against n_train_tokens being None before comparing with 0
        if args.n_train_tokens is not None and args.n_train_tokens > 0:
            options['n_train_tokens'] = args.n_train_tokens
        else:
            options['n_train_tokens'] = get_tokens_count(prefix)
        restarter.resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir,
                         ckpt_file)
        milestone += 1
        options['milestone'] = milestone
        save_options(options, os.path.join(args.save_dir, "options.json"))
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = args.batch_size or 128  # batch size for each GPU
    n_gpus = args.n_gpus or 0
    gpu_list = get_available_gpus()
    if n_gpus <= 0:
        n_gpus = len(gpu_list)
    else:
        n_gpus = min([n_gpus, len(gpu_list)])
    print('Work on %s GPUs' % n_gpus)

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = args.n_train_tokens or 768648884

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'transformer': {
            'num_decoder_layers': 8,
            'layer_preprocess': 'layer_norm',
            'hidden_size': 512,
            'filter_size': 2048,
            'num_heads': 8,
            'attention_dropout': 0.1,
            'residual_dropout': 0.1,
            'relu_dropout': 0.1,
            'max_relative_dist': 16,
            'no_additional_dropout': True},
        'use_transformer': True,
        'num_context_steps': 64,
        'all_clip_norm_val': 10.0,
        'scale_embeddings': False,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 256,
        'n_negative_samples_batch': 8192,
    }

    if args.option_file is not None:
        with open(args.option_file, 'r', encoding='utf-8') as reader:
            options = json.load(reader)
        options['n_train_tokens'] = n_train_tokens
        options['batch_size'] = batch_size
        options['n_tokens_vocab'] = vocab.size

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    checkpoint = None
    if args.load_checkpoint:
        saved_options, checkpoint = try_load_options_latest_checkpoint(args.save_dir)
        if saved_options is not None:
            options = saved_options
            options['batch_size'] = batch_size

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                  shuffle_on_load=options.get('shuffle_training_data', True))

    validation = None
    if args.valid_prefix is not None:
        validation = BidirectionalLMDataset(args.valid_prefix, vocab, test=True,
                                            shuffle_on_load=False)

    train(options, data, n_gpus, tf_save_dir, tf_log_dir, checkpoint,
          validation=validation)
        'n_characters': 261,
        'n_highway': 2},
    'dropout': 0.1,
    'lstm': {
        'cell_clip': 3,
        'dim': 4096,
        'n_layers': 2,
        'proj_clip': 3,
        'projection_dim': 512,
        'use_skip_connections': True},
    'all_clip_norm_val': 10.0,
    'n_epochs': 10,
    'n_train_tokens': n_train_tokens,
    'batch_size': batch_size,
    'n_tokens_vocab': vocab.size,
    'unroll_steps': 20,
    'n_negative_samples_batch': 4096,
}

prefix = "data/train/train.txt"
data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

tf_save_dir = "test/save/"
tf_log_dir = "test/logs/"
train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_files, max_word_length=50, polyglot=True)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 1
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    # number of tokens in training data
    # 768648884 (for 1B Word Benchmark)
    # 15442929 (for train-small)
    # 7769676 (for train-small English)
    # 7673253 (for train-small Spanish)
    # 138152583 (for eng+spa train/)
    n_train_tokens = 138152583

    options = {
        'bidirectional': True,
        'polyglot': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    data = BidirectionalPolyglotLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # change restart_ckpt_file to a checkpoint filename to continue training from that checkpoint
    if args.restore_file:
        restore_file = args.restore_file
    else:
        restore_file = None
    train(options, data, None, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=restore_file)
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, max_char_count_in_token=50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = 50  # TODO: temporarily using the small test-set data

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [
                [1, 32],  # 32 filters of kernel_size=1
                [2, 32],  # 32 filters of kernel_size=2
                [3, 64],
                # [4, 128],
                # [5, 256],
                # [6, 512],
                # [7, 1024]
            ],
            'max_characters_per_token': 50,
            'n_characters': 261,  # 256 + 5 (mask, unk, sos, eos, padding)
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            # 'dim': 4096,
            'dim': 30,  # TODO
            'n_layers': 2,
            'proj_clip': 3,
            # 'projection_dim': 512,
            'projection_dim': 10,  # TODO
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        # 'n_negative_samples_batch': 8192,
        'n_negative_samples_batch': 20,  # TODO
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
def main(args):
    # load the vocab
    # vocab = load_vocab(args.vocab_file, 50)
    vocab = load_vocab(args.vocab_file, 4)
    print('vocab.size={}'.format(vocab.size))

    # define the options
    # batch_size = 128  # batch size for each GPU
    # batch_size = 256  # batch size for each GPU
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    # n_train_tokens = 768648884
    n_train_tokens = 32119740

    options = {
        'bidirectional': True,
        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [
                [1, 32],
                [2, 32],
                [3, 64],
                [4, 128],
                # [5, 256],
                # [6, 512],
                # [7, 1024]
            ],
            # 'max_characters_per_token': 50,
            'max_characters_per_token': 4,  # initial, medial, and final jamo (choseong, jungseong, jongseong)
            'n_characters': 261,
            # 'n_characters': 362,
            'n_highway': 2},
        'dropout': 0.1,
        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            # 'projection_dim': 512,
            'projection_dim': 256,
            'use_skip_connections': True},
        'all_clip_norm_val': 10.0,
        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        # 'unroll_steps': 20,
        'unroll_steps': 40,
        'n_negative_samples_batch': int(4488 * 0.01),  # vocab.size=4488, 1% of vocab.size
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False, shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)