def _get_data(self, bidirectional, use_chars, test=False):
    vocab_file = os.path.join(FIXTURES, 'vocab.txt')
    if use_chars:
        vocab = load_vocab(vocab_file, 10)
    else:
        vocab = load_vocab(vocab_file, None)

    prefix = os.path.join(FIXTURES, 'data.txt')
    if bidirectional:
        data = BidirectionalLMDataset(prefix, vocab, test=test)
    else:
        data = LMDataset(prefix, vocab, test=test, reverse=False)

    return data, vocab
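# Hypothetical usage sketch (not part of the original tests): assumes this helper is a
# method on a unittest.TestCase-style class, that the FIXTURES files exist, and that the
# returned dataset exposes iter_batches(batch_size, num_steps) yielding dicts of numpy
# arrays, which is how the training/evaluation code below consumes it.
def _example_iterate_fixture_batches(self):
    data, vocab = self._get_data(bidirectional=True, use_chars=True, test=True)
    for batch_no, batch in enumerate(data.iter_batches(2, 5)):
        # with use_chars=True each batch carries character-id inputs alongside token ids
        print(batch_no, {key: value.shape for key, value in batch.items()})
        if batch_no >= 2:
            break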
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens

    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs

    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
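# A minimal CLI sketch for the restart entry point above. The flag names mirror the
# args.* attributes the function reads; the defaults and help strings are assumptions,
# not taken from the original script.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Resume training from the latest checkpoint.')
    parser.add_argument('--save_dir', help='Location of checkpoint files and saved options')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix (glob) for training shard files')
    parser.add_argument('--n_gpus', type=int, default=1, help='Number of GPUs to use')
    parser.add_argument('--batch_size', type=int, default=0, help='Override batch size if > 0')
    parser.add_argument('--n_train_tokens', type=int, default=0, help='Override token count if > 0')
    parser.add_argument('--n_epochs', type=int, default=0, help='Override epoch count if > 0')

    main(parser.parse_args())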
def main(args):
    # load the vocab
    # TODO 1.0: load the vocabulary
    vocab = load_vocab(args.vocab_file, 50)

    # define the options
    batch_size = 128  # batch size for each GPU
    n_gpus = 3

    # number of tokens in training data (this for 1B Word Benchmark)
    n_train_tokens = 768648884

    options = {
        'bidirectional': True,

        'char_cnn': {
            'activation': 'relu',
            'embedding': {'dim': 16},
            'filters': [[1, 32], [2, 32], [3, 64], [4, 128],
                        [5, 256], [6, 512], [7, 1024]],
            'max_characters_per_token': 50,
            'n_characters': 261,
            'n_highway': 2,
        },

        'dropout': 0.1,

        'lstm': {
            'cell_clip': 3,
            'dim': 4096,
            'n_layers': 2,
            'proj_clip': 3,
            'projection_dim': 512,
            'use_skip_connections': True,
        },

        'all_clip_norm_val': 10.0,

        'n_epochs': 10,
        'n_train_tokens': n_train_tokens,
        'batch_size': batch_size,
        'n_tokens_vocab': vocab.size,
        'unroll_steps': 20,
        'n_negative_samples_batch': 8192,
    }

    prefix = args.train_prefix
    # TODO 2.0: build the training dataset
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
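# Illustrative helper (an assumption, not part of the original script): the options above
# target the full 1B Word Benchmark configuration (4096-dim LSTM, 10 epochs, 8192 negative
# samples). For a quick smoke test on a small corpus, the same keys can be shrunk before
# calling train(), for example:
def shrink_options_for_smoke_test(options):
    small = dict(options)
    small['lstm'] = dict(options['lstm'], dim=256, projection_dim=128)
    small['n_epochs'] = 1
    small['n_negative_samples_batch'] = 256
    small['batch_size'] = 16
    return small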
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
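# Hypothetical direct invocation of the evaluation entry point above, bypassing a CLI
# parser. The attribute names mirror the args.* fields the function reads; the paths and
# batch size are placeholders.
if __name__ == '__main__':
    from argparse import Namespace

    main(Namespace(
        save_dir='/path/to/checkpoint',
        vocab_file='/path/to/vocab.txt',
        test_prefix='/path/to/heldout/*',
        batch_size=256,
    ))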
def main(args):
    # load the vocab
    vocab = load_vocab(args.vocab_file, args.maxchar)

    # define the options
    batch_size = 192  # batch size for each GPU
    n_gpus = 2

    # number of tokens in training data
    n_train_tokens = args.size

    options = {
        "bidirectional": True,

        "char_cnn": {
            "activation": "relu",
            "embedding": {"dim": 16},
            "filters": [
                [1, 32], [2, 32], [3, 64], [4, 128],
                [5, 256], [6, 512], [7, 1024],
            ],
            "max_characters_per_token": args.maxchar,
            "n_characters": 261,
            "n_highway": 2,
        },

        "dropout": 0.1,

        "lstm": {
            "cell_clip": 3,
            "dim": 2048,
            "n_layers": 2,
            "proj_clip": 3,
            "projection_dim": 512,
            "use_skip_connections": True,
        },

        "all_clip_norm_val": 10.0,

        "n_epochs": args.epochs,
        "n_train_tokens": n_train_tokens,
        "batch_size": batch_size,
        "n_tokens_vocab": vocab.size,
        "unroll_steps": 20,
        "n_negative_samples_batch": 4096,
    }

    prefix = args.train_prefix
    data = BidirectionalLMDataset(prefix, vocab, test=False,
                                  shuffle_on_load=True)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir
    train(options, data, n_gpus, tf_save_dir, tf_log_dir)
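# A minimal CLI sketch for the configurable training entry point above. Flag names follow
# the args.* attributes the function reads (save_dir, vocab_file, train_prefix, maxchar,
# size, epochs); the defaults and help strings are assumptions.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train a bidirectional LM from scratch.')
    parser.add_argument('--save_dir', help='Location to write checkpoint files and options')
    parser.add_argument('--vocab_file', help='Vocabulary file')
    parser.add_argument('--train_prefix', help='Prefix (glob) for training shard files')
    parser.add_argument('--maxchar', type=int, default=50, help='Max characters per token')
    parser.add_argument('--size', type=int, help='Number of tokens in the training data')
    parser.add_argument('--epochs', type=int, default=10, help='Number of training epochs')

    main(parser.parse_args())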