def main(args):
    print(args)
    print('-' * 100)
    print('Loading models and options...')
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    print('Loading vocabulary...')
    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    shards = glob(args.test_prefix)
    shards.sort()
    # print(shards)

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    print('Building dataset...')
    datasets = []
    for shard in shards:
        if options.get('bidirectional'):
            datasets.append(BidirectionalLMDataset(shard, vocab, **kwargs))
        else:
            datasets.append(LMDataset(shard, vocab, **kwargs))

    print('Predicting...')
    tag(options, ckpt_file, shards, datasets, batch_size=args.batch_size)
    print('-' * 100)
    print('done.')
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    # vocab = load_vocab(args.vocab_file, max_word_length)
    vocab = load_vocab(args.vocab_file, args.stroke_vocab_file, 50)  # Winfred stroke_vocab

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    permute_number = options.get('permute_number', 4)

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    elif options.get('multidirectional'):
        data = MultidirectionalLMDataset(test_prefix, vocab, permute_number, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size,
         permute_number=permute_number)
def main(args):
    if args.gpu is not None:
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        if options.get('polyglot'):
            data = BidirectionalPolyglotLMDataset(test_prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    # ipy.embed()
    test(options, ckpt_file, data, batch_size=args.batch_size)
def main(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(args.vocab_file, max_word_length)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
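# For context only: a hypothetical argparse wiring for the training main() above.
# The flag names are assumptions that simply mirror the attributes the function
# reads (save_dir, vocab_file, train_prefix, n_gpus, n_train_tokens, n_epochs,
# batch_size); they are not taken from the original script.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Restart bilm training from the latest checkpoint')
    parser.add_argument('--save_dir', required=True,
                        help='Directory holding options.json and checkpoints')
    parser.add_argument('--vocab_file', required=True, help='Vocabulary file')
    parser.add_argument('--train_prefix', required=True,
                        help='Glob prefix for the training shards')
    parser.add_argument('--n_gpus', type=int, default=1)
    parser.add_argument('--n_train_tokens', type=int, default=0,
                        help='<= 0 keeps the value stored in options')
    parser.add_argument('--n_epochs', type=int, default=0,
                        help='<= 0 keeps the value stored in options')
    parser.add_argument('--batch_size', type=int, default=0,
                        help='<= 0 keeps the value stored in options')
    main(parser.parse_args())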
def test_original_dataset_implementation():
    """Show how the original `LMDataset` and `BidirectionalLMDataset` work."""
    from bilm.data import LMDataset, BidirectionalLMDataset, \
        UnicodeCharsVocabulary

    test_prefix = 'data/test/violin_test.txt'
    vocab_path = 'dump/bilm_pretrain/vocab-2016-09-10.txt'
    vocabulary = UnicodeCharsVocabulary(vocab_path, max_word_length=50)
    dataset = LMDataset(test_prefix, vocabulary)

    a = dataset.iter_batches(batch_size=10, num_steps=50)
    b = next(a)
    print(f'Keys: {b.keys()}')
    for k, v in b.items():
        print(f'Shape of {k}: {v.shape}')

    print(vocabulary.decode(b['token_ids'][0]))
    print(vocabulary.decode(b['next_token_id'][0]))
    print(vocabulary.decode_chars(b['tokens_characters'][0]))

    from IPython import embed
    embed()
    import os
    os._exit(1)
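# The helper above only iterates the unidirectional LMDataset. Below is a
# minimal companion sketch for the bidirectional case, reusing the same paths
# and assuming BidirectionalLMDataset.iter_batches yields the usual *_reverse
# keys alongside the forward ones; it is an illustration, not part of the
# original helper.
def peek_bidirectional_dataset():
    """Sketch: inspect one BidirectionalLMDataset batch (paths reused from above)."""
    from bilm.data import BidirectionalLMDataset, UnicodeCharsVocabulary

    vocab_path = 'dump/bilm_pretrain/vocab-2016-09-10.txt'
    test_prefix = 'data/test/violin_test.txt'
    vocabulary = UnicodeCharsVocabulary(vocab_path, max_word_length=50)
    dataset = BidirectionalLMDataset(test_prefix, vocabulary)

    batch = next(dataset.iter_batches(batch_size=10, num_steps=50))
    # Expect forward and reverse views of the same sentences, e.g.
    # token_ids / token_ids_reverse and next_token_id / next_token_id_reverse.
    for key, value in batch.items():
        print(f'{key}: {value.shape}')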
def main(args):
    if args.gpu is not None:
        if ',' in args.gpu:
            args.gpu = args.gpu.split(',')
        n_gpus = len(args.gpu)
        set_gpu(args.gpu)
    else:
        n_gpus = 0

    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None

    # use the polyglot vocab when either the stored options or the CLI flag ask for it
    polyglot = 'polyglot' in options or args.polyglot
    vocab = load_vocab(args.vocab_files, max_word_length=max_word_length,
                       polyglot=polyglot)

    prefix = args.train_prefix

    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    if options.get('bidirectional'):
        if polyglot:
            data = BidirectionalPolyglotLMDataset(prefix, vocab, **kwargs)
        else:
            data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    tf_save_dir = args.save_dir
    tf_log_dir = args.save_dir

    # set optional inputs
    if args.n_train_tokens > 0:
        options['n_train_tokens'] = args.n_train_tokens
    if args.n_epochs > 0:
        options['n_epochs'] = args.n_epochs
    if args.batch_size > 0:
        options['batch_size'] = args.batch_size

    train(options, data, None, args.n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
def _load_data(self, reverse, chars, bidirectional=False):
    if chars:
        vocab = UnicodeCharsVocabulary(self._tmp_vocab, 5)
    else:
        vocab = Vocabulary(self._tmp_vocab)

    if not bidirectional:
        data = LMDataset(self._tmp_train, vocab, reverse=reverse)
    else:
        data = BidirectionalLMDataset(self._tmp_train, vocab)

    return data
def _get_data(self, bidirectional, use_chars, test=False):
    vocab_file = os.path.join(FIXTURES, 'vocab.txt')
    if use_chars:
        vocab = load_vocab(vocab_file, 10)
    else:
        vocab = load_vocab(vocab_file, None)

    prefix = os.path.join(FIXTURES, 'data.txt')
    if bidirectional:
        data = BidirectionalLMDataset(prefix, vocab, test=test)
    else:
        data = LMDataset(prefix, vocab, test=test, reverse=False)

    return data, vocab
def resume(options, prefix, vocab, n_gpus, tf_save_dir, tf_log_dir, ckpt_file):
    kwargs = {
        'test': False,
        'shuffle_on_load': True,
    }

    tf.reset_default_graph()
    if options.get('bidirectional'):
        data = BidirectionalLMDataset(prefix, vocab, **kwargs)
    else:
        data = LMDataset(prefix, vocab, **kwargs)

    train(options, data, n_gpus, tf_save_dir, tf_log_dir,
          restart_ckpt_file=ckpt_file)
    clean_checkpoint(tf_save_dir)
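# A hedged usage sketch for resume(), reusing the loading pattern from the
# other snippets. The paths below are placeholders, not values from the source;
# load_options_latest_checkpoint and load_vocab are the same bilm helpers the
# scripts above rely on.
from bilm.training import load_options_latest_checkpoint, load_vocab

save_dir = 'checkpoints/bilm'          # placeholder
train_prefix = 'data/train/*.txt'      # placeholder shard pattern
options, ckpt_file = load_options_latest_checkpoint(save_dir)

max_word_length = options['char_cnn']['max_characters_per_token'] \
    if 'char_cnn' in options else None
vocab = load_vocab('data/vocab.txt', max_word_length)  # placeholder vocab path

resume(options, train_prefix, vocab, n_gpus=1,
       tf_save_dir=save_dir, tf_log_dir=save_dir, ckpt_file=ckpt_file)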
def top_level(args):
    options, ckpt_file = load_options_latest_checkpoint(args.save_dir)

    vocab_file = os.path.join(args.save_dir, 'vocabs.txt')

    # load the vocab
    if 'char_cnn' in options:
        max_word_length = options['char_cnn']['max_characters_per_token']
    else:
        max_word_length = None
    vocab = load_vocab(vocab_file, max_word_length)

    test_prefix = args.test_prefix

    kwargs = {
        'test': True,
        'shuffle_on_load': False,
    }

    if options.get('bidirectional'):
        data = BidirectionalLMDataset(test_prefix, vocab, **kwargs)
    else:
        data = LMDataset(test_prefix, vocab, **kwargs)

    test(options, ckpt_file, data, batch_size=args.batch_size)
print('====> word \t{}\t word_to_char_ids result: {}'.format(
    words, vocab_unicodechars.word_to_char_ids(words)))
print('====> word \t{}\t encoded chars id result: {}'.format(
    words, vocab_unicodechars.encode_chars(words)))

ids = [1234, 3234, 22, 34, 341324, 21, 345]
print('====> decode \t{}\t to words: {}'.format(
    ids, vocab_unicodechars.decode(ids)))

''' UE for LMDataset '''
print('\n\n\tUE for LMDataset:')
vocab_file = '../data/vocab_seg_words_elmo.txt'
vocab_unicodechars = UnicodeCharsVocabulary(vocab_file, max_word_length=10,
                                            validate_file=True)
filepattern = '../data/example/*_seg_words.txt'
lmds = LMDataset(filepattern, vocab_unicodechars, test=True)

batch_size = 128
n_gpus = 1
unroll_steps = 10
data_gen = lmds.iter_batches(batch_size * n_gpus, unroll_steps)

jump_cnt = 0
for num, batch in enumerate(data_gen, start=1):
    jump_cnt += 1
    if jump_cnt > 10:
        break
    print('====> iter [{}]\ttoken ids shape: {}'.format(
        num, batch['token_ids'].shape))
    print('====> iter [{}]\ttokens characters shape: {}'.format(
        num, batch['tokens_characters'].shape))
    print('====> iter [{}]\tnext token ids shape: {}'.format(
        num, batch['next_token_id'].shape))
    # excerpt: body of the per-file perplexity loop
    if args.model == 'hmm':
        res0, res1, res2, OOV, IN = get_perplexities(sents, model, k=k)
        res_perplexities2.append(res2)
        count_in += IN
        count_oov += OOV
        res_perplexities0.append(res0)
        res_perplexities1.append(res1)

    if args.model == 'elmo':
        filepath = subdir + os.sep
        if options.get('bidirectional'):
            data = BidirectionalLMDataset(filepath, vocab, **kwargs)
            # print(data)
        else:
            data = LMDataset(filepath, vocab, **kwargs)
        res2 = test(options, ckpt_file, data, batch_size=args.batch_size)
        res_perplexities2.append(res2)

    outfile.write(file + '\t' + label + '\t' + str(res2) + '\n')
    if count % 5 == 0:
        print('I have calculated perplexities for %s files' % count,
              file=sys.stderr)

print('=== Just a sanity check on the perplexity calculations:')
print(labels[:5], fns[:5], res_perplexities2[:5])

print('Texts with the most extreme text-level perplexities:')